In [1]:
# Step 1: load the dataset from the local CSV file into a pandas DataFrame.
# Ending the cell with the bare name renders the frame as a table.
import pandas as pd

music_data = pd.read_csv(filepath_or_buffer='music.csv')
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [2]:
# Step 2: clean / prepare the data.
# The input set X is every column of the dataset except the target column 'genre'.
X = music_data.drop(labels='genre', axis='columns')
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [3]:
# The output set Y is the 'genre' column: the label the model has to learn to predict.
Y = music_data.loc[:, 'genre']
Y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [4]:
# Step 3: build a model using a machine-learning algorithm.
# There are many machine-learning algorithms out there. The good news is that we don't have
# to program them explicitly: scikit-learn already implements them.
# In our case a decision tree (from sklearn.tree) is used.

from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X,Y)

# The model was fitted on a DataFrame, so it remembers the column names it was trained with.
# Predicting from a DataFrame that carries the same columns (rather than a bare nested list)
# keeps the feature order explicit and avoids scikit-learn's
# "X does not have valid feature names" warning.
samples = pd.DataFrame([[21,1],[22,0]], columns=X.columns)
predictions = model.predict(samples)
predictions



array(['HipHop', 'Dance'], dtype=object)

In [5]:
# Step 4: measure the accuracy of the model.
# If it is not accurate enough we should tune it or try a different algorithm.
# To measure accuracy, the data is split into two sets: one for training, one for testing.


from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Fixed seed so the split -- and therefore the reported score -- is identical on every run.
# Without it the rows that land in the test set change each time the cell is executed.
SEED = 42

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, random_state=SEED)

# Seed the tree as well: tie-breaking between equally good splits is randomised.
model = DecisionTreeClassifier(random_state=SEED)
model.fit(X_train,Y_train)
predictions = model.predict(X_test)

# NOTE: the test set is very small here, so the score moves in large steps and a single
# misclassified row changes it a lot; treat it as a rough indication only.
score = accuracy_score(Y_test, predictions)
score




1.0

In [6]:
# test_size=0.2 means: 20% of the rows go to the test set and 80% to the training set.
# Show how many rows ended up in the test inputs and the test labels.
tuple(len(part) for part in (X_test, Y_test))

(4, 4)

In [7]:
# Number of rows in the training inputs and the training labels.
tuple(len(part) for part in (X_train, Y_train))

(14, 14)

In [8]:
# Model persistence.
# The accuracy code is left out here to focus on a different objective: persisting a model.
# Training can be time consuming -- real datasets have thousands or millions of samples and
# fitting can take seconds, minutes or even hours -- so the trained model is saved to disk
# once and simply loaded on later runs.

from pathlib import Path

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
import joblib

MODEL_PATH = Path('music-recommender.joblib')

if MODEL_PATH.exists():
    # Fast path: reuse the model saved by a previous run.
    # SECURITY: joblib.load unpickles the file, which can execute arbitrary code.
    # Only load model files that you created yourself or otherwise trust.
    model = joblib.load(MODEL_PATH)
else:
    # First run (or the file was deleted): train from the CSV and save the result, so the
    # notebook also works from a clean checkout with Restart & Run All.
    music_data = pd.read_csv('music.csv')
    X = music_data.drop(columns=['genre'])
    Y = music_data['genre']

    model = DecisionTreeClassifier()
    model.fit(X,Y)
    joblib.dump(model, MODEL_PATH)

predictions = model.predict([[21,1]])
predictions

array(['HipHop'], dtype=object)

In [9]:
# Visualizing the decision tree.
# Export the trained model in a visual (Graphviz .dot) format so we can see how it makes
# its predictions.
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree  # provides export_graphviz for writing a decision tree as a graph

music_data = pd.read_csv('music.csv')    # import the data
X = music_data.drop(columns=['genre'])   # create the input data
Y = music_data['genre']                  # create the output data

model = DecisionTreeClassifier()         # create a model
model.fit(X,Y)                           # train the model

# Take the node labels from what the model was actually fitted on instead of hard-coding
# them: the feature names follow the columns of X, and model.classes_ is the class order
# the classifier itself uses, so the labels cannot drift out of sync with the model.
tree.export_graphviz(model, out_file='music-recommender.dot',
                    feature_names=list(X.columns),
                    class_names=[str(label) for label in model.classes_],
                    label='all',
                    rounded=True,
                    filled=True)
                     


