# Steps:
1. Import the data
2. Clean the data
3. Split the data into training/test sets
4. Create the model
5. Train the model
6. Make predictions
7. Evaluate & improve

In [39]:
import pandas as pd

# algorithm
from sklearn.tree import DecisionTreeClassifier

# persisting model -> trained model
import joblib

# object has method for exporting tree as graph
from sklearn import tree

# split data into sets (training & testing)
from sklearn.model_selection import train_test_split

# compare predictions with the values (tests/outputs) we have
from sklearn.metrics import accuracy_score

# Load the raw listener records from disk into a DataFrame.
music_data = pd.read_csv(filepath_or_buffer='music.csv')

# Last expression of the cell, so it is rendered as the cell output.
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


# preparing/cleaning the data

In [26]:
# Feature matrix (model inputs).
# Select the two input columns explicitly instead of dropping only 'genre',
# so any extra column present in the CSV (for example a saved index such as
# 'Unnamed: 0') can never become a model feature. Later cells predict with
# [age, gender] rows and export the tree with feature_names=['age', 'gender'],
# so X must hold exactly these two columns, in this order.
X = music_data[['age', 'gender']]

# gender encoding: 1 -> male, 0 -> female
X

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [27]:
# Target vector (model output): the genre label of every row.
y = music_data.loc[:, 'genre']
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

In [28]:
# Hold out 20% of the rows for testing; the remaining 80% is used for training.
# train_test_split returns the four pieces in this order:
# X_train, X_test, y_train, y_test.
# random_state pins the shuffle so the split -- and therefore the accuracy
# computed later -- is the same on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# build model & algorithm

In [35]:
# Use a decision tree as the learning algorithm.
# random_state makes the fitted tree reproducible: scikit-learn permutes the
# features at each split, so without a seed equally good splits may be chosen
# differently from run to run.
model = DecisionTreeClassifier(random_state=42)

# Fit once, on the training split only, so X_test / y_test stay unseen for the
# evaluation below. (Fitting on the full X, y first is pointless: a second
# fit() call discards the result of the first.)
model.fit(X_train, y_train)

# Persist the trained model so it can be reloaded without retraining.
# joblib.dump returns the list of file names it wrote, which is displayed as
# the cell output.
joblib.dump(model, 'music-recommender.joblib')

['music-recommender.joblib']

In [36]:
# Reload the persisted model from disk.
# NOTE: joblib files are pickle-based -- only load files you created or trust.
model = joblib.load('music-recommender.joblib')

# Predict the genre for a 21-year-old male (gender: 1 -> male, 0 -> female).
# The sample is wrapped in a DataFrame carrying the feature column names
# because the model was fitted on a DataFrame; passing a bare nested list
# makes scikit-learn warn "X does not have valid feature names".
# Assumes the model was trained on exactly the 'age' and 'gender' columns,
# the same assumption the tree-export cell makes with its feature_names.
sample = pd.DataFrame([[21, 1]], columns=['age', 'gender'])
predictions = model.predict(sample)
predictions

array(['HipHop'], dtype=object)

In [37]:
# Run the model over every held-out test row.
# (To score hand-written samples instead, pass rows of [age, gender], e.g.
#  [21, 1] for a 21-year-old male or [22, 0] for a 22-year-old female.)
predictions = model.predict(X=X_test)
predictions

array(['Jazz', 'Acoustic', 'Dance', 'HipHop'], dtype=object)

# Calculate the Accuracy

In [38]:
# Accuracy: compare the predicted genres with the true labels of the test rows.
# This relies on the data having been split beforehand (commonly 70% - 80% of
# the rows for training and 20% - 30% for testing).
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

0.75

# Visualizing a decision tree

In [40]:
# Write the fitted tree to a Graphviz .dot file for visualisation.
tree.export_graphviz(
    model,
    out_file='music-recommender.dot',
    feature_names=['age', 'gender'],
    # export_graphviz expects the class names in ascending order of the
    # classes the model was trained on. model.classes_ is exactly that list;
    # deriving the names from y.unique() (the full dataset) would mislabel
    # nodes whenever the random training split happens to miss a genre.
    class_names=list(model.classes_),
    label='all',
    rounded=True,
    filled=True,
)