In [1]:
# Mosh's Step-by-Step Guide to Machine Learning:
# (1) Import the data (typically a .csv file)
# (2) Clean the data (e.g., removing duplicated, irrelevant, and/or incomplete data)
# (3) Split data into TWO sets——one for training (80%), and one for testing (20%)——to ensure model produces intended results
# (4) Build a ML model using ML algorithms (e.g., decision trees, neural networks, ...)
# (5) Train the model (with training data)
# (6) Test the Model (with testing data)
# (7) Evaluate and improve the model using its predictions from (6) (e.g., different algorithm, fine-tune parameters, ...)

# {1} Model Persistence: Trained models can be saved to files, then loaded later for use w/o retraining them
# {2} Model Visualization: A trained decision tree can be exported to a DOT file and viewed as a graph

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split # Using split function from sci-kit learn library for (3)
from sklearn.tree import DecisionTreeClassifier # Using the decision tree algorithm from sci-kit learn library for (4)
from sklearn.metrics import accuracy_score # Calculates accuracy for (7)

import joblib # joblib has a range of methods for saving and loading pre-existing models for {1}

from sklearn import tree # Contains methods for visualizing decision trees

In [3]:
# (1) Read the CSV file into a DataFrame.
# (2) is skipped b/c this data set needs no cleaning.
music_data = pd.read_csv(filepath_or_buffer="music.csv")
music_data

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


In [4]:
# (3) Separate the inputs from the output, then split both into training and testing sets.
# "age" and "gender" are the input set (what the model uses; X);
# "genre" is the output set (what the model predicts; y).
X = music_data.drop(columns=["genre"])  # drop() returns a new DataFrame; music_data itself is unchanged
y = music_data["genre"]
# train_test_split returns a list of four objects, so it can be unpacked into four variables ({2} 2:15:34).
# test_size=0.2 holds out 20% of the rows for testing.
# random_state pins the shuffle so that re-running the notebook reproduces the same split
# (and therefore the same predictions and accuracy score); without it every run differs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X, y

(    age  gender
 0    20       1
 1    23       1
 2    25       1
 3    26       1
 4    29       1
 5    30       1
 6    31       1
 7    33       1
 8    37       1
 9    20       0
 10   21       0
 11   25       0
 12   26       0
 13   27       0
 14   30       0
 15   31       0
 16   34       0
 17   35       0,
 0        HipHop
 1        HipHop
 2        HipHop
 3          Jazz
 4          Jazz
 5          Jazz
 6     Classical
 7     Classical
 8     Classical
 9         Dance
 10        Dance
 11        Dance
 12     Acoustic
 13     Acoustic
 14     Acoustic
 15    Classical
 16    Classical
 17    Classical
 Name: genre, dtype: object)

In [9]:
# (4) Build the model. random_state is fixed because the tree permutes features at each split,
# so ties between equally good splits could otherwise be resolved differently from run to run.
model = DecisionTreeClassifier(random_state=42)
# (5) Train on the training inputs (X) and outputs (y).
# ".values" passes plain arrays; the original author noted a UserWarning when it is omitted.
model.fit(X_train.values, y_train.values)
# (6) Predict the genre for every row of the held-out test set.
# For ad-hoc inputs, pass rows of [age, gender] instead, e.g. model.predict([[21, 1], [22, 0]]),
# where [21, 1] = 21-year-old male and [22, 0] = 22-year-old female.
predictions = model.predict(X_test.values)
predictions  # One predicted genre per test row, in the same order as X_test

array(['Classical', 'Classical', 'Classical', 'Jazz'], dtype=object)

In [6]:
# (7) Compare the expected (actual) genres of the test set with the model's predicted genres.
# The result is a number between 0 and 1 inclusive.
# Greater quality and quantity of data = greater accuracy.
# More training data relative to testing data = greater accuracy.
score = accuracy_score(y_true=y_test, y_pred=predictions)
score

1.0

In [7]:
# {1} Model persistence
# Save the trained model to a file with the given name ...
joblib.dump(value=model, filename="music-recommender.joblib")
# ... then read the saved model back into the notebook; no retraining is needed.
# NOTE: joblib.load unpickles the file, so only load files that come from a trusted source.
old_model = joblib.load(filename="music-recommender.joblib")

In [8]:
# {2} Export the trained decision tree to a DOT file (DOT is a graph description language).
tree.export_graphviz(
    model,
    out_file="music-recommender.dot",
    # Names shown in each node's decision rule; taken from X so they always match
    # the columns (and column order) the model was trained on.
    feature_names=list(X.columns),
    # Classes = output values. model.classes_ holds exactly the classes the model was fitted on,
    # already unique and sorted, so the labels stay aligned with the tree even if a genre
    # happened to be absent from the training split.
    class_names=list(model.classes_),
    label="all",  # Show informative labels (impurity, samples, ...) on every node
    rounded=True,  # Gives nodes rounded corners; must be a real bool, not the string "True"
    filled=True,  # Fills nodes with colour; must be a real bool, not the string "True"
)
