### A Simple Real World Problem Statement
##### Given a dataset of people's favourite music genre, age and gender, we train an ML model to predict a person's likely favourite genre from their age and gender

#### Loading the dataset


In [2]:
import pandas as pd
# Read the raw table from the CSV file (relative path, so it is resolved
# against the notebook's working directory).
m = pd.read_csv(filepath_or_buffer='music.csv')
# Leaving the frame as the cell's last expression makes the notebook render it.
m

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


#### Preparing the dataset

In [5]:
# Per the note left with this dataset it contains no null/NA values, so no
# cleaning step is performed. What remains is to separate the inputs
# (features) from the output (label) the model has to learn.

# Input set: every column except the label. `drop` hands back a new
# DataFrame and leaves `m` itself unchanged.
x = m.drop(columns='genre')
x  # not rendered: a notebook only displays the *last* expression of a cell

# Output set: the 'genre' column on its own.
y = m.loc[:, 'genre']
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

#### Building the Model
##### Here we use the Decision Tree algorithm to predict the likely music genre for a given age and gender

In [6]:
from sklearn.tree import DecisionTreeClassifier
# `sklearn` is the import name of the scikit-learn library,
# `sklearn.tree` is its module of tree-based estimators, and
# `DecisionTreeClassifier` is the class implementing the decision tree algorithm.

# Create the estimator. Fixing random_state makes the fitted tree identical on
# every run (ties between equally good splits are otherwise broken randomly).
model = DecisionTreeClassifier(random_state=42)
model.fit(x, y)  # learn from the input set (x) and the output set (y)

# Query: a 21 year old male (gender=1) and a 22 year old female (gender=0).
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column names; a bare nested list makes recent scikit-learn
# versions warn that the input "does not have valid feature names".
predictions = model.predict(
    pd.DataFrame([[21, 1], [22, 0]], columns=x.columns)
)
predictions



array(['HipHop', 'Dance'], dtype=object)

#### Measure accuracy of the model

In [7]:
# Accuracy has to be measured on rows the model has not seen during training,
# so the data is split into a training part and a test part.
# Rule of thumb: 70-80% of the rows for training, the remaining 20-30% for testing.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# train_test_split shuffles the rows before splitting; random_state pins the
# shuffle so that the same rows land in the test set (and the same score is
# reported) on every run. It returns a list of four objects, in this order:
# training inputs, test inputs, training outputs, test outputs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42  # 20% of the data is put into test
)

# If 80% of the dataset is used as test instead, only 20% is left to learn
# from and the accuracy drops drastically:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8, random_state=42)

# Build the model from the training rows only (not the entire dataset).
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

# Predict for the held-out test rows instead of hand-written input values.
predictions = model.predict(x_test)

# Fraction of test rows whose predicted genre equals the true genre.
score = accuracy_score(y_test, predictions)
score

0.75

#### Persisting Models
##### Saving the trained model so that it can simply be loaded and reused without rewriting or re-running the training code

In [13]:
import joblib

# Write the fitted model to disk.
joblib.dump(model, 'music-recommender.joblib')

# Use the saved model: load it back and predict without fitting again.
# NOTE: joblib files are pickle-based, so loading one can execute arbitrary
# code -- only load model files that come from a trusted source.
model_new = joblib.load('music-recommender.joblib')

# The model was fitted on a DataFrame with 'age' and 'gender' columns, so the
# query carries the same column names (a bare nested list makes recent
# scikit-learn versions warn about missing feature names).
pred = model_new.predict(pd.DataFrame([[21, 1]], columns=['age', 'gender']))
pred



array(['HipHop'], dtype=object)

#### Visualize the decision tree

In [16]:
from sklearn import tree

# Write the fitted tree to a Graphviz DOT file.
# class_names must line up, in ascending order, with the classes the fitted
# model actually knows. They are therefore taken from `model.classes_` rather
# than from the full label column: `model` was fitted on a random training
# subset, which is not guaranteed to contain every genre present in `y`.
tree.export_graphviz(
    model,
    out_file='music-recommender.dot',
    feature_names=['age', 'gender'],
    class_names=[str(label) for label in model.classes_],
    label='all',      # label every node with what its values mean
    rounded=True,     # rounded node boxes
    filled=True,      # colour nodes by their majority class
)