# Machine Learning Steps
1. Import the Data
2. Clean the Data
3. Split the Data into Training / Test Sets
4. Create a Model
5. Train the Model
6. Make Predictions
7. Evaluate and Improve

## Tools
* Jupyter
* scikit-learn (`sklearn`)

Source (video tutorial): https://www.youtube.com/watch?v=7eh4d6sabA0&t=139s

Source (Kaggle): https://www.kaggle.com/





## Loading a file

In [None]:
import pandas as pd

# Location of the video-game sales CSV in the notebook environment.
vgsales_path = '/content/vgsales.csv'
df = pd.read_csv(vgsales_path)

# Last expression of the cell, so the notebook displays (rows, columns).
df.shape

(16598, 11)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = df.describe()
summary_stats

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [None]:
# Show the DataFrame contents as a NumPy array.
# `to_numpy()` is the accessor pandas recommends over the older `.values` attribute.
df.to_numpy()

array([[1, 'Wii Sports', 'Wii', ..., 3.77, 8.46, 82.74],
       [2, 'Super Mario Bros.', 'NES', ..., 6.81, 0.77, 40.24],
       [3, 'Mario Kart Wii', 'Wii', ..., 3.79, 3.31, 35.82],
       ...,
       [16598, 'SCORE International Baja 1000: The Official Game', 'PS2',
        ..., 0.0, 0.0, 0.01],
       [16599, 'Know How 2', 'DS', ..., 0.0, 0.0, 0.01],
       [16600, 'Spirits & Spells', 'GBA', ..., 0.0, 0.0, 0.01]],
      dtype=object)

## Create and Train a model

In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier  # from scikit-learn, the most popular ML library

music_data = pd.read_csv('music.csv')

# PREPARE THE DATA: every column except 'genre' is an input feature;
# 'genre' is the label the model learns to predict.
X = music_data.drop(columns=['genre'])
y = music_data['genre']

# LEARN: fit() finds the patterns in the training data.
model = DecisionTreeClassifier()
model.fit(X, y)

# PREDICT: wrap the new samples in a DataFrame that reuses the training
# column names. The model was fitted on a DataFrame, and recent scikit-learn
# versions warn when predict() then receives input without feature names.
samples = pd.DataFrame([[21, 1], [22, 0]], columns=X.columns)
predictions = model.predict(samples)

# Last expression of the cell, so the notebook displays the predicted genres.
predictions


array(['HipHop', 'Dance'], dtype=object)

## Calculate the model's accuracy (to compare models and choose the better one)

In [None]:


import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier  # from scikit-learn, the most popular ML library

target_column = 'genre'
music_data = pd.read_csv('music.csv')

# Inputs are every column except the target; the target column is the label.
X = music_data.drop(columns=[target_column])
y = music_data[target_column]

# Hold out 20% of the rows for testing. No random_state is passed, so the
# split is different on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# fit() finds the patterns in the training portion only.
model = DecisionTreeClassifier().fit(X_train, y_train)
predictions = model.predict(X_test)

# Fraction of held-out rows whose genre was predicted correctly.
score = accuracy_score(y_test, predictions)

# Last expression of the cell; the value changes from one execution to the next.
score
                       

0.5

## Persisting Models

In [None]:
# `sklearn.externals.joblib` was removed from scikit-learn; joblib is now
# imported directly as its own package.
import joblib
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# One-time step (kept commented out): train a model and save it to disk.
# music_data = pd.read_csv('music.csv')
# X = music_data.drop(columns=['genre'])
# y = music_data['genre']
# model = DecisionTreeClassifier()
# model.fit(X, y)
# joblib.dump(model, 'music-recommender.joblib')

# Load the previously saved model instead of retraining it.
# NOTE: joblib files are pickle-based; only load files from a trusted source.
model = joblib.load('/content/music-recommender.joblib')

# Predict with the restored model.
predictions = model.predict([[21, 1], [22, 0]])

# Last expression of the cell, so the notebook displays the predictions.
predictions


array(['HipHop', 'Dance'], dtype=object)

## Visualizing Decision Trees

In [None]:
import pandas as pd
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier  # from scikit-learn, the most popular ML library

music_data = pd.read_csv('music.csv')

# PREPARE THE DATA: features are every column except 'genre'; 'genre' is the label.
X = music_data.drop(columns=['genre'])
y = music_data['genre']

# Create the model and train it on the full dataset.
model = DecisionTreeClassifier().fit(X, y)

# Write the fitted tree to a Graphviz DOT file for visualization.
genre_names = sorted(y.unique())  # distinct genres in sorted order
tree.export_graphviz(
    model,
    out_file='music-recommender.dot',
    feature_names=['age', 'gender'],
    class_names=genre_names,
    label='all',    # show informative labels on every node
    rounded=True,   # draw node boxes with rounded corners
    filled=True,    # colour each node by its majority class
)
