# PYTHON FOR MACHINE LEARNING

## Loading Data

In [5]:
import pandas as pd
# Load the video game sales CSV (path is relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv('vgsales.csv')
# A bare name on the last line of a cell renders the DataFrame as the cell's output.
df

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


## Describing the dataset

In [7]:
# Dimensions of the dataset as a (rows, columns) tuple -- comparable to dim(df) in R.
df.shape

(16598, 11)

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) -- comparable to summary(df) in R.
# Only the numeric columns appear in the result.
df.describe()

Unnamed: 0,Rank,Year,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
count,16598.0,16327.0,16598.0,16598.0,16598.0,16598.0,16598.0
mean,8300.605254,2006.406443,0.264667,0.146652,0.077782,0.048063,0.537441
std,4791.853933,5.828981,0.816683,0.505351,0.309291,0.188588,1.555028
min,1.0,1980.0,0.0,0.0,0.0,0.0,0.01
25%,4151.25,2003.0,0.0,0.0,0.0,0.0,0.06
50%,8300.5,2007.0,0.08,0.02,0.0,0.01,0.17
75%,12449.75,2010.0,0.24,0.11,0.04,0.04,0.47
max,16600.0,2020.0,41.49,29.02,10.22,10.57,82.74


In [14]:
# The dataset as a NumPy array, one inner array per row.
# DataFrame.to_numpy() is the accessor the pandas documentation recommends over `.values`.
# Because the columns mix strings and numbers, the resulting array has dtype=object.
df.to_numpy()

array([[1, 'Wii Sports', 'Wii', ..., 3.77, 8.46, 82.74],
       [2, 'Super Mario Bros.', 'NES', ..., 6.81, 0.77, 40.24],
       [3, 'Mario Kart Wii', 'Wii', ..., 3.79, 3.31, 35.82],
       ...,
       [16598, 'SCORE International Baja 1000: The Official Game', 'PS2',
        ..., 0.0, 0.0, 0.01],
       [16599, 'Know How 2', 'DS', ..., 0.0, 0.0, 0.01],
       [16600, 'Spirits & Spells', 'GBA', ..., 0.0, 0.0, 0.01]],
      dtype=object)

### Jupyter Shortcuts
##### Press Esc to exit edit mode and enter command mode
##### Press A in command mode to add a cell above
##### Press B in command mode to add a cell below
##### Press D twice in command mode to delete the current cell
##### Press Ctrl+/ in edit mode to comment or uncomment the selected lines

### A Real-World ML Problem Statement
#### Recommend a music genre to a listener based on their age and gender, learned from the genres that people like them play regularly

##### Loading the dataset

In [17]:
# Load the music dataset (columns: age, gender, genre) into a DataFrame.
m = pd.read_csv('music.csv')
# Display the loaded DataFrame as the cell's output.
m

Unnamed: 0,age,gender,genre
0,20,1,HipHop
1,23,1,HipHop
2,25,1,HipHop
3,26,1,Jazz
4,29,1,Jazz
5,30,1,Jazz
6,31,1,Classical
7,33,1,Classical
8,37,1,Classical
9,20,0,Dance


##### Cleaning/Preparing the dataset

In [19]:
# The dataset contains no null or NA values, so there is nothing to clean.
# What remains is separating it into an input set (features) and an output set (labels).
# Input set: every column except 'genre'. DataFrame.drop returns a new frame
# and leaves `m` itself unmodified.
x = m.drop(labels='genre', axis='columns')
x

Unnamed: 0,age,gender
0,20,1
1,23,1
2,25,1
3,26,1
4,29,1
5,30,1
6,31,1
7,33,1
8,37,1
9,20,0


In [23]:
# Output set: the 'genre' column on its own, i.e. the label the model has to predict.
# Selecting a single column yields a pandas Series.
y = m.loc[:, 'genre']
y

0        HipHop
1        HipHop
2        HipHop
3          Jazz
4          Jazz
5          Jazz
6     Classical
7     Classical
8     Classical
9         Dance
10        Dance
11        Dance
12     Acoustic
13     Acoustic
14     Acoustic
15    Classical
16    Classical
17    Classical
Name: genre, dtype: object

##### Building the Model
###### Here we use the decision tree algorithm to predict the preferred music genre for a given age and gender

In [27]:
from sklearn.tree import DecisionTreeClassifier
# sklearn is the scikit-learn package, tree is one of its modules, and
# DecisionTreeClassifier is the class in that module that implements the
# decision tree algorithm.

# Create an instance of the classifier. A fixed random_state makes the fitted
# tree, and therefore its predictions, reproducible from run to run.
model = DecisionTreeClassifier(random_state=42)
model.fit(x, y)  # learn from the input set (x) and the output set (y)

# Listeners to predict for: a 21 year old male and a 22 year old female.
# They are wrapped in a DataFrame that reuses the columns the model was fitted
# on, so every value is matched to the right feature by name and scikit-learn
# does not warn that the input lacks feature names.
samples = pd.DataFrame([[21, 1], [22, 0]], columns=x.columns)
predictions = model.predict(samples)
predictions



array(['HipHop', 'Dance'], dtype=object)

##### Measure accuracy of the model

In [38]:
# Split the dataset into a training set and a test set.
# Typically 70-80% of the data is used for training and the remaining 20-30% for testing.
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# test_size=0.2 puts 20% of the rows into the test set.
# random_state pins the shuffle, so the split -- and therefore the accuracy
# reported below -- is the same on every run instead of changing each time.
# The call returns four objects: train inputs, test inputs, train outputs, test outputs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Experiment: with test_size=0.8 only 20% of the rows are left for training
# and the accuracy drops drastically:
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.8, random_state=42)

# Build the model from the training set only, not the entire dataset.
model = DecisionTreeClassifier(random_state=42)
model.fit(x_train, y_train)

# Predict for the held-out test inputs instead of hand-written sample values.
predictions = model.predict(x_test)

# Accuracy: the fraction of test rows whose predicted genre equals the actual genre.
score = accuracy_score(y_test, predictions)
score

0.75