### Classification using scikit-learn (with pandas)

In [1]:
import csv
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [17]:
# Read Cities.csv into a dataframe and preview the first rows.
# Note: For a dataframe D, D.loc[i] selects the row whose index *label* is i.
# With the default 0..n-1 index that read_csv creates here, that is also the
# i-th row; use D.iloc[i] when a positional lookup is what you mean.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a
# saved index written into the CSV - consider index_col=0; confirm against the file.
cities = pd.read_csv('Cities.csv')
cities.head()

Unnamed: 0,city,country,latitude,longitude,temperature
0,Aalborg,Denmark,57.03,9.92,7.52
1,Aberdeen,United Kingdom,57.17,-2.08,8.1
2,Abisko,Sweden,63.35,18.83,0.2
3,Adana,Turkey,36.99,35.32,18.67
4,Albacete,Spain,39.0,-1.87,12.62


In [31]:
# Add a 'category' column that bins each city's temperature:
#   temperature < 5 -> 'cold', < 9 -> 'cool', < 15 -> 'warm', otherwise 'hot'.
# pd.cut bins the whole column in one vectorised call. This replaces the
# row-by-row cities.loc[i]['temperature'] lookups, which build a Series per
# row and are only correct while the index labels are exactly 0..n-1.
temperature_edges = [float('-inf'), 5, 9, 15, float('inf')]
category_labels = ['cold', 'cool', 'warm', 'hot']
cities['category'] = pd.cut(
    cities['temperature'],
    bins=temperature_edges,
    labels=category_labels,
    right=False,  # intervals are [low, high), matching the strict '<' tests
).astype(object)  # keep plain string labels rather than a Categorical column
# A missing temperature stays missing (NaN) instead of being labelled 'hot'.

# Report how many cities fall into each category
for label in category_labels:
    print(label + ":", int((cities['category'] == label).sum()))

cold: 17
cool: 92
warm: 79
hot: 25


In [33]:
# Split the cities data into a leading training slice and a trailing test slice
percenttrain = 0.85  # fraction of rows used for training
numitems = cities.shape[0]
numtrain = int(numitems * percenttrain)
numtest = numitems - numtrain
print('Training set', numtrain, 'items')
print('Test set', numtest, 'items')
# Positional slices: the first numtrain rows train, the remainder test
citiesTrain = cities.iloc[:numtrain]
citiesTest = cities.iloc[numtrain:]

Training set 181 items
Test set 32 items


In [108]:
import numpy as np

### K-nearest-neighbors classification

In [109]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
neighbors = 8
classifier = KNeighborsClassifier(n_neighbors=neighbors)
classifier.fit(citiesTrain[features], citiesTrain['category'])
predictions = classifier.predict(citiesTest[features])

# Calculate accuracy: number of test cities predicted correctly,
# divided by the number of predictions
actual = citiesTest['category'].values
accuracy = np.sum(predictions == actual) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.78125


### <font color="green">Your Turn: K-nearest-neighbors on World Cup Data</font>

In [110]:
# Set-up for predicting position from one or more of
# minutes, shots, passes, tackles, saves.
# The rows are re-ordered by surname and re-indexed from 0 before splitting
# (per the original note, this reordering is meant to avoid team bias).
players = (
    pd.read_csv('Players.csv')
    .sort_values(by='surname')
    .reset_index(drop=True)
)
percenttrain = 0.95  # fraction of rows used for training
nItems = players.shape[0]
nTrain = int(nItems * percenttrain)
nTest = nItems - nTrain
print('Training set:', nTrain, 'items')
print('Test set:', nTest, 'items')
# Positional slices: the first nTrain rows train, the remainder test
playersTrain = players.iloc[:nTrain]
playersTest = players.iloc[nTrain:]

Training set: 565 items
Test set: 30 items


In [111]:
# Classification step for the World Cup data.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
neighbors = 10
classifier = KNeighborsClassifier(n_neighbors=neighbors)
classifier.fit(playersTrain[features], playersTrain['position'])
predictions = classifier.predict(playersTest[features])

# Calculate accuracy: number of test players predicted correctly,
# divided by the number of predictions
actual = playersTest['position'].values
accuracy = np.sum(predictions == actual) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.566666666667


### Decision tree classification

In [112]:
# Predict temperature category from other features
# Try other values for split, other features
features = ['longitude','latitude']
split = 10
# min_samples_split is optional. random_state is pinned because scikit-learn
# randomly permutes the candidate features at each split, so an unseeded tree
# (and the accuracy printed below) can change from one run to the next.
dt = DecisionTreeClassifier(min_samples_split=split, random_state=0)
dt.fit(citiesTrain[features],citiesTrain['category'])
predictions = dt.predict(citiesTest[features])

# Calculate accuracy: number of test cities predicted correctly,
# divided by the number of predictions
actual = citiesTest['category'].values
accuracy = np.sum(predictions == actual) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.6875


### "Forest" of decision trees

In [113]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
# Try other values for trees
trees = 100
# random_state is pinned so the bootstrap samples and per-split feature
# choices are the same on every run; without it the forest, and the accuracy
# printed below, differ each time the cell is executed.
rf = RandomForestClassifier(n_estimators=trees, random_state=0)
rf.fit(citiesTrain[features],citiesTrain['category'])
predictions = rf.predict(citiesTest[features])

# Calculate accuracy: number of test cities predicted correctly,
# divided by the number of predictions
actual = citiesTest['category'].values
accuracy = np.sum(predictions == actual) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.78125


### <font color="green">Your Turn: Decision tree and forest of trees on World Cup Data</font>

In [114]:
# SINGLE TREE
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features and different values for min_samples_split.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
split = 10
# min_samples_split is optional. random_state is pinned because scikit-learn
# randomly permutes the candidate features at each split, so an unseeded tree
# (and the accuracy printed below) can change from one run to the next.
dt = DecisionTreeClassifier(min_samples_split=split, random_state=0)
dt.fit(playersTrain[features],playersTrain['position'])
predictions = dt.predict(playersTest[features])

# Calculate accuracy: number of test players predicted correctly,
# divided by the number of predictions
actual = playersTest['position'].values
accuracy = np.sum(predictions == actual) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.466666666667


In [115]:
# FOREST OF TREES
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different values for n_estimators.
# What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
trees = 10
# random_state is pinned so the bootstrap samples and per-split feature
# choices are the same on every run; without it the forest, and the accuracy
# printed below, differ each time the cell is executed.
rf = RandomForestClassifier(n_estimators=trees, random_state=0)
rf.fit(playersTrain[features],playersTrain['position'])
predictions = rf.predict(playersTest[features])

# Calculate accuracy: number of test players predicted correctly,
# divided by the number of predictions
actual = playersTest['position'].values
accuracy = np.sum(predictions == actual) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.533333333333


### Naive Bayes classification

In [117]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
nb = GaussianNB()
nb.fit(citiesTrain[features], citiesTrain['category'])
predictions = nb.predict(citiesTest[features])

# Calculate accuracy: number of test cities predicted correctly,
# divided by the number of predictions
actual = citiesTest['category'].values
accuracy = np.sum(predictions == actual) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy: 0.78125
Accuracy:  0.78125


### <font color="green">Your Turn: Naive Bayes on World Cup Data</font>

In [119]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features. What's the highest accuracy you can get?
features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
nb = GaussianNB()
nb.fit(playersTrain[features], playersTrain['position'])
predictions = nb.predict(playersTest[features])

# Calculate accuracy: number of test players predicted correctly,
# divided by the number of predictions
actual = playersTest['position'].values
accuracy = np.sum(predictions == actual) / len(predictions)
print('Accuracy: ', accuracy)

Accuracy:  0.666666666667
