### Classification using scikit-learn (with pandas)

In [1]:
import csv
import pandas as pd
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
# Read Cities.csv into dataframe, add column for temperature category
f = open('Cities.csv')
cities = pd.read_csv(f)
cats = []
for i in range(0,len(cities)):
    if cities.ix[i]['temperature'] < 5:
        cats.append('cold')
    elif cities.ix[i]['temperature'] < 9:
        cats.append('cool')
    elif cities.ix[i]['temperature'] < 15:
        cats.append('warm')
    else: cats.append('hot')
cities['category'] = cats
print "cold:", len(cities[(cities.category == 'cold')])
print "cool:", len(cities[(cities.category == 'cool')])
print "warm:", len(cities[(cities.category == 'warm')])
print "hot:", len(cities[(cities.category == 'hot')])

cold: 17
cool: 92
warm: 79
hot: 25


In [3]:
# Create training and test sets for cities data
numitems = len(cities)
percenttrain = 0.85
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
print 'Training set', numtrain, 'items'
print'Test set', numtest, 'items'
citiesTrain = cities[0:numtrain]
citiesTest = cities[numtrain:numitems]

Training set 181 items
Test set 32 items


### K-nearest-neighbors classification

In [4]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
max = 0
for neighbors in range(1,100):
    classifier = KNeighborsClassifier(neighbors)
    classifier.fit(citiesTrain[features], citiesTrain['category'])
    predictions = classifier.predict(citiesTest[features])
# Calculate accuracy
    numtrain = len(citiesTrain)
    numtest = len(citiesTest)
    correct = 0
    for i in range(0,numtest):
#        print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
        if predictions[i] == citiesTest.ix[numtrain+i]['category']: correct +=1

    if max < float(correct)/float(numtest):
        max = float(correct)/float(numtest)
        print neighbors
        print 'Percent correct:', float(correct)/float(numtest)
# Comment out print, play with other values for neighbors, try 'temperature'
# as feature

1
Percent correct: 0.6875
4
Percent correct: 0.75
5
Percent correct: 0.84375


### <font color="green">Your Turn: K-nearest-neighbors on World Cup Data</font>

In [5]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# This box does all the set-up, including reordering the data to avoid team bias.
f = open('Players.csv')
players = pd.read_csv(f)
players = players.sort_values(by='surname')
players = players.reset_index(drop=True)
numitems = len(players)
percenttrain = 0.95
numtrain = int(numitems*percenttrain)
numtest = numitems - numtrain
playersTrain = players[0:numtrain]
playersTest = players[numtrain:numitems]
print 'Training set', numtrain, 'items'
print'Test set', numtest, 'items'

Training set 565 items
Test set 30 items


In [17]:
import itertools
def all_subsets(lst):
    """Return every non-empty combination of the items in ``lst``.

    Each combination is a list. Results are ordered by size, smallest first,
    and within one size in the order ``itertools.combinations`` yields them.
    """
    sizes = range(1, len(lst) + 1)
    return [list(combo)
            for size in sizes
            for combo in itertools.combinations(lst, size)]

features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
#print all_subsets(features)

for feature in all_subsets(features):
    print '==New loop is starting=='
    max = 0
    for neighbors in range(1,100):
        classifier = KNeighborsClassifier(neighbors)
        classifier.fit(playersTrain[feature], playersTrain['position'])
        predictions = classifier.predict(playersTest[feature])
    # Calculate accuracy
        numtrain = len(playersTrain)
        numtest = len(playersTest)
        correct = 0
        for i in range(0,numtest):
    #        print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
            if predictions[i] == playersTest.ix[numtrain+i]['position']: correct +=1

        if max < float(correct)/float(numtest):
            max = float(correct)/float(numtest)
            print neighbors
            print 'Percent correct:', float(correct)/float(numtest)

==New loop is starting==
1
Percent correct: 0.266666666667
3
Percent correct: 0.4
5
Percent correct: 0.466666666667
==New loop is starting==
1
Percent correct: 0.3
3
Percent correct: 0.4
11
Percent correct: 0.433333333333
15
Percent correct: 0.533333333333
53
Percent correct: 0.566666666667
73
Percent correct: 0.6
==New loop is starting==
1
Percent correct: 0.5
==New loop is starting==
1
Percent correct: 0.333333333333
3
Percent correct: 0.4
6
Percent correct: 0.466666666667
7
Percent correct: 0.566666666667
==New loop is starting==
1
Percent correct: 0.266666666667
6
Percent correct: 0.466666666667
==New loop is starting==
1
Percent correct: 0.4
3
Percent correct: 0.433333333333
5
Percent correct: 0.466666666667
13
Percent correct: 0.5
==New loop is starting==
1
Percent correct: 0.5
3
Percent correct: 0.566666666667
==New loop is starting==
1
Percent correct: 0.433333333333
5
Percent correct: 0.466666666667
8
Percent correct: 0.533333333333
==New loop is starting==
1
Percent correct: 

In [13]:
# This box does the classification.
# Try different features and different numbers of neighbors.
# What's the highest accuracy you can get?

# Predict temperature category from other features
features = ['shots', 'tackles']
max = 0
for neighbors in range(1,100):
    classifier = KNeighborsClassifier(neighbors)
    classifier.fit(playersTrain[features], playersTrain['position'])
    predictions = classifier.predict(playersTest[features])
# Calculate accuracy
    numtrain = len(playersTrain)
    numtest = len(playersTest)
    correct = 0
    for i in range(0,numtest):
#        print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
        if predictions[i] == playersTest.ix[numtrain+i]['position']: correct +=1

    if max < float(correct)/float(numtest):
        max = float(correct)/float(numtest)
        print neighbors
        print 'Percent correct:', float(correct)/float(numtest)
# Comment out print, play with other values for neighbors, try 'temperature'
# as feature

1
Percent correct: 0.666666666667
8
Percent correct: 0.7
40
Percent correct: 0.733333333333


### Decision tree classification

In [25]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
featurevals = citiesTrain[features]
labels = citiesTrain['category']
dt = DecisionTreeClassifier(min_samples_split=10) # parameter is optional
dt.fit(featurevals,labels)
predictions = dt.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
correct = 0
for i in range(0,numtest):
#    print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
    if predictions[i] == citiesTest.ix[numtrain+i]['category']: correct +=1
print 'Percent correct:', float(correct)/float(numtest)
# Comment out print, play with other values for min_samples_split, try 'temperature'
# as feature 

Percent correct: 0.75


### "Forest" of decision trees

In [33]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
featurevals = citiesTrain[features]
labels = citiesTrain['category']
rf = RandomForestClassifier(n_estimators=10) # number of different decision trees
rf.fit(featurevals,labels)
predictions = rf.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
correct = 0
for i in range(0,numtest):
#    print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
    if predictions[i] == citiesTest.ix[numtrain+i]['category']: correct +=1
print 'Percent correct:', float(correct)/float(numtest)
# Comment out print, play with other values for n_estimators

Percent correct: 0.75


### <font color="green">Your Turn: Decision tree and forest of trees on World Cup Data</font>

In [42]:
# SINGLE TREE
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features and different values for min_samples_split.
# What's the highest accuracy you can get?

import itertools
def all_subsets(lst):
    """Return every non-empty combination of the items in ``lst``.

    Each combination is a list. Results are ordered by size, smallest first,
    and within one size in the order ``itertools.combinations`` yields them.
    """
    sizes = range(1, len(lst) + 1)
    return [list(combo)
            for size in sizes
            for combo in itertools.combinations(lst, size)]

features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
#print all_subsets(features)

# Predict temperature category from other features
for feature in all_subsets(features):
    print '==New loop is starting=='
    print feature
    featurevals = playersTrain[feature]
    labels = playersTrain['position']
    dt = DecisionTreeClassifier(min_samples_split=10) # parameter is optional
    dt.fit(featurevals,labels)
    predictions = dt.predict(playersTest[feature])
    # Calculate accuracy
    numtrain = len(playersTrain)
    numtest = len(playersTest)
    correct = 0
    for i in range(0,numtest):
    #    print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
        if predictions[i] == playersTest.ix[numtrain+i]['position']: correct +=1
    print 'Percent correct:', float(correct)/float(numtest)
    # Comment out print, play with other values for min_samples_split, try 'temperature'
    # as feature 



==New loop is starting==
['minutes']
Percent correct: 0.366666666667
==New loop is starting==
['shots']
Percent correct: 0.5
==New loop is starting==
['passes']
Percent correct: 0.5
==New loop is starting==
['tackles']
Percent correct: 0.366666666667
==New loop is starting==
['saves']
Percent correct: 0.466666666667
==New loop is starting==
['minutes', 'shots']
Percent correct: 0.466666666667
==New loop is starting==
['minutes', 'passes']
Percent correct: 0.433333333333
==New loop is starting==
['minutes', 'tackles']
Percent correct: 0.433333333333
==New loop is starting==
['minutes', 'saves']
Percent correct: 0.366666666667
==New loop is starting==
['shots', 'passes']
Percent correct: 0.5
==New loop is starting==
['shots', 'tackles']
Percent correct: 0.533333333333
==New loop is starting==
['shots', 'saves']
Percent correct: 0.5
==New loop is starting==
['passes', 'tackles']
Percent correct: 0.366666666667
==New loop is starting==
['passes', 'saves']
Percent correct: 0.5
==New loop is

In [43]:
# FOREST OF TREES
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features and different values for n_estimators.
# What's the highest accuracy you can get?

import itertools
def all_subsets(lst):
    """Return every non-empty combination of the items in ``lst``.

    Each combination is a list. Results are ordered by size, smallest first,
    and within one size in the order ``itertools.combinations`` yields them.
    """
    sizes = range(1, len(lst) + 1)
    return [list(combo)
            for size in sizes
            for combo in itertools.combinations(lst, size)]

features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
#print all_subsets(features)

# Predict temperature category from other features
for feature in all_subsets(features):
    print '==New loop is starting=='
    print feature
    featurevals = playersTrain[feature]
    labels = playersTrain['position']
    dt = RandomForestClassifier(n_estimators=10) # parameter is optional
    dt.fit(featurevals,labels)
    predictions = dt.predict(playersTest[feature])
    # Calculate accuracy
    numtrain = len(playersTrain)
    numtest = len(playersTest)
    correct = 0
    for i in range(0,numtest):
    #    print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
        if predictions[i] == playersTest.ix[numtrain+i]['position']: correct +=1
    print 'Percent correct:', float(correct)/float(numtest)
    # Comment out print, play with other values for min_samples_split, try 'temperature'
    # as feature 



==New loop is starting==
['minutes']
Percent correct: 0.366666666667
==New loop is starting==
['shots']
Percent correct: 0.6
==New loop is starting==
['passes']
Percent correct: 0.433333333333
==New loop is starting==
['tackles']
Percent correct: 0.533333333333
==New loop is starting==
['saves']
Percent correct: 0.466666666667
==New loop is starting==
['minutes', 'shots']
Percent correct: 0.466666666667
==New loop is starting==
['minutes', 'passes']
Percent correct: 0.433333333333
==New loop is starting==
['minutes', 'tackles']
Percent correct: 0.433333333333
==New loop is starting==
['minutes', 'saves']
Percent correct: 0.333333333333
==New loop is starting==
['shots', 'passes']
Percent correct: 0.6
==New loop is starting==
['shots', 'tackles']
Percent correct: 0.666666666667
==New loop is starting==
['shots', 'saves']
Percent correct: 0.5
==New loop is starting==
['passes', 'tackles']
Percent correct: 0.4
==New loop is starting==
['passes', 'saves']
Percent correct: 0.5
==New loop is

### Naive Bayes classification

In [45]:
# Predict temperature category from other features
features = ['longitude', 'latitude']
featurevals = citiesTrain[features]
labels = citiesTrain['category']
nb = GaussianNB()
nb.fit(featurevals,labels)
predictions = nb.predict(citiesTest[features])
# Calculate accuracy
numtrain = len(citiesTrain)
numtest = len(citiesTest)
correct = 0
for i in range(0,numtest):
#    print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
    if predictions[i] == citiesTest.ix[numtrain+i]['category']: correct +=1
print 'Percent correct:', float(correct)/float(numtest)
# Comment out print, try removing 'longitude'

Percent correct: 0.78125


### <font color="green">Your Turn: Naive Bayes on World Cup Data</font>

In [48]:
# Predict position from one or more of minutes, shots, passes, tackles, saves.
# Try different features. What's the highest accuracy you can get?

import itertools
def all_subsets(lst):
    """Return every non-empty combination of the items in ``lst``.

    Each combination is a list. Results are ordered by size, smallest first,
    and within one size in the order ``itertools.combinations`` yields them.
    """
    sizes = range(1, len(lst) + 1)
    return [list(combo)
            for size in sizes
            for combo in itertools.combinations(lst, size)]

features = ['minutes', 'shots', 'passes', 'tackles', 'saves']
#print all_subsets(features)


for feature in all_subsets(features):
    print '==New loop is starting=='
    print feature
    featurevals = playersTrain[feature]
    labels = playersTrain['position']
    nb = GaussianNB()
    nb.fit(featurevals,labels)
    predictions = nb.predict(playersTest[feature])
    
    # Calculate accuracy
    numtrain = len(playersTrain)
    numtest = len(playersTest)
    correct = 0
    for i in range(0,numtest):
    #    print 'Predicted:', predictions[i], ' Actual:', citiesTest.ix[numtrain+i]['category']
        if predictions[i] == playersTest.ix[numtrain+i]['position']: correct +=1
    print 'Percent correct:', float(correct)/float(numtest)
    # Comment out print, play with other values for min_samples_split, try 'temperature'
    # as feature 



==New loop is starting==
['minutes']
Percent correct: 0.4
==New loop is starting==
['shots']
Percent correct: 0.366666666667
==New loop is starting==
['passes']
Percent correct: 0.566666666667
==New loop is starting==
['tackles']
Percent correct: 0.433333333333
==New loop is starting==
['saves']
Percent correct: 0.466666666667
==New loop is starting==
['minutes', 'shots']
Percent correct: 0.433333333333
==New loop is starting==
['minutes', 'passes']
Percent correct: 0.466666666667
==New loop is starting==
['minutes', 'tackles']
Percent correct: 0.433333333333
==New loop is starting==
['minutes', 'saves']
Percent correct: 0.4
==New loop is starting==
['shots', 'passes']
Percent correct: 0.433333333333
==New loop is starting==
['shots', 'tackles']
Percent correct: 0.6
==New loop is starting==
['shots', 'saves']
Percent correct: 0.433333333333
==New loop is starting==
['passes', 'tackles']
Percent correct: 0.4
==New loop is starting==
['passes', 'saves']
Percent correct: 0.566666666667
==