# Introduction to sklearn: `fit`, `predict`, and `score`

In [None]:
%matplotlib inline
import sklearn
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import codecs
import json

### Task: Professional athlete classification

From a pool of professional athletes, can we predict who is a sumo wrestler and who is a Major League Baseball player?

![Sumo vs. Major League Baseball](sumo-vs-mlb.jpg)


#### Sumos
Data on sumo wrestlers was obtained by issuing this [query](http://tinyurl.com/m5k2ej8) on FreeBase.

In [None]:
# Parse the query result as UTF-8 JSON. The `with` block guarantees the file
# handle is closed, instead of leaving it open until garbage collection.
with codecs.open("sumos.json", encoding='utf-8') as sumo_file:
    sumo_json = json.load(sumo_file)

# The records live under the top-level 'result' key.
sumo = pd.DataFrame(sumo_json['result'])
# Height is given in metres; add a centimetre column to match the MLB table.
sumo['height_cm'] = sumo.height_meters * 100
sumo.tail()

#### Baseball players

The dataset with height and weight for players in Major League Baseball (MLB) was downloaded from this [HTML page](http://wiki.stat.ucla.edu/socr/index.php/SOCR_Data_MLB_HeightsWeights#SOCR_Data_-_1035_Records_of_Heights_.28in.29_and_Weights_.28lbs.29_of_Major_League_Baseball_Players), copied into a spreadsheet and exported as a csv-file.

In [None]:
# Load the MLB table and derive metric columns from the imperial ones
# (1 inch = 2.54 cm, 1 pound = 0.45359237 kg).
mlb = pd.read_csv("mlb_heights.csv", encoding='utf-8').assign(
    height_cm=lambda players: players.Height_inches * 2.54,
    weight_kg=lambda players: players.Weight_pounds * 0.45359237,
)
mlb.tail()

#### Combining MLB players and sumo wrestlers

In [None]:
# Stack all sumo wrestlers on top of a slice of the MLB players, keeping only
# the two shared feature columns.
# `.ix` was deprecated and later removed from pandas; `.loc` is the label-based
# replacement. The slice 100:200 is by label and includes both endpoints, which
# is what `.ix` did on the default integer index produced by `read_csv`.
sumo_vs_mlb = pd.concat([sumo[['height_cm', 'weight_kg']],
                         mlb.loc[100:200, ['height_cm', 'weight_kg']]])
sumo_vs_mlb.tail()

In [None]:
# Label vector: 1.0 for sumo wrestlers, 0.0 for MLB players.
# The sumo rows come first in sumo_vs_mlb, so only the leading len(sumo)
# entries are switched on.
is_sumo = np.zeros(len(sumo_vs_mlb))
is_sumo[:len(sumo)] = 1
is_sumo

In [None]:
# One colour per row: blue where is_sumo is set, red otherwise.
colors = np.where(is_sumo.astype(bool), 'b', 'r')
# Weight on the x-axis, height on the y-axis; the trailing `;` hides the Axes repr.
sumo_vs_mlb.plot(x='weight_kg', y='height_cm', kind='scatter', color=colors);

### From pandas to sklearn

In [None]:
# Plain NumPy feature matrix for sklearn: column 0 is weight_kg, column 1 is
# height_cm (the same order as the x/y axes of the scatter plot).
X = sumo_vs_mlb.loc[:, ['weight_kg', 'height_cm']].values
print("shape", X.shape)
X

#### Creating a fixed train and test set

In [None]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` now lives in `sklearn.model_selection`. The fallback
# keeps the notebook runnable on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# A fixed random_state makes the shuffled split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, is_sumo, random_state=100)
print("Train shape", X_train.shape, "Test shape", X_test.shape)

### Fitting the classifier

In [None]:
# Import the estimator and create it with its default hyperparameters.
from sklearn.linear_model import Perceptron
perceptron = Perceptron()
# Bare last expression: the notebook displays the estimator's repr.
perceptron

In [None]:
# Train on the training split only; the trailing `;` suppresses the cell output.
perceptron.fit(X_train, y_train);

In [None]:
# Predictions for the held-out test rows, displayed as the cell output.
perceptron.predict(X_test)

### Evaluation

In [None]:
# Keep the test-set predictions in y_pred so the evaluation cells can compare
# them with y_test.
y_pred = perceptron.predict(X_test)
y_pred

In [None]:
# Accuracy by hand: number of matching predictions divided by the test-set size.
n_correct = np.sum(y_pred == y_test)
print("Accuracy", n_correct / float(len(y_test)))

In [None]:
# Let the estimator score itself on the test split.
# NOTE(review): for sklearn classifiers `score` is documented as mean accuracy,
# so this should equal the manually computed accuracy — confirm when run.
perceptron.score(X_test, y_test)

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Each metric compares the true test labels with the predictions in y_pred.
print("Precision", precision_score(y_true=y_test, y_pred=y_pred))
print("Recall", recall_score(y_true=y_test, y_pred=y_pred))
print("F1 (balanced)", f1_score(y_true=y_test, y_pred=y_pred))

In [None]:
# Text report for the predictions currently stored in y_pred; print() is used
# so the report's line breaks are rendered.
print(classification_report(y_test, y_pred))

### Estimated parameters of the model

In [None]:
# Learned feature weights. coef_[0] is the weight vector later passed to
# decision_boundary; its entries follow the column order of X
# (weight_kg, height_cm).
print("shape", perceptron.coef_.shape)
perceptron.coef_

In [None]:
# Learned bias term. intercept_[0] is the bias later passed to decision_boundary.
print("shape", perceptron.intercept_.shape)
perceptron.intercept_

#### Plotting the decision boundary

In [None]:
def decision_boundary(w, bias, dist=0, x_start=0, x_end=300):
    """Return two points on the line ``w[0]*x + w[1]*y + bias = dist``.

    Parameters
    ----------
    w : sequence
        Weight vector; only ``w[0]`` (x coefficient) and ``w[1]`` (y coefficient)
        are read.
    bias : number
        Intercept term of the linear model.
    dist : number, optional
        Offset from the boundary; 0 gives the decision boundary itself.
    x_start, x_end : number, optional
        The x coordinates at which the line is evaluated.

    Returns
    -------
    tuple of two lists
        ``([x_start, x_end], [y_start, y_end])``, ready to pass to ``plt.plot``.
    """
    xs = [x_start, x_end]
    # Solve w[0]*x + w[1]*y + bias = dist for y at each x.
    ys = [-(x * w[0] + bias - dist) / w[1] for x in xs]
    return xs, ys

# Draw the perceptron's decision boundary on top of the labelled scatter plot.
xx, yy = decision_boundary(w=perceptron.coef_[0], bias=perceptron.intercept_[0])
sumo_vs_mlb.plot(x='weight_kg', y='height_cm', kind='scatter', color=colors);
# Fix the view to x in [60, 300] and y in [160, 210] before drawing the line.
plt.axis([60, 300, 160, 210])
plt.plot(xx, yy);

### All classifiers support a uniform interface

#### Same procedure as above, but with logistic regression

In [None]:
# Swap in a different estimator; it is created with its default hyperparameters.
from sklearn.linear_model import LogisticRegression

logistic = LogisticRegression()
# Bare last expression: the notebook displays the estimator's repr.
logistic

In [None]:
# Fit on the same training split as the perceptron, then draw the resulting
# boundary over the same scatter plot and axis limits.
logistic.fit(X_train, y_train)
xx, yy = decision_boundary(w=logistic.coef_[0], bias=logistic.intercept_[0])
sumo_vs_mlb.plot(x='weight_kg', y='height_cm', kind='scatter', color=colors)
plt.axis([60, 300, 160, 210])
plt.plot(xx, yy);

In [None]:
# NOTE: this rebinds y_pred, replacing the perceptron predictions stored earlier;
# re-running the perceptron evaluation cells after this one would score the
# logistic-regression predictions instead.
y_pred = logistic.predict(X_test)
print(classification_report(y_test, y_pred))