## Video 4.3 - Supervised Learning with scikit-learn

In [None]:
import pandas as pd

# Path to the Titanic training CSV.
fname = '~/data/titanic/train.csv'

# Read the whole file into one DataFrame; later cells add columns to it.
data = pd.read_csv(filepath_or_buffer=fname)

In [None]:
# Show the first five rows as a quick check of the loaded columns.
data.head(n=5)

#### Using just one feature

In [None]:
# Boolean column: True where the 'Sex' value equals 'female'.
data['IsFemale'] = data['Sex'].eq('female')

# Target vector and a one-column feature matrix.
labels = data['Survived']  # y
samples = data[['IsFemale']]  # X

#### Train/test split

In [None]:
from sklearn.model_selection import train_test_split

# 70% of the rows go to training; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    samples, labels, train_size=0.7, random_state=0
)

# Report how many rows ended up on each side of the split.
print("Samples: train={}, test={}".format(len(X_train), len(X_test)))

In [None]:
# Count the True/False values of 'IsFemale' within the training rows.
X_train.loc[:, 'IsFemale'].value_counts()

#### Dummy Classifier (most frequent class)

In [None]:
from sklearn.dummy import DummyClassifier

# Baseline model using the "most_frequent" strategy, fitted on the training split.
clf_dummy = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

# Baseline predictions for the held-out rows.
y_predicted = clf_dummy.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score

# Compare the current predictions in y_predicted against the held-out labels.
print("Accuracy={}".format(accuracy_score(y_true=y_test, y_pred=y_predicted)))

#### Random forest classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded random forest fitted on the current training split.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict the held-out rows and print the resulting accuracy.
y_predicted = clf.predict(X_test)
print("Accuracy={}".format(accuracy_score(y_test, y_predicted)))

#### Using more features

In [None]:
# Use 'Pclass' alongside 'IsFemale' and redo the split with the same settings.
labels = data['Survived']
samples = data[['IsFemale', 'Pclass']]

X_train, X_test, y_train, y_test = train_test_split(
    samples, labels, train_size=0.7, random_state=0
)

In [None]:
# Refit a seeded random forest on the current training split.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict the held-out rows and print the resulting accuracy.
y_predicted = clf.predict(X_test)
print("Accuracy={}".format(accuracy_score(y_test, y_predicted)))

In [None]:
# New column: 'Age' with missing values replaced by the sentinel -100.
data['AgeSentinel'] = data['Age'].fillna(value=-100)

In [None]:
# Feature set now includes the sentinel-filled age column.
features = ['IsFemale', 'Pclass', 'AgeSentinel']
labels = data['Survived']
samples = data[features]

X_train, X_test, y_train, y_test = train_test_split(
    samples, labels, train_size=0.7, random_state=0
)

In [None]:
# Refit a seeded random forest on the current training split.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict the held-out rows and print the resulting accuracy.
y_predicted = clf.predict(X_test)
print("Accuracy={}".format(accuracy_score(y_test, y_predicted)))

In [None]:
# Add 'Fare' to the feature set and redo the split with the same settings.
features = ['IsFemale', 'Pclass', 'AgeSentinel', 'Fare']
labels = data['Survived']
samples = data[features]

X_train, X_test, y_train, y_test = train_test_split(
    samples, labels, train_size=0.7, random_state=0
)

In [None]:
# Refit a seeded random forest on the current training split.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict the held-out rows and print the resulting accuracy.
y_predicted = clf.predict(X_test)
print("Accuracy={}".format(accuracy_score(y_test, y_predicted)))

In [None]:
# Derived column: element-wise sum of the 'SibSp' and 'Parch' columns.
data['FamilySize'] = data['SibSp'].add(data['Parch'])

# Feature set extended with the new column; split settings are unchanged.
features = ['IsFemale', 'Pclass', 'AgeSentinel', 'Fare', 'FamilySize']

labels = data['Survived']
samples = data[features]

X_train, X_test, y_train, y_test = train_test_split(
    samples, labels, train_size=0.7, random_state=0
)

In [None]:
# Refit a seeded random forest on the current training split.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)

# Predict the held-out rows and print the resulting accuracy.
y_predicted = clf.predict(X_test)
print("Accuracy={}".format(accuracy_score(y_test, y_predicted)))

#### Feature importance

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt

plt.bar(range(len(features)), clf.feature_importances_, tick_label=features)

plt.show()

#### What else? (Exercise)

- Different features?
- Different classifiers?