In [1]:
import numpy as np
import pandas as pd

In [2]:
# Toy records that mix a categorical feature (city) with a numeric one
# (temperature); used below to show how categorical values get encoded.
measurements = [
    {'city': place, 'temperature': degrees}
    for place, degrees in (
        ('Dubai', 33.),
        ('London', 12.),
        ('San Francisco', 18.),
    )
]

In [3]:
from sklearn.feature_extraction import DictVectorizer

# Vectorizer for lists of feature dicts such as `measurements`, created with
# default settings (the recorded repr shows sparse=True, sort=True, separator='=').
vec = DictVectorizer()
vec

DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=True)

In [7]:
# Fit the vectorizer on the records and encode them in one call. The vectorizer
# was built with sparse=True, so .toarray() densifies the result for display.
vec.fit_transform(measurements).toarray()

array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])

In [8]:
# Column labels of the encoded matrix, in column order.
# DictVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2 in favour of get_feature_names_out(); prefer the new method
# when it exists and fall back for older installs. list() keeps the displayed
# value a plain list of strings in both cases.
list(
    vec.get_feature_names_out()
    if hasattr(vec, 'get_feature_names_out')
    else vec.get_feature_names()
)

['city=Dubai', 'city=London', 'city=San Francisco', 'temperature']

In [10]:
# Now dealing with the Titanic dataset.
# NOTE: the path is relative, so the notebook has to be run from the directory
# that contains `datasets/`.

titanic = pd.read_csv('datasets/titanic3.csv')
print(titanic.columns)

Index(['pclass', 'survived', 'name', 'sex', 'age', 'sibsp', 'parch', 'ticket',
       'fare', 'cabin', 'embarked', 'boat', 'body', 'home.dest'],
      dtype='object')


In [11]:
# Preview the first rows to see the raw columns, including the text-valued
# ones (name, sex, cabin, ...) and the empty cells in boat/body.
titanic.head()

Unnamed: 0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1,1,"Allen, Miss. Elisabeth Walton",female,29.0,0,0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,1,1,"Allison, Master. Hudson Trevor",male,0.9167,1,2,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,1,0,"Allison, Miss. Helen Loraine",female,2.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,1,0,"Allison, Mr. Hudson Joshua Creighton",male,30.0,1,2,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",female,25.0,1,2,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


In [15]:
# Target vector: the `survived` column as a plain NumPy array.
labels = titanic['survived'].values

# Model inputs: a subset of the columns. `sex` and `embarked` are still
# strings at this point and get encoded further down.
features = titanic[
    ['pclass', 'sex', 'age', 'sibsp', 'parch', 'fare', 'embarked']
]


In [16]:
# Check the selection: seven columns, with sex and embarked still text-valued.
features.head()

Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked
0,1,female,29.0,0,0,211.3375,S
1,1,male,0.9167,1,2,151.55,S
2,1,female,2.0,1,2,151.55,S
3,1,male,30.0,1,2,151.55,S
4,1,female,25.0,1,2,151.55,S


In [17]:
# With no `columns` argument, the recorded output shows only the string columns
# (sex, embarked) expanded into indicator columns; the integer-coded pclass
# stays a single numeric column.
pd.get_dummies(features).head()

Unnamed: 0,pclass,age,sibsp,parch,fare,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,1,29.0,0,0,211.3375,1,0,0,0,1
1,1,0.9167,1,2,151.55,0,1,0,0,1
2,1,2.0,1,2,151.55,1,0,0,0,1
3,1,30.0,1,2,151.55,0,1,0,0,1
4,1,25.0,1,2,151.55,1,0,0,0,1


In [19]:
# Name the columns to encode explicitly so that pclass, although stored as an
# integer, is expanded into indicator columns along with sex and embarked.
features_dummies = pd.get_dummies(
    features,
    columns=['pclass', 'sex', 'embarked'],
)
features_dummies.head(16)

Unnamed: 0,age,sibsp,parch,fare,pclass_1,pclass_2,pclass_3,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
0,29.0,0,0,211.3375,1,0,0,1,0,0,0,1
1,0.9167,1,2,151.55,1,0,0,0,1,0,0,1
2,2.0,1,2,151.55,1,0,0,1,0,0,0,1
3,30.0,1,2,151.55,1,0,0,0,1,0,0,1
4,25.0,1,2,151.55,1,0,0,1,0,0,0,1
5,48.0,0,0,26.55,1,0,0,0,1,0,0,1
6,63.0,1,0,77.9583,1,0,0,1,0,0,0,1
7,39.0,0,0,0.0,1,0,0,0,1,0,0,1
8,53.0,2,0,51.4792,1,0,0,1,0,0,0,1
9,71.0,0,0,49.5042,1,0,0,0,1,1,0,0


In [21]:
# Convert the encoded frame into a single float matrix for scikit-learn.
# The explicit cast matters on pandas >= 2.0, where get_dummies emits bool
# indicator columns: mixed with the numeric columns, `.values` then comes back
# as an object array, which np.isnan cannot process. On older pandas the
# matrix is already float64 and the cast is a harmless copy.
data = features_dummies.values.astype(np.float64)

# Are there any missing values left that the models would choke on?
np.isnan(data).any()

True

In [30]:
# train_test_split was used here without ever being imported, so the cell
# failed with a NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

try:
    # scikit-learn >= 0.20: the imputer lives in sklearn.impute.
    from sklearn.impute import SimpleImputer as Imputer
except ImportError:
    # Older releases only ship sklearn.preprocessing.Imputer (removed in 0.22).
    from sklearn.preprocessing import Imputer  # handle the nan data

# Split before imputing so the fill values are learned from training rows only.
X_train, X_test, y_train, y_test = train_test_split(data, labels, random_state=0)

# With default settings the imputer replaces each NaN with the mean of its
# column, computed during fit().
imp = Imputer()
imp.fit(X_train)

# Apply the training-set means to both splits.
X_train_finite = imp.transform(X_train)
X_test_finite = imp.transform(X_test)

In [31]:
# Sanity check: after imputation the training matrix should contain no NaN.
np.isnan(X_train_finite).any()

False

In [32]:
# First model: DummyClassifier, used as a baseline.

from sklearn.dummy import DummyClassifier

# Configured with the 'most_frequent' strategy, so its score is the floor any
# real model has to beat. `strategy` is passed by keyword because
# scikit-learn >= 1.0 no longer accepts it positionally.
clf = DummyClassifier(strategy='most_frequent')
clf.fit(X_train_finite, y_train)
print("Accuracy score", clf.score(X_test_finite,y_test))

Accuracy score 0.6341463414634146


In [34]:
# Second model: logistic regression with the library's default settings.
from sklearn.linear_model import LogisticRegression

# fit() hands back the estimator itself, so construction and training can be
# chained; the bare name on the last line displays the fitted model.
clf1 = LogisticRegression().fit(X_train_finite, y_train)
clf1

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
# Score of the logistic-regression model on the held-out test split.
clf1.score(X_test_finite,y_test)

0.7926829268292683

In [38]:
# Learned weights: the recorded output has 12 values, matching the 12 columns
# of features_dummies (presumably in the same column order; confirm before
# interpreting individual weights).
clf1.coef_

array([[-0.03139201, -0.34603546, -0.03868183,  0.00192514,  1.04885915,
         0.23975168, -0.67915315,  1.57305613, -0.96359845,  0.39846019,
         0.10817238, -0.18893388]])

In [40]:
# Third model: random forest.

from sklearn.ensemble import RandomForestClassifier

# 500 trees, with random_state fixed so repeated runs build the same forest.
clf2 = RandomForestClassifier(n_estimators=500, random_state=0)
clf2.fit(X_train_finite, y_train)
print("accuracy : ",clf2.score(X_test_finite, y_test))

accuracy :  0.7774390243902439
