## Exploring a Data Set

* Looking for anomalies and data integrity problems
* Cleaning data
* Massaging data format to be model-ready
* Choosing features and a target
* Train/test split

In [123]:
from os.path import join
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [124]:
# Read the data
path = join('data', 'train.csv')
people = pd.read_csv(path)

# Drop unused column
people.drop('Name', axis=1, inplace=True)


In [125]:
people.describe(include=['int', 'int64', 'float', 'object'])

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,2,,,,681.0,,147,3
top,,,,male,,,,347082.0,,B96 B98,S
freq,,,,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,38.0,1.0,0.0,,31.0,,


In [133]:
# taking care of missing values 
# Method 1: drop entries with missing vaues, if the number of entries with missing values is about 1% of total number of entries
#   people_dummies = people_dummies.dropna()
# Method 2: use SimpleImputer() method to fill the missing values with the mean, mode or median

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
imputer.fit(people[:, 4:5])
people[:, 4:5] = imputer.transform(people[:, 4:5])

TypeError: '(slice(None, None, None), 'Age')' is an invalid key

In [114]:
# one-hot encoding
people_dummies = pd.get_dummies(people)

In [115]:
people_dummies.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Ticket_110152,...,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_C,Embarked_Q,Embarked_S
0,1,0,3,22.0,1,0,7.25,0,1,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,1,38.0,1,0,71.2833,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,1,3,26.0,0,0,7.925,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,1,35.0,1,0,53.1,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,3,35.0,0,0,8.05,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [116]:
len(people_dummies.values)

891

In [118]:
# separating the features and target
y = people_dummies['Survived']
X = people_dummies.drop('Survived', axis=1)

In [119]:
# train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

print("Training features/target:", X_train.shape, y_train.shape)
print("Testing features/target:", X_test.shape, y_test.shape)

Training features/target: (668, 839) (668,)
Testing features/target: (223, 839) (223,)


## Classification

* Choosing a model
* Feature importances
* Cut points in a decision tree
* Comparing multiple classifiers

In [120]:
# #feature importance
# from sklearn.tree import DecisionTreeClassifier

# tree = DecisionTreeClassifier(max_depth=7, random_state=0)
# tree.fit(X_train, y_train)
# tree.score(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float32').