## Example to using scikit learn library

In [19]:
# Import the Numpy library
import numpy as np
import pandas as pd

In [55]:
train_path = "../data/train.csv"
train = pd.read_csv(train_path)

test_path = "../data/test.csv"
test = pd.read_csv(test_path)

train['Age'] = train['Age'].fillna(-1)
##train['Age'][train['Age'] > 0]
##train[train['Age'] == -1]

## How many people in your training set survived the disaster with the Titanic? 

In [56]:
# Passengers that survived vs passengers that passed away
print(train['Survived'].value_counts())


# As proportions
print(train['Survived'].value_counts(normalize=True))

# Males that survived vs males that passed away
print(train['Survived'][train['Sex'] == 'male'].value_counts())

# Females that survived vs Females that passed away
print(train['Survived'][train['Sex'] == 'female'].value_counts())

# Normalized male survival
print(train['Survived'][train['Sex'] == 'male'].value_counts(normalize=True))

# Normalized female survival
print(train['Survived'][train['Sex'] == 'female'].value_counts(normalize=True))


0    549
1    342
dtype: int64
0    0.616162
1    0.383838
dtype: float64
0    468
1    109
dtype: int64
1    233
0     81
dtype: int64
0    0.811092
1    0.188908
dtype: float64
1    0.742038
0    0.257962
dtype: float64


## Does age play a role?

In [57]:
# Create the column Child and assign to 'NaN'
train["Child"] = float('NaN')

# Assign 1 to passengers under 18, 0 to those 18 or older. Print the new column.
train['Child'][train['Age']<18] = 1
train['Child'][train['Age']>=18] = 0
print(train["Child"])


# Print normalized Survival Rates for passengers under 18
print(train["Survived"][train["Child"] == 1].value_counts(normalize = True))

# Print normalized Survival Rates for passengers 18 or older
print(train["Survived"][train["Child"] == 0].value_counts(normalize = True))


0     0
1     0
2     0
3     0
4     0
5     1
6     0
7     1
8     0
9     1
10    1
11    0
12    0
13    0
14    1
...
876    0
877    0
878    1
879    0
880    0
881    0
882    0
883    0
884    0
885    0
886    0
887    0
888    1
889    0
890    0
Name: Child, Length: 891, dtype: float64
0    0.610345
1    0.389655
dtype: float64
0    0.618968
1    0.381032
dtype: float64


## First Prediction

In [58]:
# Create a copy of test: test_one
test_one = test

# Initialize a Survived column to 0
test_one['Survived'] = 0

# Set Survived to 1 if Sex equals "female" and print the `Survived` column from `test_one`
test_one['Survived'][test['Sex'] == 'female'] = 1
test_one['Survived'][test['Sex'] == 'male'] = 0
print(test_one['Survived'])

0     0
1     1
2     0
3     0
4     1
5     0
6     1
7     0
8     1
9     0
10    0
11    0
12    1
13    0
14    1
...
403    0
404    0
405    0
406    0
407    0
408    1
409    1
410    1
411    1
412    1
413    0
414    1
415    0
416    0
417    0
Name: Survived, Length: 418, dtype: int64


## Intro to decision trees

In [59]:
# Import the Numpy library
import numpy as np
# Import 'tree' from scikit-learn library
from sklearn import tree

## Cleaning and Formatting your Data

In [60]:
# Convert the male and female groups to integer form
train["Sex"][train["Sex"] == "male"] = 0
train["Sex"][train["Sex"] == "female"] = 1

# Impute the Embarked variable
train["Embarked"] = train["Embarked"].fillna('S')

# Convert the Embarked classes to integer form
train["Embarked"][train["Embarked"] == "S"] = 0
train["Embarked"][train["Embarked"] == "C"] = 1
train["Embarked"][train["Embarked"] == "Q"] = 2

#Print the Sex and Embarked columns
print(train['Sex'])
print(train['Embarked'])


0     0
1     1
2     1
3     1
4     0
5     0
6     0
7     0
8     1
9     1
10    1
11    1
12    0
13    0
14    1
...
876    0
877    0
878    0
879    1
880    1
881    0
882    1
883    0
884    0
885    1
886    0
887    1
888    1
889    0
890    0
Name: Sex, Length: 891, dtype: object
0     0
1     1
2     0
3     0
4     0
5     2
6     0
7     0
8     0
9     1
10    0
11    0
12    0
13    0
14    0
...
876    0
877    0
878    0
879    1
880    0
881    0
882    0
883    0
884    0
885    2
886    0
887    0
888    0
889    1
890    2
Name: Embarked, Length: 891, dtype: object


## Creating first decision tree

In [63]:
# Print the train data to see the available features
##print(train)

# Create the target and features numpy arrays: target, features_one
target = train['Survived'].values
features_one = train[["Pclass", "Sex", "Age", "Fare"]].values

# Fit your first decision tree: my_tree_one
my_tree_one = tree.DecisionTreeClassifier()
my_tree_one = my_tree_one.fit(features_one, target)

# Look at the importance and score of the included features
print(my_tree_one.feature_importances_)
print(my_tree_one.score(features_one, target))

[ 0.10826092  0.31117584  0.25403794  0.3265253 ]
0.979797979798


## Predict and submit to Kaggle