In [80]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree
from sklearn import preprocessing
from sklearn.tree import export_graphviz
%matplotlib inline

In [81]:
def score_in_percent (a,b):
    return (sum(a==b)*100)/len(a)

In [82]:
train = pd.read_csv("C:\\Users\\user\\Titanic\\train.csv")
test =  pd.read_csv("C:\\Users\\user\\Titanic\\test.csv")

In [83]:
# store it as Y
Y_train = train["Survived"]
train.drop(["Survived"], axis=1, inplace=True)

In [84]:
num_train = len(train)
all_data = pd.concat([train, test])

In [85]:
all_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,3.0,80.0,8.0,9.0,512.3292


In [86]:
all_data.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,1309,1309,1309,295,1307
unique,1307,2,929,186,3
top,"Connolly, Miss. Kate",male,CA. 2343,C23 C25 C27,S
freq,2,843,11,6,914


In [87]:
# Populating null fare value with median of train set
all_data["Fare"]=all_data["Fare"].fillna(train["Fare"].median())
# Populating null age value with median of train set
all_data["Age"]=all_data["Age"].fillna(train["Age"].median())
# Populating missing embarked with most frequent value - S
all_data["Embarked"]=all_data["Embarked"].fillna("S")
# Creating new feature as Title
all_data['Title'] = all_data.Name.str.extract(' ([A-Za-z]+)\.', expand=False)
# Drop cabin due to too many null values
all_data.drop(["Cabin","Name","Ticket","PassengerId"], axis=1, inplace=True)
# Converting sex into binary
sex_mapping = {"male": 0, "female": 1}
all_data['Sex'] = all_data['Sex'].map(sex_mapping)

In [88]:
all_data = pd.get_dummies(all_data)

In [89]:
X_train = all_data[:num_train]
X_test = all_data[num_train:]

In [90]:
X_train, X_cv, y_train, y_cv = train_test_split( X_train, Y_train, test_size = 0.3, random_state = 100)

In [101]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=6, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
y_pred = clf_gini.predict(X_cv)
score_in_percent(y_pred,y_cv)

80.223880597014926

In [102]:
# This statement builds a dot file.
cols = list(X_train.columns.values)
tree.export_graphviz(clf_gini, out_file='treedummies.dot',feature_names  = cols) 

In [103]:
y_test_pred = clf_gini.predict(X_test)

In [104]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_test_pred
    })
submission.to_csv('treedummies.csv', index=False) # LB : 0.78469

In [105]:
# Let's do some feature engineering now and add few features
# Adding total family size and droppign parch and sibsp.
X_train['FamSi'] = X_train["SibSp"] + X_train["Parch"] 
X_cv['FamSi']    = X_cv["SibSp"] + X_cv["Parch"] 
X_test['FamSi']    = X_test["SibSp"] + X_test["Parch"] 
X_train.drop(["SibSp","Parch"], axis=1, inplace=True)
X_cv.drop(["SibSp","Parch"], axis=1, inplace=True)
X_test.drop(["SibSp","Parch"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [106]:
clf_gini = DecisionTreeClassifier(criterion = "entropy", random_state = 100,
                               max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
y_pred = clf_gini.predict(X_cv)
score_in_percent(y_pred,y_cv)

80.97014925373135

In [107]:
# This statement builds a dot file.
cols = list(X_train.columns.values)
tree.export_graphviz(clf_gini, out_file='treewithlargetrainfemsi.dot',feature_names  = cols) 

In [108]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_test_pred
    })
submission.to_csv('submission_with_3depth_larger_train_famsi.csv', index=False) # LB : 0.74163

In [34]:
X_train.to_csv("X_train.csv", index=False)

In [35]:
X_subset = pd.concat([X_train, y_train],axis=1)

In [36]:
X_subset.to_csv("X_subset.csv", index=False)

In [109]:
# Decision tree tuning
for crtr in ['gini','entropy']:
    for md in [3,4,5,6,7,8,9,10]:
        for spltr in ['best','random']:
            for mss in [6,10,16,26,42]:
                for msl in [6,10,16,26,42]:
                    dts = DecisionTreeClassifier(class_weight=None, criterion=crtr, max_depth=md,
                                max_features=None, max_leaf_nodes=None, min_samples_leaf=msl,
                                min_samples_split=mss, min_weight_fraction_leaf=0.0,
                                presort=False, random_state=100, splitter=spltr)
                    dts.fit(X_train, y_train)
                    y_pred = dts.predict(X_cv)
                    sip=score_in_percent(y_pred,y_cv)
                    print("score for {} criterion, {} max_depth, {} splitter, {} min_samples_split, {} min_samples_leaf is {}".format(crtr,md,spltr,mss,msl,sip))

score for gini criterion, 3 max_depth, best splitter, 6 min_samples_split, 6 min_samples_leaf is 80.59701492537313
score for gini criterion, 3 max_depth, best splitter, 6 min_samples_split, 10 min_samples_leaf is 79.4776119402985
score for gini criterion, 3 max_depth, best splitter, 6 min_samples_split, 16 min_samples_leaf is 79.4776119402985
score for gini criterion, 3 max_depth, best splitter, 6 min_samples_split, 26 min_samples_leaf is 78.35820895522389
score for gini criterion, 3 max_depth, best splitter, 6 min_samples_split, 42 min_samples_leaf is 78.35820895522389
score for gini criterion, 3 max_depth, best splitter, 10 min_samples_split, 6 min_samples_leaf is 80.59701492537313
score for gini criterion, 3 max_depth, best splitter, 10 min_samples_split, 10 min_samples_leaf is 79.4776119402985
score for gini criterion, 3 max_depth, best splitter, 10 min_samples_split, 16 min_samples_leaf is 79.4776119402985
score for gini criterion, 3 max_depth, best splitter, 10 min_samples_split,

In [113]:
'''
score for gini criterion, 3 max_depth, random splitter, 6 min_samples_split, 6 min_samples_leaf is 82.08955223880596
score for gini criterion, 4 max_depth, random splitter, 6 min_samples_split, 6 min_samples_leaf is 82.46268656716418
score for entropy criterion, 7 max_depth, best splitter, 6 min_samples_split, 6 min_samples_leaf is 83.58208955223881
score for entropy criterion, 8 max_depth, best splitter, 6 min_samples_split, 6 min_samples_leaf is 83.95522388059702
score for entropy criterion, 9 max_depth, best splitter, 6 min_samples_split, 6 min_samples_leaf is 83.2089552238806
'''

'\nscore for gini criterion, 3 max_depth, random splitter, 6 min_samples_split, 6 min_samples_leaf is 82.08955223880596\nscore for gini criterion, 4 max_depth, random splitter, 6 min_samples_split, 6 min_samples_leaf is 82.46268656716418\nscore for entropy criterion, 7 max_depth, best splitter, 6 min_samples_split, 6 min_samples_leaf is 83.58208955223881\nscore for entropy criterion, 8 max_depth, best splitter, 6 min_samples_split, 6 min_samples_leaf is 83.95522388059702\nscore for entropy criterion, 9 max_depth, best splitter, 6 min_samples_split, 6 min_samples_leaf is 83.2089552238806\n'

In [117]:
clf_tuned = DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=4,
                                max_features=None, max_leaf_nodes=None, min_samples_leaf=6,
                                min_samples_split=6, min_weight_fraction_leaf=0.0,
                                presort=False, random_state=100, splitter='random')
clf_tuned.fit(X_train, y_train)
y_pred = clf_tuned.predict(X_cv)
y_test_pred = clf_tuned.predict(X_test)
score_in_percent(y_pred,y_cv)

82.462686567164184

In [118]:
# This statement builds a dot file.
cols = list(X_train.columns.values)
tree.export_graphviz(clf_tuned, out_file='tunedtreewithdummies.dot',feature_names  = cols) 

In [119]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_test_pred
    })
submission.to_csv('tunedtreewithdummies.csv', index=False) # LB : 0.74163