In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("C:\\Users\\user\\Titanic\\train.csv")
test =  pd.read_csv("C:\\Users\\user\\Titanic\\test.csv")

In [3]:
# store it as Y
Y_train = train["Survived"]
train.drop(["Survived"], axis=1, inplace=True)

In [4]:
num_train = len(train)
all_data = pd.concat([train, test])

In [5]:
all_data.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,1309.0,1309.0,1046.0,1309.0,1309.0,1308.0
mean,655.0,2.294882,29.881138,0.498854,0.385027,33.295479
std,378.020061,0.837836,14.413493,1.041658,0.86556,51.758668
min,1.0,1.0,0.17,0.0,0.0,0.0
25%,328.0,2.0,21.0,0.0,0.0,7.8958
50%,655.0,3.0,28.0,0.0,0.0,14.4542
75%,982.0,3.0,39.0,1.0,0.0,31.275
max,1309.0,3.0,80.0,8.0,9.0,512.3292


In [6]:
all_data.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,1309,1309,1309,295,1307
unique,1307,2,929,186,3
top,"Kelly, Mr. James",male,CA. 2343,C23 C25 C27,S
freq,2,843,11,6,914


In [7]:
# Populating null fare value with median of train set
all_data["Fare"]=all_data["Fare"].fillna(train["Fare"].median())

In [8]:
# Populating null age value with median of train set
all_data["Age"]=all_data["Age"].fillna(train["Age"].median())

In [9]:
# Populating missing embarked with most frequent value - S
all_data["Embarked"]=all_data["Embarked"].fillna("S")

In [10]:
# Drop cabin due to too many null values
all_data.drop(["Cabin"], axis=1, inplace=True)

In [11]:
from sklearn import preprocessing 
#convert objects / non-numeric data types into numeric
for f in all_data.columns:
    if all_data[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(all_data[f].values)) 
        all_data[f] = lbl.transform(list(all_data[f].values))

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn import tree

In [13]:
X_train = all_data[:num_train]
X_test = all_data[num_train:]

In [14]:
X_train, X_cv, y_train, y_cv = train_test_split( X_train, Y_train, test_size = 0.2, random_state = 100)

In [15]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=5,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=100, splitter='best')

In [16]:
# Graphviz is used to build decision trees
from sklearn.tree import export_graphviz
from sklearn import tree

In [17]:
# This statement builds a dot file.
cols = list(X_train.columns.values)
tree.export_graphviz(clf_gini, out_file='treewithname.dot',feature_names  = cols)  

In [18]:
y_pred = clf_gini.predict(X_cv)

In [19]:
def score_in_percent (a,b):
    return (sum(a==b)*100)/len(a)

In [20]:
score_in_percent(y_pred,y_cv)

79.888268156424587

In [21]:
# Let's do little tweak, and drop few features
X_train.drop(["Name","Ticket","PassengerId"], axis=1, inplace=True)
X_cv.drop(["Name","Ticket","PassengerId"], axis=1, inplace=True)
X_test.drop(["Name","Ticket","PassengerId"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [22]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
y_pred = clf_gini.predict(X_cv)
score_in_percent(y_pred,y_cv)

81.005586592178773

In [23]:
# This statement builds a dot file.
cols = list(X_train.columns.values)
tree.export_graphviz(clf_gini, out_file='treewithlargetrain.dot',feature_names  = cols) 

In [24]:
y_test_pred = clf_gini.predict(X_test)

In [25]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_test_pred
    })
submission.to_csv('submission_with_3depth_larger_train.csv', index=False) # LB : 0.78469

In [26]:
# Let's do some feature engineering now and add few features
# Adding total family size and droppign parch and sibsp.
X_train['FamSi'] = X_train["SibSp"] + X_train["Parch"] 
X_cv['FamSi']    = X_cv["SibSp"] + X_cv["Parch"] 
X_test['FamSi']    = X_test["SibSp"] + X_test["Parch"] 
X_train.drop(["SibSp","Parch"], axis=1, inplace=True)
X_cv.drop(["SibSp","Parch"], axis=1, inplace=True)
X_test.drop(["SibSp","Parch"], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [34]:
clf_gini = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=3, min_samples_leaf=5)
clf_gini.fit(X_train, y_train)
y_pred = clf_gini.predict(X_cv)
score_in_percent(y_pred,y_cv)

81.005586592178773

In [28]:
# This statement builds a dot file.
cols = list(X_train.columns.values)
tree.export_graphviz(clf_gini, out_file='treewithlargetrainfemsi.dot',feature_names  = cols) 

In [29]:
submission = pd.DataFrame({
        "PassengerId": test["PassengerId"],
        "Survived": y_test_pred
    })
submission.to_csv('submission_with_3depth_larger_train_famsi.csv', index=False) # LB : 0.74163

In [30]:
X_train.to_csv("X_train.csv", index=False)

In [31]:
X_subset = pd.concat([X_train, y_train],axis=1)

In [32]:
X_subset.to_csv("X_subset.csv", index=False)