Import Libraries
--------------------

In [0]:
import numpy as np 
import pandas as pd 
import sklearn 
from sklearn import preprocessing 
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from sklearn.metrics import accuracy_score,classification_report

Read Dataset
======

In [0]:
titanic_df = pd.read_csv('train (2).csv')
titanic_df.columns
#titanic_df.describe()

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

--------------------
Pre-Processing and conversion of data to numeric format
-------------

In [0]:
def data_preprocessing(df):
    #titanic.drop(['PassengerId', 'Name', 'Ticket', 'Cabin','Embarked'], 1, inplace=True)
    #print(titanic.tail())
    df = df.loc[:,['Survived','Pclass','Sex','Age','SibSp','Parch','Fare']]
    df.convert_objects(convert_numeric=True)
    df.fillna(0, inplace=True) 
    return df

In [0]:
def handle_non_numeric_data(df):
    columns = df.columns.values
    
    for column in columns:
        text_digit_vals = {}
        def convert_to_int(val):
            return text_digit_vals[val]
        
        if df[column].dtype != np.int64 and df[column].dtype != np.float64:
            column_contents = df[column].values.tolist()
            unique_elements = set(column_contents)
            
            x = 0            
            for unique in unique_elements:
                if unique not in text_digit_vals:
                    text_digit_vals[unique] = x
                    x+=1                    
            df[column] = list(map(convert_to_int, df[column]))
            
    return df

In [0]:
titanic_df1 = data_preprocessing(titanic_df)
print(titanic_df1.tail())

     Survived  Pclass     Sex   Age  SibSp  Parch   Fare
886         0       2    male  27.0      0      0  13.00
887         1       1  female  19.0      0      0  30.00
888         0       3  female   0.0      1      2  23.45
889         1       1    male  26.0      0      0  30.00
890         0       3    male  32.0      0      0   7.75


For all other conversions use the data-type specific converters pd.to_datetime, pd.to_timedelta and pd.to_numeric.
  """


In [0]:
titanic_df1 = handle_non_numeric_data(titanic_df1)
print(titanic_df1.tail())

     Survived  Pclass  Sex   Age  SibSp  Parch   Fare
886         0       2    0  27.0      0      0  13.00
887         1       1    1  19.0      0      0  30.00
888         0       3    1   0.0      1      2  23.45
889         1       1    0  26.0      0      0  30.00
890         0       3    0  32.0      0      0   7.75


In [0]:
X = np.array(titanic_df1.drop(['Survived'], 1).astype(float))
Y = np.array(titanic_df1['Survived'])

----------------
split dataset into train and test sets....
fit training data to Decision Tree Classifier
-----------

In [0]:
X_train, X_test, Y_train, Y_test = train_test_split( X, Y, test_size = 0.3, random_state = 100)
d_tree = DecisionTreeClassifier()
d_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

-----------------------
calculate accuracy
---------------

In [0]:
Y_pred = d_tree.predict(X_test)
print("Accuracy is ", accuracy_score(Y_test,Y_pred)*100)

Accuracy is  80.97014925373134


In [0]:
confusion = metrics.confusion_matrix(Y_test, Y_pred)

TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]
print("Accuracy calculation using confusion metrics : ", ((TP + TN) / float(TP + TN + FP + FN)))

Accuracy calculation using confusion metrics :  0.8097014925373134
