In [41]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import  QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier


In [3]:
# Load the preprocessed Titanic table; the path is relative to the notebook's
# working directory.
titanic_df = pd.read_csv('data/titanic/processed.csv')

# Preview the first four rows as this cell's displayed output.
titanic_df.head(4)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,0,2,1,27.0,0,0,26.0,0,0,1
1,1,2,0,30.0,3,0,21.0,0,0,1
2,0,2,1,23.0,0,0,13.0,0,0,1
3,1,3,0,4.0,0,2,22.025,0,0,1


In [4]:
# Model inputs: every column of the frame except the first one.
# NOTE(review): the Index displayed below starts at 'Pclass', so the skipped
# first column is presumably the 'Survived' target — confirm against the CSV.
FEATURES = titanic_df.columns[1:]

# Display the selected column labels as the cell output.
FEATURES

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked_C',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [5]:
def summ_clf(y_test, y_pred):
    """Collect accuracy, precision and recall for one set of predictions.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels, compared against ``y_test``.

    Returns
    -------
    dict with the keys ``'accuracy'`` (``accuracy_score`` with
    ``normalize=True``), ``'precision'``, ``'recall'`` and
    ``'accuracy_count'`` (``accuracy_score`` with ``normalize=False``).
    Precision and recall are computed with the scorers' default settings.
    """
    return {
        'accuracy': accuracy_score(y_test, y_pred, normalize=True),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'accuracy_count': accuracy_score(y_test, y_pred, normalize=False),
    }

In [6]:
def build_model(classifier_fn,
                name_of_y_col,
                name_of_x_cols,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Split the data, fit one classifier and summarise its performance.

    Parameters
    ----------
    classifier_fn : callable ``(x_train, y_train) -> fitted model``; the
        returned object must expose ``predict``.
    name_of_y_col : label of the target column in ``dataset``.
    name_of_x_cols : column labels used as model inputs.
    dataset : pandas DataFrame holding both the inputs and the target.
    test_frac : passed to ``train_test_split`` as ``test_size``.
    random_state : passed to ``train_test_split`` so the split (and therefore
        the reported metrics) can be reproduced. The default ``None`` keeps
        the previous behaviour of a different random split on every call.

    Returns
    -------
    dict with ``'training'`` and ``'test'`` metric summaries (see
    ``summ_clf``) and ``'confusion_matrix'``, a crosstab with predicted
    labels as rows and actual labels as columns, computed on the test split.
    """
    X = dataset[name_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    # Score both splits so over-fitting shows up as a train/test gap.
    y_pred = model.predict(x_test)
    y_pred_train = model.predict(x_train)

    train_summary = summ_clf(y_train, y_pred_train)
    test_summary = summ_clf(y_test, y_pred)

    # The frame takes its index from y_test; y_pred is placed positionally.
    pred_res = pd.DataFrame({"pred": y_pred, "actual": y_test})

    # Confusion matrix: rows are predictions, columns are ground truth.
    model_crosstab = pd.crosstab(pred_res.pred, pred_res.actual)

    return {"training": train_summary,
            "test": test_summary,
            "confusion_matrix": model_crosstab}
    

In [7]:
# Shared registry of evaluation summaries: the model cells store one
# build_model() result per label here, and compare_results() reads it back.
result_dict ={}

In [8]:
def compare_results():
    """Print the training and test metrics of every entry in ``result_dict``.

    For each stored model the label is printed first, followed by a
    'Training data' section and a 'Test data' section listing one
    ``metric value`` pair per line. Nothing is returned.
    """
    sections = (('Training data', 'training'), ('Test data', 'test'))

    for label, summary in result_dict.items():
        print('classification:', label)

        for heading, split in sections:
            print()
            print(heading)
            for metric, value in summary[split].items():
                print(metric, value)

        print()
            

In [9]:
def logistic_fn(x_train, y_train):
    """Return a LogisticRegression (``solver='liblinear'``) fitted on the training split."""
    return LogisticRegression(solver='liblinear').fit(x_train, y_train)

In [10]:
# Logistic-regression run on every feature column; metrics are stored under
# its own label in the shared registry.
result_dict['survived - logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

# Print the metrics gathered so far.
compare_results()

classification: survived - logistic

Training data
accuracy 0.7908611599297012
precision 0.7799043062200957
recall 0.690677966101695
accuracy_count 450

Test data
accuracy 0.8181818181818182
precision 0.8095238095238095
recall 0.6538461538461539
accuracy_count 117



In [11]:
def linear_discriminant_fn(x_train, y_train, solver = 'svd'):
    """Return a LinearDiscriminantAnalysis model fitted on the training split.

    ``solver`` is forwarded unchanged to the estimator (default ``'svd'``).
    """
    lda = LinearDiscriminantAnalysis(solver=solver)
    return lda.fit(x_train, y_train)

In [12]:
# LDA run. FEATURES[0:-1] leaves out the last feature column ('Embarked_S' in
# the Index shown earlier) — presumably so the one-hot Embarked columns are
# not perfectly collinear; confirm the intent.
result_dict['survived - linear_discriminant_analysis'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[0:-1],
    dataset=titanic_df,
)

# Print the metrics gathered so far.
compare_results()

classification: survived - logistic

Training data
accuracy 0.7908611599297012
precision 0.7799043062200957
recall 0.690677966101695
accuracy_count 450

Test data
accuracy 0.8181818181818182
precision 0.8095238095238095
recall 0.6538461538461539
accuracy_count 117

classification: survived - linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7828282828282829
recall 0.6888888888888889
accuracy_count 456

Test data
accuracy 0.7762237762237763
precision 0.746031746031746
recall 0.746031746031746
accuracy_count 111



In [13]:
def quad_discriminant_fn(x_train, y_train):
    """Return a default-configured QuadraticDiscriminantAnalysis fitted on the training split."""
    return QuadraticDiscriminantAnalysis().fit(x_train, y_train)

# QDA run on the feature set without its last column (FEATURES[0:-1]).
result_dict['survived - quad_discriminant_analysis'] = build_model(
    classifier_fn=quad_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[0:-1],
    dataset=titanic_df,
)

# Print the metrics gathered so far.
compare_results()

classification: survived - logistic

Training data
accuracy 0.7908611599297012
precision 0.7799043062200957
recall 0.690677966101695
accuracy_count 450

Test data
accuracy 0.8181818181818182
precision 0.8095238095238095
recall 0.6538461538461539
accuracy_count 117

classification: survived - linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7828282828282829
recall 0.6888888888888889
accuracy_count 456

Test data
accuracy 0.7762237762237763
precision 0.746031746031746
recall 0.746031746031746
accuracy_count 111

classification: survived - quad_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7424892703862661
recall 0.7456896551724138
accuracy_count 450

Test data
accuracy 0.8391608391608392
precision 0.8113207547169812
recall 0.7678571428571429
accuracy_count 120



In [24]:
def sgd_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """Return an SGDClassifier fitted on the training split.

    ``max_iter`` and ``tol`` are forwarded unchanged to the estimator.
    """
    sgd = SGDClassifier(max_iter=max_iter, tol=tol)
    return sgd.fit(x_train, y_train)

In [35]:
# SGD classifier run on the feature set without its last column.
result_dict['survived - sgd'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[0:-1],
    dataset=titanic_df,
)


In [42]:
def linear_svc_fn(x_train, y_train, max_iter=10000, tol=1e-3):
    """Return a LinearSVC (``dual=False``) fitted on the training split.

    ``max_iter`` and ``tol`` are forwarded unchanged to the estimator.
    """
    settings = {'max_iter': max_iter, 'tol': tol, 'dual': False}
    svc = LinearSVC(**settings)
    svc.fit(x_train, y_train)
    return svc

# Linear SVC run on the feature set without its last column.
result_dict['survived - linear-svc'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[0:-1],
    dataset=titanic_df,
)


In [43]:
def radiusN_fn(x_train, y_train, radius=40.0):
    """Return a RadiusNeighborsClassifier fitted on the training split.

    ``radius`` is forwarded unchanged to the estimator (default ``40.0``).
    """
    return RadiusNeighborsClassifier(radius=radius).fit(x_train, y_train)

# Radius-neighbours run on the feature set without its last column.
result_dict['survived -radiusNN'] = build_model(
    classifier_fn=radiusN_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[0:-1],
    dataset=titanic_df,
)

In [44]:
def KNN_fn(x_train, y_train, k=5):
    """Return a KNeighborsClassifier fitted on the training split.

    ``k`` is passed to the estimator as ``n_neighbors`` (default ``5``).
    """
    knn = KNeighborsClassifier(n_neighbors=k)
    return knn.fit(x_train, y_train)

# k-nearest-neighbours run on the feature set without its last column.
result_dict['survived -KNN'] = build_model(
    classifier_fn=KNN_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[0:-1],
    dataset=titanic_df,
)

In [48]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_feature=None):
    """Return a DecisionTreeClassifier fitted on the training split.

    ``max_depth`` is forwarded as is; ``max_feature`` is passed to the
    estimator under its ``max_features`` keyword.
    """
    tree = DecisionTreeClassifier(max_depth=max_depth,
                                  max_features=max_feature)
    return tree.fit(x_train, y_train)

# Decision-tree run; this one uses every feature column.
result_dict['survived - decisionClf'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

In [51]:
def naive_bayes_fn(x_train, y_train, priors=None):
    """Return a GaussianNB model fitted on the training split.

    ``priors`` is forwarded unchanged to the estimator (default ``None``).
    """
    return GaussianNB(priors=priors).fit(x_train, y_train)

# Gaussian naive-Bayes run; this one uses every feature column.
result_dict['survived - GaussianNB'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)

In [52]:
# Final printout: training and test metrics for every model stored in result_dict.
compare_results()

classification: survived - logistic

Training data
accuracy 0.7908611599297012
precision 0.7799043062200957
recall 0.690677966101695
accuracy_count 450

Test data
accuracy 0.8181818181818182
precision 0.8095238095238095
recall 0.6538461538461539
accuracy_count 117

classification: survived - linear_discriminant_analysis

Training data
accuracy 0.8014059753954306
precision 0.7828282828282829
recall 0.6888888888888889
accuracy_count 456

Test data
accuracy 0.7762237762237763
precision 0.746031746031746
recall 0.746031746031746
accuracy_count 111

classification: survived - quad_discriminant_analysis

Training data
accuracy 0.7908611599297012
precision 0.7424892703862661
recall 0.7456896551724138
accuracy_count 450

Test data
accuracy 0.8391608391608392
precision 0.8113207547169812
recall 0.7678571428571429
accuracy_count 120

classification: survived - sgd

Training data
accuracy 0.7398945518453427
precision 0.6433566433566433
recall 0.8
accuracy_count 421

Test data
accuracy 0.762237762