# CLASSIFICATION USING MULTIPLE MODELS

In [1]:
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score

In [2]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

In [3]:
# Load the pre-processed Titanic data from the local datasets folder.
titanic_df = pd.read_csv(filepath_or_buffer='./datasets/titanic_processed.csv')

In [4]:
# Preview the first five rows of the loaded frame.
titanic_df.head(n=5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked_C,Embarked_Q,Embarked_S
0,1,2,0,23.0,0,0,13.7917,1,0,0
1,0,1,0,25.0,1,2,151.55,0,0,1
2,0,3,1,22.0,0,0,7.7958,0,0,1
3,0,3,0,6.0,4,2,31.275,0,0,1
4,1,3,0,38.0,1,5,31.3875,0,0,1


In [5]:
# Every column after the first one is used as a feature
# (the first column, Survived, is the label and is skipped).
FEATURES = [column for column in titanic_df.columns[1:]]
FEATURES

['Pclass',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Fare',
 'Embarked_C',
 'Embarked_Q',
 'Embarked_S']

In [6]:
# Registry of evaluation results, keyed by a model label; filled in by the cells below.
result_dict = dict()

In [7]:
def summarize(y_test,y_pred):
    """Score a set of predictions against the true labels.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels for the same samples.

    Returns
    -------
    dict
        'Accuracy_Count' (accuracy_score with normalize=False, i.e. a count),
        'Accuracy' (accuracy_score with normalize=True, i.e. a fraction),
        'Precision' and 'Recall' (computed with scikit-learn's default settings).
    """
    fraction_correct = accuracy_score(y_test, y_pred, normalize=True)
    count_correct = accuracy_score(y_test, y_pred, normalize=False)

    precision = precision_score(y_test, y_pred)
    sensitivity = recall_score(y_test, y_pred)

    metrics = {}
    metrics['Accuracy_Count'] = count_correct
    metrics['Accuracy'] = fraction_correct
    metrics['Precision'] = precision
    metrics['Recall'] = sensitivity
    return metrics

In [8]:
#write build model helper function

def build_model(classifier_fn,
               name_of_y_col,
               name_of_x_cols,
               dataset,
               test_fract=0.2,
               random_state=None):
    """Split the data, train a classifier and summarize its performance.

    Parameters
    ----------
    classifier_fn : callable
        Called as ``classifier_fn(x_train, y_train)``; must return a fitted
        model exposing ``predict``.
    name_of_y_col : str
        Name of the label column in ``dataset``.
    name_of_x_cols : list of str
        Names of the feature columns in ``dataset``.
    dataset : pandas.DataFrame
        Frame holding both the features and the label.
    test_fract : float, default 0.2
        Passed to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Seed forwarded to ``train_test_split``. The default (None) keeps the
        original behavior of a different random split on every call; pass a
        fixed integer to make the split reproducible and to compare several
        models on exactly the same train/test rows.

    Returns
    -------
    dict
        'Training' and 'Test' hold the ``summarize`` output for each split;
        'Confusion_Matrix' is a crosstab with predicted labels as rows and
        true test labels as columns.
    """
    X = dataset[name_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_fract, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    # Predict on both splits so that over-fitting shows up as a train/test gap.
    y_pred_train = model.predict(x_train)
    y_pred = model.predict(x_test)

    train_summary = summarize(y_train, y_pred_train)
    test_summary = summarize(y_test, y_pred)

    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})

    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)

    return {'Training': train_summary,
            'Test': test_summary,
            'Confusion_Matrix': model_crosstab}
    
    

In [9]:
def compare_results():
    """Print the training and test scores of every entry in ``result_dict``.

    For each stored model the label is printed first, followed by a
    'Training Data' section and a 'Test Data' section listing each score
    name with its value, and finally a blank line.
    """
    sections = (('Training Data', 'Training'), ('Test Data', 'Test'))

    for model_label, report in result_dict.items():
        print('Classification :', model_label)

        for heading, section_key in sections:
            print()
            print(heading)
            for metric_name, metric_value in report[section_key].items():
                print(metric_name, metric_value)

        print()

# LOGISTIC REGRESSION

In [10]:
def logistic_fn(x_train,y_train):
    """Fit a LogisticRegression (liblinear solver) on the training data and return it."""
    classifier = LogisticRegression(solver='liblinear')
    # fit() returns the fitted estimator itself.
    return classifier.fit(x_train, y_train)

In [11]:
# Train and evaluate logistic regression on all features, then print every result so far.
result_dict['survived-logistic'] = build_model(
    classifier_fn=logistic_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387



# LDA

In [12]:
# Uses 'svd' (Singular Value Decomposition), which is the default solver.
# SVD finds the best axes without calculating the covariance matrix of the features.
# Avoiding the covariance matrix is extremely useful when we have many features.


def linear_discriminant_fn(x_train,y_train,solver='svd'):
    """Fit a LinearDiscriminantAnalysis model with the chosen solver and return it."""
    lda = LinearDiscriminantAnalysis(solver=solver)
    # fit() returns the fitted estimator itself.
    return lda.fit(x_train, y_train)

In [13]:
# Train and evaluate LDA on all features, then print every result so far.
result_dict['survived-LDA'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387

Classification : survived-LDA

Training Data
Accuracy_Count 452
Accuracy 0.7943760984182777
Precision 0.7808219178082192
Recall 0.7125

Test Data
Accuracy_Count 116
Accuracy 0.8111888111888111
Precision 0.7058823529411765
Recall 0.75





##### ("Variables are collinear.") Warning :

    The input features contain one-hot encoded values.
    For many machine learning models, if you include all of the one-hot encoded features in your training data, you will encounter something called the dummy variable trap.
    
    
    This occurs when there is perfect collinearity between two or more features in your training dataset, and this dummy trap can result in poor ML models.
    
    
    The way to fix this is to use dummy encoding of our categorical variables instead of one-hot encoding. This can be done very easily by dropping one of the columns from our one-hot encoded set, and that's exactly what we'll do here.
     

In [14]:
# Instead of using all of our training features, drop the last of the
# one-hot encoded columns before training LDA again.
result_dict['survived-LDA'] = build_model(
    classifier_fn=linear_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387

Classification : survived-LDA

Training Data
Accuracy_Count 453
Accuracy 0.7961335676625659
Precision 0.7666666666666667
Recall 0.706140350877193

Test Data
Accuracy_Count 112
Accuracy 0.7832167832167832
Precision 0.7636363636363637
Recall 0.7



The warning is gone.

Certain estimators take care of this automatically; the LogisticRegression estimator API is one of them.

# QDA

Find the best axes to separate the classes such that all instances of a class are in the same quadrant, but the decision boundary is quadratic, i.e. it is not necessarily a straight line (as in LDA) and could be a curve.

Useful when the X variables corresponding to different labels have different covariances, i.e. the covariance of X is different for each value of Y.

If you analyze your data and find that this is true, then QDA will work fine.

In [15]:
def quadratic_discriminant_fn(x_train,y_train):
    """Fit a QuadraticDiscriminantAnalysis model with default settings and return it."""
    qda = QuadraticDiscriminantAnalysis()
    # fit() returns the fitted estimator itself.
    return qda.fit(x_train, y_train)

This is also prone to the dummy trap, so we should not pass in all of the one-hot encoded features.

In [16]:
# Train and evaluate QDA without the last one-hot encoded column, then print every result so far.
result_dict['survived-QDA'] = build_model(
    classifier_fn=quadratic_discriminant_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES[:-1],
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387

Classification : survived-LDA

Training Data
Accuracy_Count 453
Accuracy 0.7961335676625659
Precision 0.7666666666666667
Recall 0.706140350877193

Test Data
Accuracy_Count 112
Accuracy 0.7832167832167832
Precision 0.7636363636363637
Recall 0.7

Classification : survived-QDA

Training Data
Accuracy_Count 454
Accuracy 0.7978910369068541
Precision 0.757847533632287
Recall 0.7347826086956522

Test Data
Accuracy_Count 110
Accuracy 0.7692307692307693
Precision 0.7272727272727273
Recall 0.6896551724137931



QDA performs a bit better than LDA on the training data, even though the difference is not significant (on the test data shown above it is actually slightly lower).

### SGD Classifier

In [33]:
def sgd_fn(x_train,y_train,max_iter = 10000,tol=1e-3):
    """Fit an SGDClassifier with the given iteration cap and tolerance and return it."""
    sgd = SGDClassifier(max_iter=max_iter, tol=tol)
    # fit() returns the fitted estimator itself.
    return sgd.fit(x_train, y_train)

max_iter = the maximum number of iterations (passes over the training data) for which the model should train.
tol = tolerance, the stopping criterion for training. The model will stop training if the improvement in the loss at a particular
iteration is less than the tolerance value that we have specified. When the improvement falls below the tol value, the model isn't improving any more.


In [34]:
# Train and evaluate the SGD classifier on all features, then print every result so far.
result_dict['survived-SGD'] = build_model(
    classifier_fn=sgd_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387

Classification : survived-LDA

Training Data
Accuracy_Count 453
Accuracy 0.7961335676625659
Precision 0.7666666666666667
Recall 0.706140350877193

Test Data
Accuracy_Count 112
Accuracy 0.7832167832167832
Precision 0.7636363636363637
Recall 0.7

Classification : survived-QDA

Training Data
Accuracy_Count 454
Accuracy 0.7978910369068541
Precision 0.757847533632287
Recall 0.7347826086956522

Test Data
Accuracy_Count 110
Accuracy 0.7692307692307693
Precision 0.7272727272727273
Recall 0.6896551724137931

Classification : survived-SGD

Training Data
Accuracy_Count 424
Accuracy 0.7451669595782073
Precision 0.6605839416058394
Recall 0.776824034334764

Test Data
Accuracy_Count 116
Accuracy 0.8111888111888111
Precision 0.7333333333333

Fairly low accuracy at first.

After increasing the number of iterations (max_iter), we got 81% accuracy on the test data, which is good.

### Linear SVC

In [35]:
def linear_svc_fn(x_train,y_train,c=1.0,max_iter =100, tol =1e-3):
    """Fit a LinearSVC (dual=False) with the given C, iteration cap and tolerance and return it."""
    svc = LinearSVC(C=c, max_iter=max_iter, tol=tol, dual=False)
    # fit() returns the fitted estimator itself.
    return svc.fit(x_train, y_train)

In [36]:
# Train and evaluate the linear SVC on all features, then print every result so far.
result_dict['survived-LinearSVC'] = build_model(
    classifier_fn=linear_svc_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387

Classification : survived-LDA

Training Data
Accuracy_Count 453
Accuracy 0.7961335676625659
Precision 0.7666666666666667
Recall 0.706140350877193

Test Data
Accuracy_Count 112
Accuracy 0.7832167832167832
Precision 0.7636363636363637
Recall 0.7

Classification : survived-QDA

Training Data
Accuracy_Count 454
Accuracy 0.7978910369068541
Precision 0.757847533632287
Recall 0.7347826086956522

Test Data
Accuracy_Count 110
Accuracy 0.7692307692307693
Precision 0.7272727272727273
Recall 0.6896551724137931

Classification : survived-SGD

Training Data
Accuracy_Count 424
Accuracy 0.7451669595782073
Precision 0.6605839416058394
Recall 0.776824034334764

Test Data
Accuracy_Count 116
Accuracy 0.8111888111888111
Precision 0.7333333333333

### Radius Neighbour

In [37]:
def radius_neighbour_fn(x_train,y_train,radius=40.0):
    """Fit a RadiusNeighborsClassifier with the given radius and return it."""
    neighbours = RadiusNeighborsClassifier(radius=radius)
    # fit() returns the fitted estimator itself.
    return neighbours.fit(x_train, y_train)

In [38]:
# Train and evaluate the radius-neighbours classifier on all features, then print every result so far.
result_dict['survived-Radius-Neighbour'] = build_model(
    classifier_fn=radius_neighbour_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387

Classification : survived-LDA

Training Data
Accuracy_Count 453
Accuracy 0.7961335676625659
Precision 0.7666666666666667
Recall 0.706140350877193

Test Data
Accuracy_Count 112
Accuracy 0.7832167832167832
Precision 0.7636363636363637
Recall 0.7

Classification : survived-QDA

Training Data
Accuracy_Count 454
Accuracy 0.7978910369068541
Precision 0.757847533632287
Recall 0.7347826086956522

Test Data
Accuracy_Count 110
Accuracy 0.7692307692307693
Precision 0.7272727272727273
Recall 0.6896551724137931

Classification : survived-SGD

Training Data
Accuracy_Count 424
Accuracy 0.7451669595782073
Precision 0.6605839416058394
Recall 0.776824034334764

Test Data
Accuracy_Count 116
Accuracy 0.8111888111888111
Precision 0.7333333333333

Radius neighbours didn't do well on this dataset. The accuracy is 67%, and it's possible that the value of radius that we used is not really right.

Which values of these hyperparameters work well for a particular ML model on a given dataset is something that we only know by evaluating a number of different models.

This process is referred to as hyperparameter tuning.

### Decision Trees

In [41]:
  def decision_tree_fn(x_train,y_train,max_depth=None,max_features=None):
      """Fit a DecisionTreeClassifier with the given depth/feature limits and return it."""
      tree = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
      # fit() returns the fitted estimator itself.
      return tree.fit(x_train, y_train)

In [42]:
# Train and evaluate an unconstrained decision tree on all features, then print every result so far.
result_dict['survived-DecisionTress'] = build_model(
    classifier_fn=decision_tree_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387

Classification : survived-LDA

Training Data
Accuracy_Count 453
Accuracy 0.7961335676625659
Precision 0.7666666666666667
Recall 0.706140350877193

Test Data
Accuracy_Count 112
Accuracy 0.7832167832167832
Precision 0.7636363636363637
Recall 0.7

Classification : survived-QDA

Training Data
Accuracy_Count 454
Accuracy 0.7978910369068541
Precision 0.757847533632287
Recall 0.7347826086956522

Test Data
Accuracy_Count 110
Accuracy 0.7692307692307693
Precision 0.7272727272727273
Recall 0.6896551724137931

Classification : survived-SGD

Training Data
Accuracy_Count 424
Accuracy 0.7451669595782073
Precision 0.6605839416058394
Recall 0.776824034334764

Test Data
Accuracy_Count 116
Accuracy 0.8111888111888111
Precision 0.7333333333333

Note that it performs exceedingly well during training. Observe that the training accuracy is 98%
and the test accuracy is just 73%.

Decision trees can be extremely complex if you don't specify constraints on the size and shape of the tree. It is very possible for this model to overfit the training data, such that the resulting model performs very well during training but poorly on real data.

Even though it gives us 73% accuracy, it's clearly overfitted and not a good model.

### Naive Bayes

In [43]:
def naive_bayes_fn(x_train,y_train,priors=None):
    """Fit a GaussianNB model with the given class priors and return it."""
    bayes = GaussianNB(priors=priors)
    # fit() returns the fitted estimator itself.
    return bayes.fit(x_train, y_train)

In [44]:
# Train and evaluate Gaussian naive Bayes on all features, then print every result so far.
result_dict['survived-NaiveBayes'] = build_model(
    classifier_fn=naive_bayes_fn,
    name_of_y_col='Survived',
    name_of_x_cols=FEATURES,
    dataset=titanic_df,
)
compare_results()

Classification : survived-logistic

Training Data
Accuracy_Count 463
Accuracy 0.8137082601054482
Precision 0.8061224489795918
Recall 0.6991150442477876

Test Data
Accuracy_Count 107
Accuracy 0.7482517482517482
Precision 0.7096774193548387
Recall 0.7096774193548387

Classification : survived-LDA

Training Data
Accuracy_Count 453
Accuracy 0.7961335676625659
Precision 0.7666666666666667
Recall 0.706140350877193

Test Data
Accuracy_Count 112
Accuracy 0.7832167832167832
Precision 0.7636363636363637
Recall 0.7

Classification : survived-QDA

Training Data
Accuracy_Count 454
Accuracy 0.7978910369068541
Precision 0.757847533632287
Recall 0.7347826086956522

Test Data
Accuracy_Count 110
Accuracy 0.7692307692307693
Precision 0.7272727272727273
Recall 0.6896551724137931

Classification : survived-SGD

Training Data
Accuracy_Count 424
Accuracy 0.7451669595782073
Precision 0.6605839416058394
Recall 0.776824034334764

Test Data
Accuracy_Count 116
Accuracy 0.8111888111888111
Precision 0.7333333333333