In [1]:
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier


from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
%matplotlib inline

In [2]:
importeddata=pd.read_csv('train.csv')

In [3]:

# PREPROCESS DATA
def preprocessdata(rawdata):
    rawdata.loc[:,'Age']=rawdata[['Age']].fillna(rawdata[['Age']].mean())
    rawdata.loc[:,'Fare']=rawdata[['Fare']].fillna(rawdata[['Fare']].mean())
    rawdata=rawdata.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)
    rawdata=pd.get_dummies(rawdata,columns=['Pclass','Sex','Embarked','Age'])
    return rawdata

# Train Test Split
rawdata=preprocessdata(importeddata)
X_train, X_test, y_train, y_test = train_test_split(
    # features
    rawdata[[col for col in rawdata.columns if col != 'Survived']],
    # targets
    rawdata.Survived, test_size=0.4, random_state=0)



In [4]:
# Create a Models dictionary to house the parameter data for the different params
MODELS=[
    {
        'model': svm.SVC(),
        'params':{
            'C':[0.001,0.01,0.1,1],
            'kernel':['linear']},
         'name':'SVC'
              },
    
    {
        'model': linear_model.LogisticRegression(),
        'params':{
        'C':[0.001,0.01,0.1,1]},
        'name':'Logistic'
              },
    
    {
        'model': tree.DecisionTreeClassifier(),
        'params':{'criterion':['gini','entropy'],
        'max_depth':[None,3,10,25],},
        'name':'DecisionTree'
              },
    
        {
        'model': RandomForestClassifier(),
        'params':{'n_estimators':[10,50,100]},
        'name':'DecisionTree'
              },
 
            {
        'model': KNeighborsClassifier(),
        'params':{'n_neighbors':[3,5]},
        'name':'KNeighbors'
              },
    
]

In [5]:
def GridSearchModelling(X_train,y_train,X_test,y_test):
    for model in MODELS:
        gsmodel=GridSearchCV(estimator=model['model'],n_jobs=1,cv=10,param_grid=model['params'],verbose=1)
        gsmodel.fit(X_train,y_train)
        print(model['name'],' Score:',gsmodel.score(X_test,y_test))
        print(gsmodel.best_params_)

In [6]:
GridSearchModelling(X_train, y_train, X_test, y_test)

Fitting 10 folds for each of 4 candidates, totalling 40 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:   32.2s finished


SVC  Score: 0.7787114845938375
{'C': 1, 'kernel': 'linear'}
Fitting 10 folds for each of 4 candidates, totalling 40 fits
Logistic  Score: 0.7955182072829131
{'C': 0.1}
Fitting 10 folds for each of 8 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.4s finished


DecisionTree  Score: 0.7899159663865546
{'criterion': 'entropy', 'max_depth': 3}
Fitting 10 folds for each of 3 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    2.6s finished


DecisionTree  Score: 0.8011204481792717
{'n_estimators': 100}
Fitting 10 folds for each of 2 candidates, totalling 20 fits
KNeighbors  Score: 0.7675070028011205
{'n_neighbors': 3}


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.3s finished


In [7]:
model=tree.DecisionTreeClassifier(criterion='gini',max_depth=3)
model.fit(X_train, y_train)


DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [8]:
testsubmitrawdata=pd.read_csv('test.csv')
testsubmitdata=preprocessdata(testsubmitrawdata)
# testsubmitdata=testsubmitdata.drop(['Parch_9'],axis=1)

In [9]:
model.predict(testsubmitdata)

ValueError: Number of features of the model must match the input. Model n_features is 100 and input n_features is 91 

In [None]:
outputdf=pd.DataFrame(
    {
    'PassengerId':testsubmitrawdata.PassengerId,
    'Survived':model.predict(testsubmitdata)
    })

In [None]:
outputdf.to_csv('results.csv',index=False)