In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [57]:
df=pd.read_csv('train.csv')

In [58]:
df.head()
plt.scatter()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


Data wrangling

1) Null values 
    
    Embarked - add mode at null 
    Age - add median value for that pclass
    cabin - either it can be removed or added as per the pclass
    
2) Classification of categorical columns(get_dummies)
    sex,embarked,pclass
    
3) remove passengerid, ticket and Name for time being    


In [17]:
df.groupby('Pclass')['Age'].median()

Pclass
1    37.0
2    29.0
3    24.0
Name: Age, dtype: float64

In [46]:
# funtion to add median value for null value in age col.
def add_age(cols):
    Age = cols[0]
    pclass= cols[1]
    if pd.isnull(Age):
        if pclass == 1:
            return 37
        elif pclass == 2:
            return 29
        else:
            return 24
    else:
        return Age
        


In [59]:
df['Age']=df[['Age','Pclass']].apply(add_age,axis=1)

In [68]:
df.groupby('Embarked').count().iloc[:,1]

Embarked
C    168
Q     77
S    644
Name: Survived, dtype: int64

In [77]:
# added 'S' for null value
df.loc[df.Embarked.isnull(),'Embarked']='S'

In [80]:
# dropped not so important columns
df.drop(columns=['Cabin','Ticket','PassengerId'],inplace=True)

In [84]:
#taking out the first salutation from name
df['Title'] = df.Name.str.extract('([A-Za-z]+)\.')

In [85]:
mapping = {'Mr':0, 'Mrs':1, 'Miss':2, 'Master':3, 'Don':3, 'Rev':3, 'Dr':3, 'Mme':3, 'Ms':3,
       'Major':3, 'Lady':3, 'Sir':3, 'Mlle':3, 'Col':3, 'Capt':3, 'Countess':3,
       'Jonkheer':3}
df['Title']=df['Title'].map(mapping)

In [86]:
# getdummies
df = pd.get_dummies(df,columns=['Sex','Embarked','Pclass'],drop_first=True)

In [87]:
df.columns

Index(['Survived', 'Name', 'Age', 'SibSp', 'Parch', 'Fare', 'Title',
       'Sex_male', 'Embarked_Q', 'Embarked_S', 'Pclass_2', 'Pclass_3'],
      dtype='object')

In [88]:
# dropping all unwanted col
df.drop(columns=['Name'],inplace=True)

In [97]:
df.head()

Unnamed: 0,Survived,Age,SibSp,Parch,Fare,Title,Sex_male,Embarked_Q,Embarked_S,Pclass_2,Pclass_3
0,0,22.0,1,0,7.25,0,1,0,1,0,1
1,1,38.0,1,0,71.2833,1,0,0,0,0,0
2,1,26.0,0,0,7.925,2,0,0,1,0,1
3,1,35.0,1,0,53.1,1,0,0,1,0,0
4,0,35.0,0,0,8.05,0,1,0,1,0,1


In [106]:
from sklearn.model_selection import cross_val_predict,cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report

In [98]:
X = df.drop('Survived',axis = 1)
y = df['Survived']

In [101]:
lr = LogisticRegression()

# Cross validation with Logistic regression

In [103]:
# cross validation algorithm with cross_val_predict
y_pred = cross_val_predict(lr,X,y,cv=10)



In [104]:
print(confusion_matrix(y,y_pred))

[[480  69]
 [102 240]]


In [105]:
print(classification_report(y,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.87      0.85       549
           1       0.78      0.70      0.74       342

   micro avg       0.81      0.81      0.81       891
   macro avg       0.80      0.79      0.79       891
weighted avg       0.81      0.81      0.81       891



In [108]:
cross_val_score(lr,X,y,cv=10).mean()



0.8080975485188968

# df, X, y are intact - now will try for normal random forest and SVN

In [110]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [111]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [112]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [114]:
rf_pred=rf.predict(X_test)

In [115]:
print(classification_report(y_test,rf_pred))
print(confusion_matrix(y_test,rf_pred))


              precision    recall  f1-score   support

           0       0.81      0.86      0.83       175
           1       0.77      0.71      0.74       120

   micro avg       0.80      0.80      0.80       295
   macro avg       0.79      0.78      0.79       295
weighted avg       0.80      0.80      0.80       295

[[150  25]
 [ 35  85]]


# SVM

In [116]:
# SVC
svc = SVC()
svc.fit(X_train,y_train)




SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [117]:
svc_pred = svc.predict(X_test)

In [118]:
print(classification_report(y_test,svc_pred))
print(confusion_matrix(y_test,svc_pred))


              precision    recall  f1-score   support

           0       0.72      0.89      0.80       175
           1       0.76      0.50      0.60       120

   micro avg       0.73      0.73      0.73       295
   macro avg       0.74      0.70      0.70       295
weighted avg       0.74      0.73      0.72       295

[[156  19]
 [ 60  60]]


# Logistic Regression

In [120]:
lr2 = LogisticRegression()

In [121]:
lr2.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [122]:
lr2_pred = lr2.predict(X_test)

In [123]:
print(classification_report(y_test,lr2_pred))
print(confusion_matrix(y_test,lr2_pred))


              precision    recall  f1-score   support

           0       0.83      0.86      0.85       175
           1       0.79      0.75      0.77       120

   micro avg       0.82      0.82      0.82       295
   macro avg       0.81      0.81      0.81       295
weighted avg       0.82      0.82      0.82       295

[[151  24]
 [ 30  90]]


In [130]:
lr2_pred

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0,
       0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 1, 1, 0], dtype=int64)

In [131]:
df_test_final = pd.read_csv('test_clean.csv')

In [151]:
df_pasengerid = pd.read_csv('passengerid.csv',names = ['PassengerId'])

In [137]:
df_test_final.drop(columns='Unnamed: 0',inplace = True)

In [138]:
X.columns

Index(['Age', 'SibSp', 'Parch', 'Fare', 'Title', 'Sex_male', 'Embarked_Q',
       'Embarked_S', 'Pclass_2', 'Pclass_3'],
      dtype='object')

In [139]:
df_test_final.columns


Index(['Age', 'SibSp', 'Parch', 'Fare', 'Title', 'Sex_male', 'Embarked_Q',
       'Embarked_S', 'Pclass_2', 'Pclass_3'],
      dtype='object')

In [140]:
kaggle_final_pred = lr2.predict(df_test_final)

In [167]:
df_pasengerid.head()

Unnamed: 0,PassengerId
0,892
1,893
2,894
3,895
4,896


In [159]:
kaggle_final_pred

array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,

In [163]:
kaggle_final = pd.DataFrame(kaggle_final_pred, columns=['Survived_result'])

In [176]:
kaggle_final.to_csv('result.csv')

In [173]:
kaggle_upload = pd.concat([df_pasengerid,kaggle_final],ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [174]:
kaggle_upload

Unnamed: 0,PassengerId,Survived_result
0,892.0,
1,893.0,
2,894.0,
3,895.0,
4,896.0,
5,897.0,
6,898.0,
7,899.0,
8,900.0,
9,901.0,
