In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)



In [5]:
titanic_train = pd.read_csv("train.csv")
titanic_train.tail()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.45,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0,C148,C
890,891,0,3,"Dooley, Mr. Patrick",male,32.0,0,0,370376,7.75,,Q


In [6]:
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


## Investigating Linear Correlations
Positive correlation of **survived** with **fare** and negative correlation of **survived** with **Pclass**(Passenger Class) shows that passengers with high fare and 1 or 2 class have better chance of survival compared to lower fare or 3 class.  
**But we cannot make conclusions with just Linear Correlation we need to investigate more.**

In [7]:
corr_titanic = titanic_train.corr()
corr_titanic["Survived"].sort_values(ascending=False)

Survived       1.000000
Fare           0.257307
Parch          0.081629
PassengerId   -0.005007
SibSp         -0.035322
Age           -0.077221
Pclass        -0.338481
Name: Survived, dtype: float64

## Dropping irrelevant columns

In [8]:
titanic_train_new = titanic_train.drop(["Name","Ticket","PassengerId","Cabin"], axis=1)
titanic_train_new.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [9]:
from sklearn.preprocessing import LabelEncoder

encoder_sex = LabelEncoder()
encoded_sex = encoder_sex.fit_transform(titanic_train_new["Sex"])
encoded_sex_dict = dict(zip([0,1], encoder_sex.classes_)) #Can be used for mapping later

dict_embarked = {"C":1, "Q": 2, "S":3}

titanic_train_new["encoded_embarked"] = titanic_train_new["Embarked"].map(dict_embarked)
titanic_train_new["encoded_sex"] = encoded_sex

titanic_train_final = titanic_train_new.drop(["Sex","Embarked"], axis=1)

from sklearn.preprocessing import Imputer

imputer = Imputer(strategy="median")
titanic_train_finaly  = imputer.fit_transform(titanic_train_final)
titanic_train = pd.DataFrame(titanic_train_finaly, columns=titanic_train_final.columns)

def null_checker(data):
    column_list = data.columns
    Null_dict = {}   
    for column in column_list:
        Null_value_count = data[data[column].isnull() == True].count()[1]
        Null_dict[column] = Null_value_count
    
    print(Null_dict)
        
null_checker(titanic_train)  
#titanic_train_final.fillna(0,inplace=True)

{'Survived': 0, 'Pclass': 0, 'Age': 0, 'SibSp': 0, 'Parch': 0, 'Fare': 0, 'encoded_embarked': 0, 'encoded_sex': 0}


In [10]:
X = titanic_train.drop("Survived", axis=1)
y = titanic_train["Survived"]
X.shape

(891, 7)

In [11]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Machine Learning

### K Nearest Neighbor

In [12]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

from sklearn.model_selection import cross_val_score
scores_knn = cross_val_score(knn, X, y, cv=3, scoring="accuracy")

scores_knn.mean()

0.7037037037037037

Scores on standardized data

In [13]:
knn_2 = KNeighborsClassifier()

scores_knn_scaled = cross_val_score(knn_2, X_scaled, y, cv=3, scoring="accuracy")

scores_knn_scaled.mean()

0.7934904601571269

Grid searching best parameters

In [14]:
from sklearn.model_selection import GridSearchCV

param_grid = [{'n_neighbors':[3,5,7],'weights':['distance','uniform'],'p':[1,2]}]

grid_search = GridSearchCV(knn, param_grid, cv=3, scoring="accuracy")

grid_search = grid_search.fit(X_scaled, y)
grid_search.best_estimator_

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=7, p=1,
           weights='uniform')

In [15]:
knn_best = grid_search.best_estimator_

scores_knn_best = cross_val_score(knn_best, X_scaled, y, cv=3, scoring="accuracy")

scores_knn_best.mean()

0.8024691358024691

### Stochastic Gradient Descent

In [16]:
survived_train = (y==1)

from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier()

scores_sgd = cross_val_score(sgd, X_scaled, survived_train, cv=3, scoring="accuracy")

scores_sgd



array([0.67340067, 0.7003367 , 0.77441077])

### Random Forest Classifier

In [17]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()

scores_forest = cross_val_score(forest, X, y, cv=3, scoring="accuracy")

scores_forest

array([0.78451178, 0.78787879, 0.79124579])

In [18]:
forest_2 = RandomForestClassifier()

scores_forest_scaled = cross_val_score(forest_2, X_scaled, y, cv=3, scoring="accuracy")

scores_forest_scaled.mean()

0.7934904601571268

In [19]:
param_grid_forest = [{'n_estimators':[10,20,30],'max_features':['auto','log2',None],'criterion':['gini','entropy'],'bootstrap':[True, False]}]

grid_search_forest = GridSearchCV(forest, param_grid_forest, cv=3, scoring="accuracy")

grid_search_forest = grid_search_forest.fit(X_scaled, y)
grid_search_forest.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
forest_best = grid_search_forest.best_estimator_

scores_forest_best = cross_val_score(forest_best, X_scaled, y, cv=3, scoring="accuracy")

scores_forest_best.mean()

0.7934904601571269

In [21]:
test = pd.read_csv('test.csv')
test_new = test.drop(["Name","Ticket","PassengerId","Cabin"], axis=1)
test_new.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,3,male,34.5,0,0,7.8292,Q
1,3,female,47.0,1,0,7.0,S
2,2,male,62.0,0,0,9.6875,Q
3,3,male,27.0,0,0,8.6625,S
4,3,female,22.0,1,1,12.2875,S


In [22]:
encoder_sex = LabelEncoder()
encoded_sex = encoder_sex.fit_transform(test_new["Sex"])
#encoded_sex_dict = dict(zip([0,1], encoder_sex.classes_)) #Can be used for mapping later

dict_embarked = {"C":1, "Q": 2, "S":3}

test_new["encoded_embarked"] = test_new["Embarked"].map(dict_embarked)
test_new["encoded_sex"] = encoded_sex

test_new_final = test_new.drop(["Sex","Embarked"], axis=1)

imputer2 = Imputer(strategy="median")
test_imputed  = imputer2.fit_transform(test_new_final)

scaler2 = StandardScaler()
test_scaled = scaler2.fit_transform(test_imputed)
test_scaled.shape

(418, 7)

In [23]:
knn_best = knn_best.fit(X_scaled, y)
predictions_knn = knn_best.predict(test_scaled)
submission_knn = pd.DataFrame({
    'PassengerId':test['PassengerId'],
    'Survived':predictions_knn
})

#submission = pd.DataFrame([test["PassengerId"],predictions], columns=["PassengerId","Survived"])
#submission.columns = ["PassengerId","Survived"]
submission_knn.to_csv('av_Submission_knn.csv',index=False)

forest_best = forest_best.fit(X_scaled, y)
predictions_forest = forest_best.predict(test_scaled)
submission_forest = pd.DataFrame({
    'PassengerId':test['PassengerId'],
    'Survived':predictions_forest
})

#submission = pd.DataFrame([test["PassengerId"],predictions], columns=["PassengerId","Survived"])
#submission.columns = ["PassengerId","Survived"]
submission_forest.to_csv('av_Submission_forest.csv',index=False)

## Pickling best model

In [25]:
def pickler(name, model, predictions, training_data, training_label=None, cross_val_score=None, **kwargs):
    
    #Making dicitonary
    import numpy as np
    pickle_dict = {}
    pickle_name = name+".pickle"
    pickle_dict["ML Model"] = model
    pickle_dict["Predictions"] = np.array(predictions)
    pickle_dict["Training Data"] = np.array(training_data)
    pickle_dict["Training Labels"] = np.array(training_label)
    for key, value in kwargs.items():
        pickle_dict[key] = value
    
    #Pickling the dictionary
    from sklearn.externals import joblib
    joblib.dump(pickle_dict, pickle_name)

pickler("knn", knn_best, predictions_knn, test_scaled) 

In [24]:
gender = pd.read_csv('gender_submission.csv')
gender.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
