In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of each file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the training and test CSV files into DataFrames (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

In [None]:
# Print a concise summary of the training DataFrame.
train_data.info()

In [None]:
# Display the dtype of every column in the training DataFrame.
train_data.dtypes

In [None]:
# Impute missing ages with the mean age of the training split.
#
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on the selected column: that chained in-place form
# acts on an intermediate object, is deprecated in recent pandas, and leaves
# the DataFrame untouched once Copy-on-Write is enabled.
#
# The test split reuses the training mean so both splits are preprocessed with
# the same statistic, mirroring how the scaler is fit on the training data only.
age_mean = train_data["Age"].mean()
train_data["Age"] = train_data["Age"].fillna(age_mean)
test_data["Age"] = test_data["Age"].fillna(age_mean)

In [None]:
# Drop the Name column from both splits.
train_data = train_data.drop(columns=["Name"])
test_data = test_data.drop(columns=["Name"])

# Encode Sex as an integer flag using one shared mapping for both splits.
sex_codes = {"male": 1, "female": 0}
train_data["Sex"] = train_data["Sex"].map(sex_codes)
test_data["Sex"] = test_data["Sex"].map(sex_codes)

# Fill missing Embarked values with the most frequent training value.
# The result is assigned back rather than using fillna(..., inplace=True) on
# the selected column, which is deprecated and has no effect under
# Copy-on-Write. The training mode is reused for the test split so both
# splits are imputed with the same value.
embarked_mode = train_data["Embarked"].mode()[0]
train_data["Embarked"] = train_data["Embarked"].fillna(embarked_mode)
test_data["Embarked"] = test_data["Embarked"].fillna(embarked_mode)

# Sanity check: number of Embarked values still missing in the training split.
train_data.Embarked.isna().sum()


In [None]:
# One-hot encode Embarked in each split, then discard the raw Embarked column
# together with Ticket.
train_data = (
    train_data
    .join(pd.get_dummies(train_data["Embarked"], prefix="Embarked"))
    .drop(columns=["Embarked", "Ticket"])
)

test_data = (
    test_data
    .join(pd.get_dummies(test_data["Embarked"], prefix="Embarked"))
    .drop(columns=["Embarked", "Ticket"])
)

In [None]:
def _first_char(value):
    """Return the first character of ``str(value)``; a NaN yields 'n' (from 'nan')."""
    return str(value)[0]


# Derive a one-character cabin code for every row of both splits.
train_data["cabin_letters"] = train_data["Cabin"].apply(_first_char)
test_data["cabin_letters"] = test_data["Cabin"].apply(_first_char)


In [None]:
# One-hot encode the cabin code in each split, then discard the raw Cabin
# column and the intermediate cabin_letters column.
train_data = (
    train_data
    .join(pd.get_dummies(train_data["cabin_letters"], prefix="cabin_letters"))
    .drop(columns=["Cabin", "cabin_letters"])
)

test_data = (
    test_data
    .join(pd.get_dummies(test_data["cabin_letters"], prefix="cabin_letters"))
    .drop(columns=["Cabin", "cabin_letters"])
)




In [None]:
# Print the per-column count of missing values, training split first.
for split in (train_data, test_data):
    print(split.isna().sum())

In [None]:
# Fill missing test fares with the mean fare of the training split.
# The filled column is assigned back instead of calling
# fillna(..., inplace=True) on the selected column, which is deprecated in
# recent pandas and leaves the DataFrame unchanged under Copy-on-Write.
# Using the training mean keeps the imputation consistent with the scaler,
# which is also fit on the training data only.
fare_mean = train_data["Fare"].mean()
test_data["Fare"] = test_data["Fare"].fillna(fare_mean)

# Re-check the per-column missing-value counts of the test split.
print(test_data.isna().sum())

Scaling and standardization of data

In [None]:
from sklearn.preprocessing import StandardScaler
# Columns passed to the models.
features = [
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked_C',
    'Embarked_Q',
    'Embarked_S',
    'cabin_letters_A',
    'cabin_letters_B',
    'cabin_letters_C',
    'cabin_letters_D',
    'cabin_letters_E',
    'cabin_letters_F',
    'cabin_letters_G',
]

# Split the target off the training frame: pop returns the column and removes
# it from train_data in the same step.
train_y = train_data.pop("Survived")

# Fit the scaler on the training split only, then apply the same
# transformation to the test split.
ss = StandardScaler()
train_data_scaled = ss.fit_transform(train_data[features])
test_data_scaled = ss.transform(test_data[features])

Model building

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.svm import SVC

In [None]:
# Logistic regression: print the per-fold cross-validation scores (cv=5) and their mean.
lr = LogisticRegression()
cv = cross_val_score(lr, X=train_data_scaled, y=train_y, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# K-nearest neighbours: print the per-fold cross-validation scores (cv=5) and their mean.
knn = KNeighborsClassifier()
cv = cross_val_score(knn, X=train_data_scaled, y=train_y, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# Random forest (fixed seed): print the per-fold cross-validation scores (cv=5) and their mean.
rfc = RandomForestClassifier(random_state=1)
cv = cross_val_score(rfc, X=train_data_scaled, y=train_y, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# Decision tree (fixed seed): print the per-fold cross-validation scores (cv=5) and their mean.
dt = tree.DecisionTreeClassifier(random_state=1)
cv = cross_val_score(dt, X=train_data_scaled, y=train_y, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
from xgboost import XGBClassifier
# XGBoost classifier with default settings: print the per-fold
# cross-validation scores (cv=5) and their mean.
xgb = XGBClassifier()
cv = cross_val_score(xgb, X=train_data_scaled, y=train_y, cv=5)
print(cv, cv.mean(), sep="\n")


In [None]:
# Support vector classifier with probability estimates enabled, which the
# soft-voting ensemble needs from each member.
# The probability estimates rely on an internally shuffled cross-validation,
# so random_state is pinned (to the same seed as the tree models) to keep the
# ensemble's output reproducible between runs.
svc = SVC(probability=True, random_state=1)
cv = cross_val_score(svc, train_data_scaled, train_y, cv=5)
print(cv)
print(cv.mean())

In [None]:
from sklearn.ensemble import VotingClassifier
# Soft-voting ensemble built from the six classifiers created in the previous cells.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('knn', knn),
        ('dt', dt),
        ('rfc', rfc),
        ('xgb', xgb),
        ('svc', svc),
    ],
    voting='soft',
)

In [None]:
# Voting ensemble: print the per-fold cross-validation scores (cv=5) and their mean.
cv = cross_val_score(voting_clf, X=train_data_scaled, y=train_y, cv=5)
print(cv, cv.mean(), sep="\n")

In [None]:
# Fit the ensemble on the full scaled training data and predict the test split.
voting_clf.fit(train_data_scaled, train_y)
predictions = voting_clf.predict(test_data_scaled).astype(int)

# Pair each test PassengerId with its prediction and write the submission CSV.
output = pd.DataFrame(
    {
        'PassengerId': test_data['PassengerId'],
        'Survived': predictions,
    }
)
output.to_csv('submission.csv', index=False)