In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# Read the Titanic training and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/titanic/train.csv", "../input/titanic/test.csv")
)

In [3]:
# Print a summary of the training frame: one row per column with its
# non-null count and dtype, which shows which columns have missing values.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Display the dtype of every column in the training frame.
train_data.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [5]:
# Impute missing ages with the mean age of the respective frame.
# The filled column is assigned back explicitly rather than calling
# fillna(inplace=True) on the selected column: that chained in-place form
# acts on a temporary object under pandas Copy-on-Write (and emits a
# FutureWarning on pandas 2.x), leaving the frame itself unchanged.
train_data["Age"] = train_data["Age"].fillna(train_data["Age"].mean())
test_data["Age"] = test_data["Age"].fillna(test_data["Age"].mean())

In [6]:
# Training frame: drop the Name column, encode Sex as 1 (male) / 0 (female),
# and fill missing Embarked values with the most frequent value.
# The fill result is assigned back to the column; calling
# fillna(inplace=True) on a selected column is chained assignment and does
# not modify the frame under pandas Copy-on-Write.
train_data = train_data.drop(columns=["Name"])
train_data["Sex"] = train_data["Sex"].map({"male" : 1, "female" : 0})
train_data["Embarked"] = train_data["Embarked"].fillna(train_data["Embarked"].mode()[0])


# Test frame: same three steps, using the test frame's own most frequent Embarked value.
test_data = test_data.drop(columns=["Name"])
test_data["Sex"] = test_data["Sex"].map({"male" : 1, "female" : 0})
test_data["Embarked"] = test_data["Embarked"].fillna(test_data["Embarked"].mode()[0])

# Count of Embarked values still missing in the training frame after the fill.
train_data.Embarked.isna().sum()


0

In [7]:
# One-hot encode Embarked into Embarked_* indicator columns, then discard the
# original Embarked column together with Ticket. Same steps for both frames.
train_data = (
    train_data
    .join(pd.get_dummies(train_data["Embarked"], prefix="Embarked"))
    .drop(columns=["Embarked", "Ticket"])
)

test_data = (
    test_data
    .join(pd.get_dummies(test_data["Embarked"], prefix="Embarked"))
    .drop(columns=["Embarked", "Ticket"])
)

In [8]:
# Keep only the first character of each Cabin value's string form.
# A missing cabin (NaN) stringifies to "nan", so it ends up as the letter "n".
train_data["cabin_letters"] = train_data["Cabin"].map(lambda cabin: str(cabin)[0])
test_data["cabin_letters"] = test_data["Cabin"].map(lambda cabin: str(cabin)[0])


In [9]:
# One-hot encode the cabin letter into cabin_letters_* indicator columns, then
# remove the raw Cabin column and the intermediate cabin_letters column.
train_data = (
    train_data
    .join(pd.get_dummies(train_data["cabin_letters"], prefix="cabin_letters"))
    .drop(columns=["Cabin", "cabin_letters"])
)

test_data = (
    test_data
    .join(pd.get_dummies(test_data["cabin_letters"], prefix="cabin_letters"))
    .drop(columns=["Cabin", "cabin_letters"])
)




In [10]:
# Per-column count of missing values: training frame first, then test frame.
print(train_data.isna().sum(), test_data.isna().sum(), sep="\n")

PassengerId        0
Survived           0
Pclass             0
Sex                0
Age                0
SibSp              0
Parch              0
Fare               0
Embarked_C         0
Embarked_Q         0
Embarked_S         0
cabin_letters_A    0
cabin_letters_B    0
cabin_letters_C    0
cabin_letters_D    0
cabin_letters_E    0
cabin_letters_F    0
cabin_letters_G    0
cabin_letters_T    0
cabin_letters_n    0
dtype: int64
PassengerId        0
Pclass             0
Sex                0
Age                0
SibSp              0
Parch              0
Fare               1
Embarked_C         0
Embarked_Q         0
Embarked_S         0
cabin_letters_A    0
cabin_letters_B    0
cabin_letters_C    0
cabin_letters_D    0
cabin_letters_E    0
cabin_letters_F    0
cabin_letters_G    0
cabin_letters_n    0
dtype: int64


In [11]:
# Fill the missing Fare value in the test frame with the test-frame mean.
# Assign the result back to the column: fillna(inplace=True) on a selected
# column is chained assignment and leaves the frame unchanged under pandas
# Copy-on-Write.
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].mean())
# Re-check the per-column missing-value counts after the fill.
print(test_data.isna().sum())

PassengerId        0
Pclass             0
Sex                0
Age                0
SibSp              0
Parch              0
Fare               0
Embarked_C         0
Embarked_Q         0
Embarked_S         0
cabin_letters_A    0
cabin_letters_B    0
cabin_letters_C    0
cabin_letters_D    0
cabin_letters_E    0
cabin_letters_F    0
cabin_letters_G    0
cabin_letters_n    0
dtype: int64


Scaling and standardization of data

In [12]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# Columns passed to the models. cabin_letters_T (present only in the training
# frame) and cabin_letters_n (the letter produced for missing cabins) are not
# listed.
features = [ 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch',
       'Fare', 'Embarked_C', 'Embarked_Q', 'Embarked_S', 'cabin_letters_A',
       'cabin_letters_B', 'cabin_letters_C', 'cabin_letters_D',
       'cabin_letters_E', 'cabin_letters_F', 'cabin_letters_G',] 

# Split the target off the training frame. pop() removes the column in place
# and returns it. The guard keeps this cell re-runnable: without it a second
# execution raises KeyError because "Survived" has already been removed.
if "Survived" in train_data.columns:
    train_y = train_data.pop("Survived")

# Fit the scaler on the training features only and reuse the fitted
# parameters to transform the test features.
train_data_scaled = ss.fit_transform(train_data[features])
test_data_scaled = ss.transform(test_data[features])

Model building

In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.svm import SVC

In [14]:
# Logistic regression with default settings, scored by 5-fold cross-validation.
lr = LogisticRegression()
cv = cross_val_score(estimator=lr, X=train_data_scaled, y=train_y, cv=5)
# Per-fold scores on the first line, their mean on the second.
print(cv, cv.mean(), sep="\n")

[0.78212291 0.80337079 0.78089888 0.78089888 0.81460674]
0.7923796371853619


In [15]:
# k-nearest-neighbours classifier with default settings, 5-fold cross-validated.
knn = KNeighborsClassifier()
cv = cross_val_score(estimator=knn, X=train_data_scaled, y=train_y, cv=5)
# Per-fold scores, then the mean score.
print(cv, cv.mean(), sep="\n")

[0.7877095  0.76404494 0.79775281 0.7752809  0.83707865]
0.7923733601154981


In [16]:
# Random forest with a fixed seed, scored by 5-fold cross-validation.
rfc = RandomForestClassifier(random_state=1)
cv = cross_val_score(estimator=rfc, X=train_data_scaled, y=train_y, cv=5)
# Per-fold scores, then the mean score.
print(cv, cv.mean(), sep="\n")

[0.77653631 0.78089888 0.83707865 0.7752809  0.85955056]
0.8058690603226413


In [17]:
# Single decision tree with a fixed seed, scored by 5-fold cross-validation.
dt = tree.DecisionTreeClassifier(random_state=1)
cv = cross_val_score(estimator=dt, X=train_data_scaled, y=train_y, cv=5)
# Per-fold scores, then the mean score.
print(cv, cv.mean(), sep="\n")

[0.72625698 0.76966292 0.79775281 0.76404494 0.79213483]
0.7699704977716403


In [18]:
from xgboost import XGBClassifier

# Gradient-boosted trees (XGBoost) with default settings, 5-fold cross-validated.
xgb = XGBClassifier()
cv = cross_val_score(estimator=xgb, X=train_data_scaled, y=train_y, cv=5)
# Per-fold scores, then the mean score.
print(cv, cv.mean(), sep="\n")


[0.78212291 0.80898876 0.85393258 0.79775281 0.84269663]
0.8170987383089573


In [19]:
# Support-vector classifier scored by 5-fold cross-validation.
# probability=True enables predict_proba, which the soft-voting ensemble that
# reuses this estimator relies on. Those probability estimates come from an
# internal, randomly shuffled cross-validation, so random_state is pinned
# (matching the seed used for rfc and dt) to make them reproducible.
svc = SVC(probability=True, random_state=1)
cv = cross_val_score(svc, train_data_scaled, train_y, cv=5)
# Per-fold scores, then the mean score.
print(cv)
print(cv.mean())

[0.7877095  0.80337079 0.79213483 0.80898876 0.83146067]
0.8047329106772958


In [20]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the six estimators defined in the earlier cells.
voting_clf = VotingClassifier(
    estimators=[
        ('lr', lr),
        ('knn', knn),
        ('dt', dt),
        ('rfc', rfc),
        ('xgb', xgb),
        ('svc', svc),
    ],
    voting='soft',
)

In [21]:
# Score the voting ensemble with the same 5-fold cross-validation as the
# individual models.
cv = cross_val_score(estimator=voting_clf, X=train_data_scaled, y=train_y, cv=5)
# Per-fold scores, then the mean score.
print(cv, cv.mean(), sep="\n")

[0.80446927 0.80898876 0.85393258 0.80337079 0.85393258]
0.8249387985688281


In [22]:
# Train the ensemble on the full scaled training set.
voting_clf.fit(train_data_scaled, train_y)

# Predict on the scaled test features and cast the labels to int.
predictions = voting_clf.predict(test_data_scaled).astype(int)

# Pair each test PassengerId with its predicted label and write the result
# to submission.csv without the index column.
output = pd.DataFrame(
    {
        "PassengerId": test_data["PassengerId"],
        "Survived": predictions,
    }
)
output.to_csv("submission.csv", index=False)