In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, roc_curve

from sklearn.neighbors import KNeighborsRegressor

import scikitplot as skplt

In [12]:
# Read the Titanic train and test CSV files, then preview the training frame.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/titanic/train.csv', '../input/titanic/test.csv')
)
train.head()

In [3]:
# Preview the first five rows of the test split.
test.head(n=5)

In [13]:
# Set the label column and the test passenger ids aside for later use.
target, test_ids = train['Survived'], test['PassengerId']

In [15]:
# Feature-only views of both splits: no passenger id, and no label for train.
train1 = train.drop(columns=['PassengerId', 'Survived'])
test1 = test.drop(columns=['PassengerId'])

In [16]:
# Stack train features on top of test features under a fresh 0..n-1 index.
data1 = pd.concat([train1, test1], axis=0, ignore_index=True)
data1.head()

In [7]:
# Number of missing values per column in the combined frame.
data1.isnull().sum()

In [17]:
# Remove the columns that are not used as model features.
# Rebinding `data1` (instead of `inplace=True`) together with `errors='ignore'`
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
data1 = data1.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

In [9]:
# Preview the combined frame after the column drop.
data1.head(n=5)

## **EDA**

In [18]:
# Bar chart of how many passengers fall into each `Survived` class.
survival_ax = sns.countplot(x=target, palette='RdPu')
survival_ax.set_xlabel('Titanic Survival Rate');

So, it's clear from the above plot that the majority of the people on board the Titanic did not survive.

In [19]:
plt.figure(figsize= (16, 8))
# `data1` still holds the text columns `Sex` and `Embarked` at this point.
# pandas >= 2.0 no longer drops non-numeric columns silently in `.corr()`
# and raises instead, so restrict the frame to numeric columns explicitly.
numeric_features = data1.select_dtypes(include=np.number)
sns.heatmap(numeric_features.corr(), annot = True, cmap= 'YlGnBu', fmt= '.2f');

It seems like most of our independent variables are not strongly correlated, except `SibSp` and `Parch`. We will deal with that during feature engineering.

In [20]:
sns.set_context('notebook', font_scale= 1.2)
fig, ax = plt.subplots(2, figsize = (20, 13))

plt.suptitle('Distribution of Age and Fare based on target variable', fontsize = 20)

# Only the training split carries the `Survived` label, so both plots use `train`.
# `fill=True` replaces the deprecated `shade=True` keyword of `sns.kdeplot`.
ax1 = sns.kdeplot(x ='Age', data= train, hue= 'Survived', fill= True, ax= ax[0], palette= 'twilight')
ax1.set(xlabel = 'Age', title= 'Distribution of Age based on target variable')

ax2 = sns.kdeplot(x ='Fare', data= train, hue= 'Survived', fill= True, ax= ax[1], palette= 'twilight')
ax2.set(xlabel = 'Fare', title= 'Distribution of Fare based on target variable')

plt.show()

It is evident from the plot that children tended to have a higher chance of survival than older individuals.


In [21]:
# Passenger counts per sex, split by survival outcome (training split only).
sex_ax = sns.countplot(data=train, x='Sex', hue='Survived', palette='pastel')
sex_ax.set_title('Survival chance based on Gender', fontsize=15);

There is a clear pattern here: it seems that females were 3 times more likely to survive than males.

In [23]:
# Passenger counts per ticket class, split by survival outcome (training split only).
pclass_ax = sns.countplot(data=train, x='Pclass', hue='Survived', palette='pastel')
pclass_ax.set_title('Survival chance based on Ticket Class', fontsize=15);

We can also conclude that people travelling in 3rd class were less likely to survive as compared to people travelling in first class


## **Filling Missing Values**

In [24]:
def knn_impute(df, na_target, n_neighbors=5):
    """Fill the missing values of one numeric column with KNN regression predictions.

    The predictors are the numeric columns of ``df`` that contain no missing
    values at all. Rows where ``na_target`` is present train a
    ``KNeighborsRegressor``; rows where it is missing receive its predictions.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is copied, never modified in place.
    na_target : str
        Name of the numeric column whose missing values are filled.
    n_neighbors : int, default 5
        Number of neighbours used by the regressor. It is capped at the number
        of rows where ``na_target`` is observed.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the missing entries of ``na_target`` filled.

    Raises
    ------
    KeyError
        If ``na_target`` is not a column of ``df``, or has missing values but
        is not numeric.
    ValueError
        If ``na_target`` has missing values but no observed value to learn from.
    """
    df = df.copy()

    missing_mask = df[na_target].isna()
    if not missing_mask.any():
        # Nothing to impute; predicting on an empty matrix would raise.
        return df

    observed_mask = ~missing_mask
    n_observed = int(observed_mask.sum())
    if n_observed == 0:
        raise ValueError(
            f"Column {na_target!r} has no observed values to impute from."
        )

    numeric_df = df.select_dtypes(np.number)
    # Only fully observed numeric columns can serve as predictors.
    predictor_columns = numeric_df.columns[numeric_df.notna().all()]

    y_train = numeric_df.loc[observed_mask, na_target]
    X_train = numeric_df.loc[observed_mask, predictor_columns]
    X_test = numeric_df.loc[missing_mask, predictor_columns]

    # KNN cannot query more neighbours than there are training rows.
    knn = KNeighborsRegressor(n_neighbors=min(n_neighbors, n_observed))
    knn.fit(X_train, y_train)

    df.loc[missing_mask, na_target] = knn.predict(X_test)

    return df

In [25]:
# Histogram (with KDE overlay) of `Age` before imputation.
age_values = data1['Age']
sns.histplot(age_values, kde=True, color='teal');

In [26]:
# Fill the missing `Age` entries via the KNN helper; `data1` itself is left untouched.
data2 = knn_impute(df=data1, na_target='Age')

In [28]:
# Passenger counts per port of embarkation.
embarked_values = data2['Embarked']
sns.countplot(x=embarked_values, palette='Set2');

The majority of the people embarked from Southampton, so we will just fill the missing values in the `Embarked` column with `S`.

In [29]:
# Assign the filled column back instead of calling `fillna(..., inplace=True)`
# on the selected column: the chained in-place form emits a FutureWarning in
# pandas 2.x and leaves `data2` unchanged once Copy-on-Write is enabled.
data2['Embarked'] = data2['Embarked'].fillna('S')

In [30]:
# Histogram (with KDE overlay) of `Fare` on a wider canvas.
plt.figure(figsize=(10, 6))
fare_values = data2['Fare']
sns.histplot(fare_values, kde=True, color='Teal');

The distribution of `Fare` is clearly skewed, so we will fill the only missing value in this column with the median.

In [32]:
# Assign the filled column back instead of calling `fillna(..., inplace=True)`
# on the selected column: the chained in-place form emits a FutureWarning in
# pandas 2.x and leaves `data2` unchanged once Copy-on-Write is enabled.
data2['Fare'] = data2['Fare'].fillna(data1['Fare'].median())

In [33]:
# Number of missing values per column after imputation.
data2.isnull().sum()


We don't have any more missing values in the dataset. Let's now move on to Encoding our categorical variables

## **Encoding**

In [34]:
# Work on an independent copy so `data2` keeps the un-encoded values.
data3 = data2.copy(deep=True)
data3.info()

In [35]:
# Turn the ticket class and the label into strings.
# NOTE(review): `Pclass` is not one-hot encoded anywhere in the visible cells —
# confirm that a string-typed column is what the model is meant to receive.
data3['Pclass'] = data3['Pclass'].map(str)
target = target.map(str)

In [36]:
# Integer-encode the two categorical columns with one shared encoder.
# The encoder is refit per column, so `le` ends up fitted on `Embarked`.
le = LabelEncoder()
numerical_sex, numerical_embarked = (
    le.fit_transform(data2[column]) for column in ('Sex', 'Embarked')
)

In [37]:
# Replace the text columns with their integer codes.
data3 = data3.assign(Sex=numerical_sex, Embarked=numerical_embarked)

In [38]:
# Preview the frame after encoding.
data3.head(n=5)

## **Scaling**

In [39]:
# Standardise the two continuous features.
# The scaler is fitted on the combined train + test rows held in `data3`.
scaled_columns = ['Age', 'Fare']
sc = StandardScaler()
sc.fit(data3[scaled_columns])
data3[scaled_columns] = sc.transform(data3[scaled_columns])


In [40]:
# Preview the frame after scaling.
data3.head(n=5)

## **Feature Engineering**

Both the `SibSp` and `Parch` columns indicate whether the person was travelling with family or not. So we will convert these features into a single feature called `Family`.

In [41]:
# 1 when the passenger has at least one sibling/spouse or parent/child aboard, else 0.
has_relatives = (data3['SibSp'] + data3['Parch']) > 0
data3['Family'] = np.where(has_relatives, 1, 0)
# The two source columns are now summarised by `Family`.
data3.drop(columns=['SibSp', 'Parch'], inplace=True)

In [42]:
# Preview the frame after feature engineering.
data3.head(n=5)

In [43]:
# Undo the earlier concat: rows up to the last train index label belong to the
# training split, the remainder to the test split (re-indexed from 0).
last_train_label = train.index.max()
train_final = data3.loc[:last_train_label].copy()
test_final = data3.loc[last_train_label + 1:].reset_index(drop=True).copy()


## **Logistic Regression**

In [44]:
# Fit an L2-penalised logistic regression and predict labels for both splits.
lr = LogisticRegression(penalty='l2', solver='liblinear')
lr.fit(train_final, target)
train_pred, test_pred = lr.predict(train_final), lr.predict(test_final)

In [45]:
# Accuracy on the training split (the same rows the model was fitted on).
train_accuracy = accuracy_score(target, train_pred)
print('Accuracy Score:', round(train_accuracy, 3))


In [46]:
# Per-class precision / recall / F1 on the training split.
train_report = classification_report(target, train_pred)
print(train_report)

In [47]:
# Keep the second probability column of the training predictions
# (presumably the '1' / survived class — verify against `lr.classes_`).
train_probabilities = lr.predict_proba(train_final)
pred_1 = train_probabilities[:, 1]

In [48]:
# Histogram of the predicted probabilities on the training split.
# The labels previously mentioned "heart attack", but the model's target
# here is the `Survived` column.
sns.histplot(pred_1, bins= 5, color= 'teal')
plt.title('Histogram of predicted probabilities of survival')
plt.xlabel('Predicted probability of survival')
plt.show()

The majority of the predicted probabilities are less than 0.2, which means that, according to our model, relatively few people are predicted to survive.

In [49]:
# Confusion matrix of true vs. predicted labels on the training split.
skplt.metrics.plot_confusion_matrix(
    target,
    train_pred,
    cmap='YlGnBu',
    figsize=(6, 6),
);

In [50]:
# The labels were cast to strings earlier, so the positive class is '1'.
fpr, tpr, thresholds = roc_curve(target, pred_1, pos_label='1')

roc_fig, roc_ax = plt.subplots(figsize=(6, 4))
roc_ax.plot(fpr, tpr, linewidth=2, color='teal')
# Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
roc_ax.plot([0, 1], [0, 1], 'r--')
roc_ax.set_title('ROC Curve')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')

plt.show()

In [51]:
# Area under the ROC curve on the training split.
train_auc = roc_auc_score(target, pred_1)
print('ROC AUC Score:', round(train_auc, 2))

In [52]:
# Start the submission table from the saved test passenger ids.
submission = test_ids.to_frame()

In [53]:
# Attach the predicted labels for the test split.
submission = submission.assign(Survived=test_pred)

In [55]:
# Write the submission file without the row index.
# `index=False` is the documented way to do this; `index=None` only worked
# because `None` happens to be falsy.
submission.to_csv('submission.csv', index=False)