In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read the Titanic training set from the Kaggle input directory and display it.
train_csv_path = r'/kaggle/input/titanic/train.csv'
titanic_data = pd.read_csv(train_csv_path)
titanic_data

In [None]:
# Print a concise summary of the frame (columns, dtypes, non-null counts).
titanic_data.info()

In [None]:
# Count the null entries per column and print them under a header,
# separated from the header by one blank line.
missing_per_column = titanic_data.isna().sum()
print('Missing values for each column:', '', missing_per_column, sep='\n')

 ### The Cabin column is missing almost 80% of its values, so it is better to drop that column.
 
 ### The Embarked column has only 2 missing values; since Embarked is not used as a feature, the whole column is dropped below rather than just those two rows.

 ### We will fill the missing values in the 'Age' column using another column.

In [None]:
# Correlation heatmap to see which columns are related to 'Age'.

plt.figure(figsize=(8,6))
# Correlate numeric columns only: the raw frame still contains non-numeric
# columns (e.g. 'Sex', which is dummy-encoded later), and recent pandas
# versions raise on them instead of silently skipping them.
numeric_columns = titanic_data.select_dtypes(include='number')
sns.heatmap(numeric_columns.corr(),annot=True,cmap='rainbow',linewidths=.02,linecolor='black')

### The heatmap above shows that the 'Age' column is correlated with both 'Pclass' and 'SibSp'. Let's use 'Pclass' to fill the missing values of the 'Age' column.

In [None]:
# Box plot of 'Age' grouped by 'Pclass'.

fig, ax = plt.subplots(figsize=(8,6))
class_colours = ['blue','green','red']
sns.boxplot(data=titanic_data, x='Pclass', y='Age', palette=class_colours, saturation=0.4, ax=ax)

### This makes sense: 1st-class passengers tend to be wealthier, and wealthier people are generally older.

### Now we can use this graph to fill the missing values of the 'Age' column by creating a function.

In [None]:
# Creating function for missing values using above graph.

def age_fill(i):
    """Return a passenger's age, substituting a class-based value when missing.

    Parameters
    ----------
    i : pandas.Series or sequence
        One row holding the passenger's age and ticket class. Label access
        (``i['Age']``, ``i['Pclass']``) is tried first; an unlabelled row is
        read positionally as ``(Age, Pclass)``, which is the column order of
        ``titanic_data[['Age', 'Pclass']].apply(age_fill, axis=1)``.

    Returns
    -------
    The original age when it is present; otherwise 37 for 1st class,
    29 for 2nd class and 23 for any other class.
    """
    try:
        Age, Pclass = i['Age'], i['Pclass']
    except (KeyError, IndexError, TypeError):
        # Unlabelled row (list, tuple, array, ...): fall back to (Age, Pclass)
        # order. The class must come from position 1, not position 0 --
        # reading both values from position 0 made every missing age fall
        # through to the last branch.
        Age, Pclass = i[0], i[1]

    if not pd.isnull(Age):
        return Age

    # Per-class fill values chosen by the author from the Age/Pclass box plot.
    if Pclass == 1:
        return 37
    if Pclass == 2:
        return 29
    return 23

In [None]:
# Fill the 'Age' column row by row from each passenger's age and class.

age_and_class = titanic_data[['Age','Pclass']]
titanic_data['Age'] = age_and_class.apply(age_fill,axis=1)

### Getting Dummy Variables for categorical column 'Sex' and dropping some columns.

In [None]:
# Dummy-encode 'Sex' (first level dropped, columns prefixed with 'sex'),
# put the encoded column(s) in front and remove the original text column.
sex = pd.get_dummies(titanic_data['Sex'], prefix='sex', drop_first=True)

titanic_data = pd.concat([sex, titanic_data.drop(columns='Sex')], axis=1)

In [None]:
# Columns that are not used by the model below.
unused_columns = ['Name','Fare','Ticket', 'Cabin', 'Embarked','SibSp','Parch']
titanic_data = titanic_data.drop(columns=unused_columns)
print('Final dataset:')
titanic_data.head()

## Logistic Regression


In [None]:
from sklearn.model_selection import train_test_split

# Features are every remaining column except the target 'Survived' and the
# 'PassengerId' row identifier (if present): an arbitrary id carries no
# information about survival and only lets the model fit noise.
non_feature_columns = ['Survived', 'PassengerId']
X = titanic_data.drop(columns=[c for c in non_feature_columns if c in titanic_data.columns])
y = titanic_data['Survived']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=101)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression classifier with the solver's iteration cap set to 5000.
logR = LogisticRegression(max_iter=5000)

In [None]:
# Fit the classifier on the training split.
logR.fit(X_train,y_train)

In [None]:
# Predict labels for the held-out test split.
predictions = logR.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score,classification_report

# Compute both metrics first, then print: accuracy as a percentage,
# a blank gap, and the per-class precision/recall report.
accuracy_percent = accuracy_score(y_test,predictions)*100
report = classification_report(y_test,predictions)

print('Accuracy Score = ', accuracy_percent)
print('\n')
print(report)