Importing the Dependencies

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Pre-processing

In [None]:
 #loading the dataset into  a pandas dataframe
# Relative path: it is resolved against the kernel's current working directory.
TITANIC_CSV = 'data/titanic.csv'
titanic_df = pd.read_csv(TITANIC_CSV)

# Preview the first five rows to check that the file parsed as expected.
titanic_df.head()

In [None]:
# Getting some info about the data: column names, dtypes and non-null counts
# (a quick way to spot which columns contain missing values).
titanic_df.info()

In [None]:
# Checking the number of rows and columns: .shape is a (n_rows, n_columns) tuple
titanic_df.shape

In [None]:
# Count the missing values in every column (isna is the canonical name of isnull).
titanic_df.isna().sum()

Handling the Missing values

###### Three columns have missing values: Age, Embarked and Cabin. The Ticket column may already indicate what kind of cabin a passenger had, so we can drop the Cabin column. The Age column is important, so instead of dropping it we fill its missing rows using imputation, i.e. replacing missing values with a measure of central tendency (mean, median or mode); here we use the mean. We use the same method for the Embarked column, filling it with the mode.

In [None]:
# Dropping the Cabin column.
# `columns=` already names the axis, so the redundant `axis=1` is removed.
# errors='ignore' keeps the cell re-runnable: executing it a second time is a
# no-op instead of raising KeyError because 'Cabin' is already gone.
titanic_df = titanic_df.drop(columns='Cabin', errors='ignore')


In [None]:
# Replacing the missing values in the "Age" column with the column mean.
# mean() has to be *called*: passing the bound method `.mean` hands fillna a
# function object rather than a number, so the gaps are never filled with the
# average. The result is assigned back to the column instead of relying on
# inplace=True applied to a column selection (chained assignment), which is
# not guaranteed to update titanic_df itself.
mean_age = titanic_df['Age'].mean()
titanic_df['Age'] = titanic_df['Age'].fillna(mean_age)

In [None]:
# Replacing the missing values in the "Embarked" column with the mode
# (the most frequent value).
# mode() returns a Series (there may be ties), and fillna aligns a Series
# argument on the index, so passing it directly would only fill a gap whose row
# label happens to match one of the mode Series' labels. Take the first mode as
# a scalar so every missing row is filled, and assign the result back instead
# of using inplace=True on a column selection.
embarked_mode = titanic_df['Embarked'].mode()[0]
titanic_df['Embarked'] = titanic_df['Embarked'].fillna(embarked_mode)

In [None]:
# Re-count the missing values per column after the cleaning steps.
titanic_df.isna().sum()

In [None]:
# Getting some statistical measures about the data
# (count, mean, std, min, quartiles and max of the numeric columns)
titanic_df.describe()

In [None]:
# Counting the passengers per outcome (0 = did not survive, 1 = survived)
titanic_df['Survived'].value_counts()

In [None]:
# Counting the number of male and female passengers
titanic_df['Sex'].value_counts()

Data Visualization

In [None]:
# Apply seaborn's default theme to the plots created in the cells below.
sns.set()

Not survived -> 0  
Survived -> 1

In [None]:
# Countplot of the "Survived" column, drawn on an explicitly created
# 3x3-inch figure/axes pair rather than pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(3, 3))
sns.countplot(data=titanic_df, x='Survived', ax=ax)

In [None]:
# Countplot of the "Sex" column, drawn on an explicitly created
# 3x3-inch figure/axes pair rather than pyplot's implicit current figure.
fig, ax = plt.subplots(figsize=(3, 3))
sns.countplot(data=titanic_df, x='Sex', ax=ax)

In [None]:
# Number of survivors by gender: one bar per Sex value, split by the
# "Survived" outcome via `hue`.
fig, ax = plt.subplots(figsize=(3, 3))
sns.countplot(data=titanic_df, x='Sex', hue='Survived', ax=ax)

In [None]:
# Countplot of the ticket class ("Pclass" column), drawn on an explicitly
# created 3x3-inch figure/axes pair.
fig, ax = plt.subplots(figsize=(3, 3))
sns.countplot(data=titanic_df, x='Pclass', ax=ax)

Encoding the categorical columns

In [None]:
# Encode the "Sex" and "Embarked" columns as integers.
# The outer keys are column names; each inner dict maps an original value to
# its numeric code. Values not listed in a mapping are left untouched.
category_codes = {
    'Sex': {'male': 0, 'female': 1},
    'Embarked': {'S': 0, 'C': 1, 'Q': 2},
}
titanic_df = titanic_df.replace(category_codes)

In [None]:
# Separate the target (y) from the feature matrix (x).
# PassengerId, Name and Ticket are left out of the features, as is the
# target column itself.
y = titanic_df['Survived']
non_feature_columns = ['PassengerId', 'Name', 'Ticket', 'Survived']
x = titanic_df.drop(columns=non_feature_columns)

Model Training and Evaluation

In [None]:
# Hold out 20% of the rows as test data; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

In [None]:
# Logistic-regression classifier created with scikit-learn's default settings.
# NOTE(review): the features are not scaled anywhere in this notebook; if fit()
# emits a ConvergenceWarning, consider raising max_iter or scaling the inputs
# -- confirm by running the training cell.
model = LogisticRegression()

In [None]:
# Model training: fit the classifier on the training split only, so the test
# split stays unseen until evaluation.
model.fit(x_train, y_train)

In [None]:
# Checking the accuracy on the training data.
# accuracy_score's signature is (y_true, y_pred), so the true labels go first.
# Accuracy is symmetric, so the number is unchanged, but the conventional
# order keeps the cell correct if it is later switched to an asymmetric metric.
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)
print('The accuracy score of the training data is: ', training_data_accuracy)

In [None]:
# Checking the accuracy on the held-out test data.
# accuracy_score's signature is (y_true, y_pred), so the true labels go first.
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)
# Print the test score here, not training_data_accuracy.
print('The accuracy score of the test data is: ', test_data_accuracy)

Building a predictive system

In [None]:
# Feature values for a single passenger, listed in the same order as the
# columns of `x` (check with `list(x.columns)`). For the standard Kaggle
# Titanic file that order is presumably
#   Pclass, Sex, Age, SibSp, Parch, Fare, Embarked   -- TODO confirm
# using the encodings applied earlier in this notebook:
#   Sex: male=0, female=1      Embarked: S=0, C=1, Q=2
input_data = (3, 0, 35.0, 0, 0, 8.05, 0)

# Fail early with a clear message instead of an opaque shape error in predict().
if len(input_data) != x.shape[1]:
    raise ValueError(
        f'input_data has {len(input_data)} values but the model expects '
        f'{x.shape[1]} features: {list(x.columns)}'
    )

#change input data to numpy array
input_as_numpy = np.asarray(input_data)

#reshape the numpy array to (1, n_features) as we are predicting for only one instance
input_reshaped = input_as_numpy.reshape(1, -1)

# Wrap the row in a DataFrame that carries the training column names, so the
# model receives the same feature names it was fitted with.
input_frame = pd.DataFrame(input_reshaped, columns=x.columns)

prediction = model.predict(input_frame)
print(prediction)

# Label meaning: 0 -> not survived, 1 -> survived.
if prediction[0] == 0:
    print('The passenger did not survive')
else:
    print('The passenger survived')
