In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
train = pd.read_csv("/kaggle/input/titanic/train.csv")
test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Analysing the data

In [None]:
train.head()

In [None]:
train.columns

In [None]:
train.isnull().sum() #check for any missing values

In [None]:
train[train.isnull().any(axis=1)]

In [None]:
def add_salutation_to_data(df):
    """Add a normalised ``Salutation`` column derived from the ``Name`` column.

    The salutation is the first run of ASCII letters that is immediately
    followed by a period (e.g. ``"Braund, Mr. Owen Harris"`` -> ``"Mr"``).
    Rare titles are then folded into broader groups using the mapping below;
    titles that are not in the mapping are kept unchanged.

    Args:
        df: DataFrame with a string ``Name`` column. It is modified in place.

    Returns:
        The same DataFrame object, now carrying the ``Salutation`` column.
    """
    salutation_groups = {
        'Mlle': 'Miss', 'Mme': 'Miss', 'Ms': 'Miss',
        'Dr': 'Mr', 'Major': 'Mr', 'Capt': 'Mr', 'Sir': 'Mr', 'Don': 'Mr',
        'Lady': 'Mrs', 'Countess': 'Mrs',
        'Jonkheer': 'Other', 'Col': 'Other', 'Rev': 'Other',
    }
    # Raw string: ``\.`` must reach the regex engine as an escaped period and
    # is not a valid Python string escape. expand=False yields a Series
    # instead of a one-column DataFrame.
    titles = df['Name'].str.extract(r'([A-Za-z]+)\.', expand=False)
    # Assign the replaced Series back rather than calling
    # df['Salutation'].replace(..., inplace=True): in-place methods on a column
    # selection do not reliably update the parent frame (copy-on-write).
    df['Salutation'] = titles.replace(salutation_groups)
    return df

In [None]:
# train['Salutation'] = train.Name.str.extract('([A-Za-z]+)\.') #extracting Name initials

In [None]:
# train['Salutation'].unique()

In [None]:
#analysing some salutation values.
# words = ['Mlle', 'Mme', 'Rev', 'Countess', 'Jonkheer', 'Col']
# for gender, age, name ,salutation in zip(train['Sex'],train['Age'] , train['Name'] ,train['Salutation']):
#     if salutation in words:
#         print(gender, age, name, salutation)

In [None]:
# train['Salutation'].replace(
#     ['Mlle','Mme','Ms','Dr','Major','Lady','Countess','Jonkheer','Col','Rev','Capt','Sir','Don'],
#     ['Miss','Miss','Miss','Mr','Mr','Mrs','Mrs','Other','Other','Other','Mr','Mr','Mr'],
#     inplace=True
# )

In [None]:
train = add_salutation_to_data(train)

In [None]:
train['Age'].groupby(train['Salutation']).mean() #average age value for each salutation

In [None]:
# Fill each missing Age with a fixed, hard-coded value for the passenger's
# salutation group (presumably rounded from the per-salutation means shown above).
age_by_salutation = {'Master': 5, 'Miss': 22, 'Mr': 33, 'Mrs': 36, 'Other': 46}
for title, fill_age in age_by_salutation.items():
    needs_age = train['Age'].isnull() & (train['Salutation'] == title)
    train.loc[needs_age, 'Age'] = fill_age

In [None]:
train.isnull().sum() # re-check after the Age fill: the remaining nulls are expected only in Cabin and Embarked

In [None]:
# Build the modelling frame by dropping the columns that are not used as features.
unused_columns = ['PassengerId', 'Embarked', 'Name', 'Ticket', 'Salutation', 'Cabin']
train_new = train.drop(columns=unused_columns)

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
# The result is assigned back to the column. Calling replace(..., inplace=True)
# on the column selection is chained in-place mutation, which warns in
# pandas 2.2 and leaves the DataFrame unchanged under copy-on-write.
# astype(int) fixes the dtype explicitly and fails loudly on any unexpected label;
# the statement is still safe to re-run on an already-encoded column.
train_new['Sex'] = train_new['Sex'].replace({'male': 0, 'female': 1}).astype(int)

#replace male as 0 and female as 1

In [None]:
train_new

In [None]:
train_new.describe().T

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

### Checking if Ticket fare has any relationship with survival rate

In [None]:
# Fare plotted against the row index, one line per outcome
# (green = survived, red = not survived).
fig, ax = plt.subplots(figsize=(15, 8))
for outcome, colour, label in [(1, 'green', 'Survived'), (0, 'red', 'Not Survived')]:
    fares = train_new[train_new['Survived'] == outcome]['Fare']
    ax.plot(fares, color=colour, label=label)

ax.set_xlabel('Passenger Index')
ax.set_ylabel('Fare')
ax.set_title('Line Plot of Fare by Survival')
ax.legend()
ax.grid(True)
plt.show()

In [None]:
train_new.columns

### Univariate Data Analysis

In [None]:
# Univariate overview on one 2x4 grid: count plots for the discrete columns,
# histograms with a KDE overlay for the continuous ones.
fig, axes = plt.subplots(2, 4, figsize=(16, 10))

count_columns = ['Survived', 'Pclass', 'Sex', 'SibSp', 'Parch']
# axes.flat walks the grid row by row, so these land on [0,0]..[0,3] and [1,0].
for column, ax in zip(count_columns, axes.flat):
    sns.countplot(x=column, data=train_new, ax=ax)

# sns.displot is a figure-level function: it ignores ax= and opens its own
# figure, leaving the grid cell empty. histplot is the axes-level equivalent.
sns.histplot(train_new['Fare'], kde=True, ax=axes[1, 2])
sns.histplot(train_new['Age'], kde=True, ax=axes[1, 3])

axes[1, 1].set_visible(False)  # this grid cell is not used
fig.tight_layout()
plt.show()

### Bivariate Analysis

In [None]:
# Bivariate overview: mean survival rate per category (top row and bottom left),
# and Age / Fare distributions split by outcome (box plots).
figbi, axesbi = plt.subplots(2, 3, figsize=(16, 10))

rate_panels = [
    ('Pclass', axesbi[0, 0]),
    ('SibSp', axesbi[0, 1]),
    ('Parch', axesbi[0, 2]),
    ('Sex', axesbi[1, 0]),
]
for column, ax in rate_panels:
    # Survived is 0/1, so its group mean is the survival rate of the group.
    train_new.groupby(column)['Survived'].mean().plot(kind='barh', ax=ax, xlim=[0, 1])
    ax.set_xlabel('Survival rate')

sns.boxplot(x="Survived", y="Age", data=train_new, ax=axesbi[1, 1])
sns.boxplot(x="Survived", y="Fare", data=train_new, ax=axesbi[1, 2])

figbi.tight_layout()
# plt.show() renders the figure; a bare plt.plot() draws nothing and only
# echoes an empty list as the cell output.
plt.show()

### Analysing the distribution of the fare values and removing outliers


In [None]:
# Histogram of the Fare column (50 bins). Series.hist returns the Axes it drew on.
fare_ax = train_new['Fare'].hist(bins=50)
fare_ax.set_title('Fare')
fare_ax.set_xlabel('Fare')
fare_ax.set_ylabel('No. of Passengers')
# Render with plt.show(); a bare plt.plot() draws nothing and echoes `[]`.
plt.show()

In [None]:
print((train_new.Fare == 0).sum()) #records where fare is zero

In [None]:
train_new.Fare.mean()

In [None]:
# A fare of zero is treated as a missing value. Impute it with the mean of the
# non-zero fares, so the zero placeholders themselves do not drag the
# imputed value down.
zero_fare = train_new['Fare'] == 0
train_new.loc[zero_fare, 'Fare'] = train_new.loc[~zero_fare, 'Fare'].mean()

In [None]:
# Fences for Fare at 1.5 * IQR below the first quartile and above the third.
fare_q1 = train_new['Fare'].quantile(0.25)
fare_q3 = train_new['Fare'].quantile(0.75)
IQR = fare_q3 - fare_q1
lower_bridge = fare_q1 - (IQR * 1.5)
upper_bridge = fare_q3 + (IQR * 1.5)
print(lower_bridge)
print(upper_bridge)

In [None]:
# Cap fares at the upper IQR fence: no rows are dropped, values at or above
# the fence simply become the fence value.
train_new['Fare'] = train_new['Fare'].clip(upper=upper_bridge)

In [None]:
import matplotlib.pyplot as plt

# Histogram of Fare again (50 bins), for comparison with the earlier histogram.
fare_ax = train_new['Fare'].hist(bins=50)
fare_ax.set_title('Fare')
fare_ax.set_xlabel('Fare')
fare_ax.set_ylabel('No. of Passengers')
# Render with plt.show(); a bare plt.plot() draws nothing and echoes `[]`.
plt.show()

### Correlation Matrix

In [None]:
import seaborn as sns

# Heatmap of the pairwise correlations between the columns of train_new
# (DataFrame.corr() with its default method).
f, ax = plt.subplots(figsize=(6, 4))
corr = train_new.corr()
# No mask is passed: an all-False mask hides no cell and is therefore a no-op.
sns.heatmap(
    corr,
    cmap=sns.diverging_palette(220, 10, as_cmap=True),
    square=True,
    ax=ax,
)
ax.set_title('Correlation matrix')
# Render with plt.show(); a bare plt.plot() draws nothing and echoes `[]`.
plt.show()

In [None]:
train_new.describe().T

## One-Hot Encoding for Logistic Regression

In [None]:
train_new.SibSp.unique()

In [None]:
train_new = pd.get_dummies(train_new, columns = ['Pclass', 'SibSp', 'Parch'], drop_first = True, dtype=np.intc)

In [None]:
train_new

In [None]:
train_new.columns

## Splitting the data (Features & Target)

In [None]:
# Separate the label (Survived) from the predictor columns.
y = train_new['Survived']
X = train_new.drop(columns=['Survived'])

## Min-Max Scaling the Data

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
minMaxScaler = MinMaxScaler()

In [None]:
X[['Age', 'Fare']]

In [None]:
X[['Age', 'Fare']] = minMaxScaler.fit_transform(X [['Age', 'Fare']])

* Min-Max scaling only rescales the values into the range [0, 1]; the shape of the distribution is not affected.

In [None]:
import seaborn as sns

In [None]:
sns.displot(X['Fare'])
sns.displot(X['Age'])

## Standard Scaling the data.

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
sc = StandardScaler()

In [None]:
# Standardise Age and Fare. These two columns already hold the min-max-scaled
# values written in the Min-Max section, so `sc` is fitted on scaled data:
# any new data must go through `minMaxScaler` before `sc`.
scaled_columns = ['Age', 'Fare']
X[scaled_columns] = sc.fit_transform(X[scaled_columns])

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify = y)

In [None]:
np.unique(y_train, return_counts=True)

In [None]:
np.unique(y_test, return_counts=True)

# Training

## Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
lr = LogisticRegression()

In [None]:
#trained by applying standard scaling
lr.fit(X_train,y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Defined here, ahead of its first call in the Support Vector Machine section,
# so the notebook runs top to bottom on a fresh kernel.
def evaluate_clf_model(model, X_test, y_test):
    """Evaluate a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        A 3-tuple ``(accuracy, confusion, report)``:
            accuracy - float, fraction of correct predictions.
            confusion - np.ndarray, rows are true classes and columns are
                predicted classes.
            report - str, per-class precision / recall / f1 / support.
    """
    y_pred = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred), in that order.
    return (
        accuracy_score(y_test, y_pred),
        confusion_matrix(y_test, y_pred),
        classification_report(y_test, y_pred),
    )

In [None]:
# Accuracy of the logistic regression on the held-out split, (y_true, y_pred) order.
print(accuracy_score(y_test, lr.predict(X_test)))

In [None]:
# classification_report expects (y_true, y_pred). With the arguments swapped,
# precision and recall trade places and support counts the predictions.
print(classification_report(y_test, lr.predict(X_test)))

In [None]:
# confusion_matrix expects (y_true, y_pred): rows are true classes, columns are
# predicted classes. Swapping the arguments transposes the matrix.
print(confusion_matrix(y_test, lr.predict(X_test)))

## Support Vector Machine

In [None]:
from sklearn.svm import SVC

In [None]:
clf = SVC(kernel = "poly", gamma="auto" ,verbose = True)

In [None]:
clf.fit(X_train, y_train)

In [None]:
score, cn, report = evaluate_clf_model(clf, X_test, y_test)

In [None]:
print(f"Train Score:  {clf.score(X_train, y_train)}")
print(f"Test Score: {score}")

In [None]:
cn

#array([[120,  28],
#        [ 17,  58]])


In [None]:
print(report)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
clf = DecisionTreeClassifier(random_state=0, criterion="entropy", max_depth=25, min_samples_leaf=5, min_samples_split=100)

In [None]:
clf.fit(X_train, y_train)

In [None]:
from sklearn.tree import export_graphviz

# Write the fitted decision tree to tree.dot in Graphviz DOT format.
# When out_file is given, export_graphviz writes the file and returns None,
# so the return value is not bound to a name.
export_graphviz(
    clf,
    out_file="tree.dot",
    feature_names=list(X.columns),  # plain list of str rather than a pandas Index
    class_names=['Not Survived', 'Survived'],
    filled=True,
    rounded=True,
    special_characters=True,
)

In [None]:
!dot -Tpng tree.dot -o tree.png

In [None]:
score, cn, report = evaluate_clf_model(clf, X_test, y_test)

In [None]:
print(f"Train Score:  {clf.score(X_train, y_train)}")
print(f"Test Score: {score}")

In [None]:
print(cn)

In [None]:
print(report)

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=100, random_state=0, max_depth=3, min_samples_leaf=4, min_samples_split=20)

In [None]:
clf.fit(X_train, y_train)

In [None]:
score, cn, report = evaluate_clf_model(clf, X_test, y_test)

In [None]:
print(f"Train Score:  {clf.score(X_train, y_train)}")
print(f"Test Score: {score}")

In [None]:
print(cn)

In [None]:
print(report)

## Model Performance Evaluation

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


def evaluate_clf_model(model, X_test, y_test):
    """Evaluate a fitted classifier on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix to predict on.
        y_test: True labels for ``X_test``.

    Returns:
        A 3-tuple ``(accuracyScore, confusionMatrix, classificationReport)``:
            accuracyScore - float, fraction of correct predictions.
            confusionMatrix - np.ndarray, rows are true classes and columns
                are predicted classes.
            classificationReport - str, per-class precision / recall / f1 /
                support.
    """
    y_pred = model.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round transposes the confusion matrix and swaps precision with recall
    # in the report (accuracy itself is symmetric).
    accuracyScore = accuracy_score(y_test, y_pred)
    confusionMatrix = confusion_matrix(y_test, y_pred)
    classificationReport = classification_report(y_test, y_pred)
    return accuracyScore, confusionMatrix, classificationReport
    

## Preparing the test data

In [None]:
test.isna().sum()

In [None]:
test = add_salutation_to_data(test)

In [None]:
test

In [None]:
# Fold the 'Dona' salutation into 'Miss'; add_salutation_to_data has no mapping for it.
# Assign the result back: replace(..., inplace=True) on a column selection is
# chained in-place mutation and does not update `test` under copy-on-write.
test['Salutation'] = test['Salutation'].replace({'Dona': 'Miss'})

In [None]:
test['Age'].groupby(test['Salutation']).mean()

In [None]:
# Fill each missing Age in the test set with a fixed, hard-coded value for the
# passenger's salutation group.
test_age_by_salutation = {"Master": 7, "Miss": 22, "Mr": 32, "Mrs": 39, "Other": 43}
for title, fill_age in test_age_by_salutation.items():
    needs_age = test['Age'].isnull() & (test['Salutation'] == title)
    test.loc[needs_age, 'Age'] = fill_age

In [None]:
test.isna().sum()

In [None]:
# Clean Fare in the test set: impute, then cap, as was done for the training data.
# Zero and missing fares are both treated as missing and filled in one pass
# with the mean of the valid fares (excluding the zero placeholders).
invalid_fare = test['Fare'].isna() | (test['Fare'] == 0)
test.loc[invalid_fare, 'Fare'] = test.loc[~invalid_fare, 'Fare'].mean()
# Cap at the upper IQR fence computed on the training fares (`upper_bridge`),
# so test fares stay inside the range the scalers and models were fitted on.
test['Fare'] = test['Fare'].clip(upper=upper_bridge)

In [None]:
test.isna().sum()

In [None]:
# Drop the columns that are not used as model features.
unused_test_columns = ['PassengerId', 'Name', 'Ticket', 'Cabin', 'Embarked', 'Salutation']
test = test.drop(columns=unused_test_columns)

In [None]:
# The training features went through minMaxScaler and then sc (StandardScaler),
# and the models were fitted on that output. Apply both fitted scalers to the
# test data in the same order; min-max alone puts Age and Fare on a different
# scale from the one the models saw.
numeric_columns = ['Age', 'Fare']
min_max_scaled = pd.DataFrame(
    minMaxScaler.transform(test[numeric_columns]),
    columns=numeric_columns,  # keep the column names `sc` was fitted with
    index=test.index,
)
test[numeric_columns] = sc.transform(min_max_scaled)

In [None]:
# Encode Sex numerically: male -> 0, female -> 1.
# Assign the result back: replace(..., inplace=True) on a column selection is
# chained in-place mutation and does not update `test` under copy-on-write.
# astype(int) fixes the dtype explicitly and fails loudly on any unexpected label.
test['Sex'] = test['Sex'].replace({'male': 0, 'female': 1}).astype(int)

In [None]:
test

In [None]:
test.describe().T

In [None]:
# The models were fitted on one-hot encoded Pclass / SibSp / Parch, so the test
# frame needs the same encoding before predicting. Reindexing to the training
# feature columns (X.columns) enforces the same columns in the same order:
# dummy levels the model never saw are dropped, and levels missing from the
# test set are added as all-zero columns.
test_encoded = pd.get_dummies(test, columns=['Pclass', 'SibSp', 'Parch'], dtype=np.intc)
test_encoded = test_encoded.reindex(columns=X.columns, fill_value=0)
prediction = clf.predict(test_encoded)

In [None]:
def prepare_submission(prediction, fileName=None):
    """Write the predictions to a CSV file and return the submission frame.

    The sample submission file is read as a template and its ``Survived``
    column is overwritten with ``prediction``.

    Args:
        prediction: Predicted labels, one per row of the sample submission.
        fileName: Output path; when empty or None, "submission.csv" is used.

    Returns:
        The submission DataFrame that was written to disk.
    """
    submission = pd.read_csv("/kaggle/input/titanic/gender_submission.csv")
    submission['Survived'] = prediction
    output = fileName if fileName else "submission.csv"
    submission.to_csv(output, index=False)
    return submission

In [None]:
submission = prepare_submission(prediction, "Random Forest Fine Tuned.csv")