<div style="color:white;
           display:fill;
           border-radius:25px;
           background-color:Red;
           font-size:210%;
           font-family:Verdana;
           letter-spacing:0.5px">
<p style="padding: 10px;
          color:white;
          text-align:center;"
          >
       WELCOME TO MY NOTEBOOK
</p>
</div>

# Dataset- Titanic

![](https://gifdb.com/images/high/sinking-ship-titanic-split-j4un4l6maby064ha.gif)

# Import all the Libraries

In [None]:
# Import all the necessary libraries
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import MinMaxScaler,OneHotEncoder, LabelEncoder, RobustScaler
from sklearn.metrics import accuracy_score,roc_auc_score,precision_score, recall_score, f1_score,ConfusionMatrixDisplay,classification_report
import warnings
warnings.filterwarnings("ignore")

# Read the Train Data 

In [None]:
# Load the training split (it contains the Survived target) from the Kaggle input directory
train_data= pd.read_csv("/kaggle/input/titanic/train.csv")
# Preview the first rows to get a feel for the columns
train_data.head()

# Read the Test Data

In [None]:
# Load the test split (the rows to predict) from the Kaggle input directory
test_data=pd.read_csv("/kaggle/input/titanic/test.csv")
# Preview the first rows to compare the layout with the training data
test_data.head()

In [None]:
# (rows, columns) of the training data
train_data.shape

In [None]:
# (rows, columns) of the test data
test_data.shape

In [None]:
# Per-column dtype and non-null count of the training data
train_data.info()

In [None]:
# Descriptive statistics of the training data
train_data.describe()

In [None]:
# Number of missing values in each column of the training data
train_data.isna().sum()

In [None]:
# Number of missing values in each column of the test data
test_data.isna().sum()

# Check for duplicate rows in the training data

In [None]:
# Count the rows that are exact duplicates of an earlier row
train_data.duplicated().sum()

> Here we can see that there are no duplicate rows in the training data.

In [None]:
# Correlation is only defined for numeric columns. The frame still contains
# text columns at this point, and pandas >= 2.0 raises on them instead of
# silently dropping them, so restrict the frame to numeric dtypes first.
numeric_corr = train_data.select_dtypes(include="number").corr()
sns.heatmap(numeric_corr, annot=True, cmap="Blues", fmt=".2f")

> Here we notice that Parch and Fare are the features that have an impact on the survival rate.

# Exploratory Data Analysis

In [None]:
# Class balance of the target: number of rows per Survived value
train_data["Survived"].value_counts()

In [None]:
# Bar chart of the number of passengers per Survived value
plt.figure(figsize=(5, 4))
sns.countplot(x="Survived", data=train_data, color="red")
plt.show()

In [None]:
# Number of rows per value of the Sex column
train_data["Sex"].value_counts()

In [None]:
# Bar chart of the number of passengers per Sex value
plt.figure(figsize=(5, 4))
sns.countplot(x="Sex", data=train_data, color="blue")
plt.show()

In [None]:
# Keep the test PassengerId column aside: it is dropped from test_data in the
# next cell but is needed again to build the submission file.
PassengerId=test_data["PassengerId"]

In [None]:
# Remove the columns that are not used as model features from both splits
unused_columns = ['Name', 'PassengerId', 'Ticket', 'Cabin']
for frame in (train_data, test_data):
    frame.drop(columns=unused_columns, inplace=True)

In [None]:
# Stacked age histogram split by survival, drawn before the missing ages are filled
plt.figure(figsize=(5, 5))
sns.histplot(
    x='Age',
    hue='Survived',
    data=train_data,
    bins=10,
    multiple='stack',
)
plt.show()

In [None]:
# Median age of the training passengers; the imputation cells below centre
# their random sampling window on this value.
median=train_data["Age"].median()
median

# Let's handle the missing values in the train and test sets

In [None]:
# Missing values per training column, after the column drop and before imputation
train_data.isna().sum()

In [None]:
# Fill the missing training ages with random values drawn uniformly from a
# window of +/- 20 years around the median age.
lower_limit = median - 20
upper_limit = median + 20

# Measure how many ages are missing instead of hard-coding the count: a fixed
# number raises a length-mismatch error as soon as the cell is re-run (nothing
# left to fill) or the data changes.
missing_age_mask = train_data["Age"].isna()
missing_values = int(missing_age_mask.sum())

# A seeded generator makes the imputed ages (and everything downstream)
# reproducible across kernel restarts.
rng = np.random.default_rng(42)
random_numbers = rng.uniform(lower_limit, upper_limit, missing_values)

# Write one random age into each missing slot, then confirm nothing is left
train_data.loc[missing_age_mask, "Age"] = random_numbers
train_data["Age"].isna().sum()

In [None]:
# Same stacked age histogram as before, now including the imputed ages
plt.figure(figsize=(5, 5))
sns.histplot(
    x='Age',
    hue='Survived',
    data=train_data,
    bins=10,
    multiple='stack',
)
plt.show()

In [None]:
# Missing values per test column before imputation
test_data.isna().sum()

In [None]:
# Fill the missing test ages with random values drawn uniformly from a window
# of +/- 15 years around the TRAINING median age.
# NOTE(review): the training split is imputed with a +/- 20 year window;
# confirm that the narrower window used here is intentional.
lower_limit = median - 15
upper_limit = median + 15

# Measure how many ages are missing instead of hard-coding the count: a fixed
# number raises a length-mismatch error as soon as the cell is re-run (nothing
# left to fill) or the data changes.
missing_age_mask = test_data["Age"].isna()
missing_values = int(missing_age_mask.sum())

# A seeded generator makes the imputed ages (and the final predictions)
# reproducible across kernel restarts.
rng = np.random.default_rng(43)
random_numbers = rng.uniform(lower_limit, upper_limit, missing_values)

# Write one random age into each missing slot, then confirm nothing is left
test_data.loc[missing_age_mask, "Age"] = random_numbers
test_data["Age"].isna().sum()

In [None]:
# Fill the missing test Fare with the median fare of the TRAINING data.
# Preprocessing statistics are learned from the training split only and then
# applied to the test split, so nothing is estimated from the rows we predict on.
imputer = SimpleImputer(missing_values=np.nan, strategy="median")
imputer.fit(train_data[["Fare"]])
# transform returns an (n_rows, 1) array; flatten it back to a single column
test_data["Fare"] = imputer.transform(test_data[["Fare"]]).ravel()

In [None]:
# Fill missing Embarked values with the most frequent port of the TRAINING data.
# The same training-derived value is used for both splits so that the test
# split is not imputed from its own distribution.
embarked_mode = train_data['Embarked'].mode()[0]
train_data['Embarked'] = train_data['Embarked'].fillna(embarked_mode)
test_data['Embarked'] = test_data['Embarked'].fillna(embarked_mode)

In [None]:
# Re-count the missing values per training column after imputation
train_data.isna().sum()

In [None]:
# Re-count the missing values per test column after imputation
test_data.isna().sum()

# Let's have a look at the categorical columns after handling the missing values

In [None]:
# Distinct Embarked values remaining after the missing entries were filled
train_data["Embarked"].unique()

In [None]:
# Bar chart of the number of passengers per Embarked value
plt.figure(figsize=(5, 4))
sns.countplot(x="Embarked", data=train_data, color="green")
plt.show()

In [None]:
# Distinct values of the Parch column
train_data["Parch"].unique()

In [None]:
# Bar chart of the number of passengers per Parch value
plt.figure(figsize=(5, 4))
sns.countplot(x="Parch", data=train_data, color="brown")
plt.show()

In [None]:
# Distinct values of the SibSp column
train_data["SibSp"].unique()

In [None]:
# Bar chart of the number of passengers per SibSp value
plt.figure(figsize=(5, 4))
sns.countplot(x="SibSp", data=train_data, color="grey")
plt.show()

In [None]:
# Distinct values of the Pclass column
train_data["Pclass"].unique()

In [None]:
# Bar chart of the number of passengers per Pclass value
plt.figure(figsize=(5, 4))
sns.countplot(x="Pclass", data=train_data, color="black")
plt.show()

In [None]:
# Convert the Age column into ordinal age-group categories.
# Start from a single-column DataFrame built from the training ages.
df= pd.DataFrame(train_data["Age"])

# Define class conversion function
def convert_to_class(age):
    """Map an age to an ordinal age-group label.

    The age is truncated toward zero with ``int`` before it is bucketed:
    0-9 -> 0, 10-19 -> 1, 20-29 -> 2. Every other truncated value
    (30 and above, or negative) maps to 3.
    """
    years = int(age)
    if years < 0 or years >= 30:
        return 3
    # 0..29 splits evenly into three ten-year buckets
    return years // 10

# Replace every raw training age with its age-group label and store it back.
# Running this cell a second time would bucket the labels themselves.
df["Age"] = df["Age"].apply(convert_to_class)
train_data["Age"] = df["Age"]

In [None]:
# Single-column DataFrame holding the test-set ages that are bucketed below
df= pd.DataFrame(test_data["Age"])

# Define class conversion function
def convert_to_class(age):
    """Return the age-group label (0-3) for ``age``.

    ``age`` is truncated toward zero with ``int`` first. Labels 0, 1 and 2
    cover the ten-year ranges starting at 0, 10 and 20; any other truncated
    value (30 or more, or negative) is labelled 3.
    """
    years = int(age)
    if years >= 0:
        # The first upper bound the age stays below decides the label
        for label, upper_bound in enumerate((10, 20, 30)):
            if years < upper_bound:
                return label
    return 3

# Replace every raw test age with its age-group label and store it back.
# Running this cell a second time would bucket the labels themselves.
df["Age"] = df["Age"].apply(convert_to_class)
test_data["Age"] = df["Age"]

In [None]:
# Convert the Fare column into ordinal fare-band categories

# Single-column DataFrame holding the training fares that are bucketed below
df= pd.DataFrame(train_data["Fare"])

# Define class conversion function
def convert_to_class(fare):
    """Map a fare to an ordinal fare-band label.

    The fare is truncated toward zero with ``int`` before it is bucketed into
    bands of width 100: 0-99 -> 0, 100-199 -> 1, ... 400-499 -> 4. Every other
    truncated value (500 and above, or negative) maps to 5.

    NOTE: this redefinition shadows the age-bucketing function of the same
    name that earlier cells define.
    """
    amount = int(fare)
    if 0 <= amount < 500:
        # Five equal bands of width 100
        return amount // 100
    return 5

# Replace every raw training fare with its fare-band label and store it back.
# Running this cell a second time would bucket the labels themselves.
df["Fare"] = df["Fare"].apply(convert_to_class)
train_data["Fare"] = df["Fare"]

In [None]:
# Single-column DataFrame holding the test-set fares that are bucketed below
df= pd.DataFrame(test_data["Fare"])

# Define class conversion function
def convert_to_class(fare):
    """Return the fare-band label (0-5) for ``fare``.

    ``fare`` is truncated toward zero with ``int`` first. Labels 0 to 4 cover
    the bands of width 100 starting at 0, 100, 200, 300 and 400; any other
    truncated value (500 or more, or negative) is labelled 5.

    NOTE: this redefinition shadows the age-bucketing function of the same
    name that earlier cells define.
    """
    amount = int(fare)
    if amount >= 0:
        # Upper bounds 100, 200, 300, 400, 500 -> labels 0..4
        for label, upper_bound in enumerate(range(100, 501, 100)):
            if amount < upper_bound:
                return label
    return 5

# Replace every raw test fare with its fare-band label and store it back.
# Running this cell a second time would bucket the labels themselves.
df["Fare"] = df["Fare"].apply(convert_to_class)
test_data["Fare"] = df["Fare"]

In [None]:
# Family = SibSp + Parch, added to both splits in the same way
for frame in (train_data, test_data):
    frame["Family"] = frame["SibSp"] + frame["Parch"]

In [None]:
# SibSp and Parch are now summarised by Family, so remove them from both splits
for frame in (train_data, test_data):
    frame.drop(columns=["SibSp", "Parch"], inplace=True)

# Divide the dataset into train and test data

In [None]:
# Target vector and feature matrix for training
y_train = train_data["Survived"]
x_train = train_data.drop(columns="Survived")
# x_test is the same object as test_data (no copy is made), so later
# modifications of x_test also show up in test_data.
x_test = test_data

In [None]:
# Getting categorical and numerical columns
def get_num_cat_columns(dataframe):
    """Split the column labels of ``dataframe`` by dtype.

    Returns a tuple ``(categorical_cols, numerical_cols)``: the labels of the
    object-dtype columns first, then the labels of all remaining columns.
    """
    object_part = dataframe.select_dtypes(include="object")
    other_part = dataframe.select_dtypes(exclude="object")
    return object_part.columns, other_part.columns

In [None]:
# Split the training feature columns into object-dtype and non-object columns
categorical_cols, numerical_cols=get_num_cat_columns(x_train)

In [None]:
# Object-dtype feature columns (these are label-encoded further down)
categorical_cols

In [None]:
# Feature columns whose dtype is not object
numerical_cols

# Dataset Preprocessing

In [None]:
# Encode each object-dtype column as integers. The encoder learns its
# label -> integer mapping from the training column and the same mapping is
# then applied to the training and test columns.
le = LabelEncoder()
for col in categorical_cols:
    le.fit(x_train[col])
    x_train[col] = le.transform(x_train[col])
    x_test[col] = le.transform(x_test[col])

In [None]:
# Inspect the encoded training features
x_train

In [None]:
# Inspect the encoded test features
x_test

# Let's do the Modelling
> # Ensemble Learning: Voting Classifier

In [None]:
# Hard-voting ensemble of three random forests that share the same
# hyperparameters. Each forest gets its own fixed random_state: without a
# seed the predictions change from run to run, and distinct seeds keep the
# three voters from being identical copies of one another.
rf_params = dict(max_features=3, min_samples_leaf=3, min_samples_split=10)
rf1 = RandomForestClassifier(random_state=0, **rf_params)
rf2 = RandomForestClassifier(random_state=1, **rf_params)
rf3 = RandomForestClassifier(random_state=2, **rf_params)

voting_clf = VotingClassifier(
    estimators=[
        ('random_forest1', rf1),
        ('random_forest2', rf2),
        ('random_forest3', rf3),
    ],
    voting='hard',  # majority vote over the predicted class labels
    n_jobs=-1,
)
voting_clf.fit(x_train, y_train)
y_pred = voting_clf.predict(x_test)

In [None]:
# Build the submission table: the saved test PassengerId next to the predicted Survived label
titanic_predictions= pd.DataFrame({"PassengerId":PassengerId, "Survived":y_pred})
titanic_predictions

# Save The Predictions

In [None]:
# Write the predictions to Predictions.csv without the DataFrame index column
titanic_predictions.to_csv("Predictions.csv", index = False)