In [35]:
#importing necessary modules
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

In [36]:
#importing csv file.
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Data Preprocessing

In [37]:
#checking for NULL values in train dataset
train_data.isnull().sum()
#We found that Age, Cabin and Embarked columns have NULL values.

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [38]:
#checking for NULL values in test dataset
test_data.isnull().sum()
#We found that Age and Cabin have NULL values.

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [39]:
# Data Preprocessing
def preprocess_data(data):
    # Filling NaN values in Age with its mean
    data["Age"] = data["Age"].fillna(data["Age"].mean())
    
    # Filling NaN values in Cabin with its mode value
    data["Cabin"] = data["Cabin"].fillna(data["Cabin"].mode()[0])
    
    # Filling NaN values in Embarked with its mode value
    data["Embarked"] = data["Embarked"].fillna(data["Embarked"].mode()[0])
    
    # #coverting datatype of Pclass to str because we need to dummify all variables.
    data["Pclass"] = data["Pclass"].astype(str)
    
    # Getting Dummies from all categorical variables
    
    # select object type columns
    cat_cols = data.select_dtypes(include=['object']).columns
    
    # dummify selected columns
    data = pd.get_dummies(data, columns=cat_cols, prefix=cat_cols, prefix_sep='_', drop_first=True)
    
    return data

In [40]:
# Preprocess train and test data
train_data = preprocess_data(train_data)
test_data = preprocess_data(test_data)

In [41]:
#checking for NULL values in train dataset after data cleaning
train_data.isnull().sum()
#No NaN values reamaining

PassengerId    0
Survived       0
Age            0
SibSp          0
Parch          0
              ..
Cabin_F4       0
Cabin_G6       0
Cabin_T        0
Embarked_Q     0
Embarked_S     0
Length: 1727, dtype: int64

In [42]:
#checking for NULL values in test dataset after data cleaning
test_data.isnull().sum()
#No NaN values reamaining

PassengerId    0
Age            0
SibSp          0
Parch          0
Fare           1
              ..
Cabin_F33      0
Cabin_F4       0
Cabin_G6       0
Embarked_Q     0
Embarked_S     0
Length: 864, dtype: int64

In [43]:
#Printing Train Data after dummifying  categorical variables.
train_data.head()

Unnamed: 0,PassengerId,Survived,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,"Name_Abbott, Mr. Rossmore Edward","Name_Abbott, Mrs. Stanton (Rosa Hunt)",...,Cabin_F G63,Cabin_F G73,Cabin_F2,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_T,Embarked_Q,Embarked_S
0,1,0,22.0,1,0,7.25,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2,1,38.0,1,0,71.2833,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,26.0,0,0,7.925,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
3,4,1,35.0,1,0,53.1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,0,35.0,0,0,8.05,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [44]:
#Printing Test Data after dummifying  categorical variables.
test_data.head()

Unnamed: 0,PassengerId,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,"Name_Abelseth, Miss. Karen Marie","Name_Abelseth, Mr. Olaus Jorgensen","Name_Abrahamsson, Mr. Abraham August Johannes",...,Cabin_F,Cabin_F E46,Cabin_F E57,Cabin_F G63,Cabin_F2,Cabin_F33,Cabin_F4,Cabin_G6,Embarked_Q,Embarked_S
0,892,34.5,0,0,7.8292,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1,893,47.0,1,0,7.0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,894,62.0,0,0,9.6875,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,895,27.0,0,0,8.6625,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,896,22.0,1,1,12.2875,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [45]:
#Train Data Shape After dummifying.
print(train_data.shape)

#Data Preporocessing Done Here.

(891, 1727)


In [46]:
#Test Data Shape After dummifying.
print(test_data.shape)

#Data Preporocessing Done Here.

(418, 864)


In [47]:
# Extracting labels
train_labels = train_data.pop("Survived")

In [48]:
# Splitting the data into train and validation sets
x_train, x_val, y_train, y_val = train_test_split(train_data, train_labels, test_size=0.2, random_state=42)

In [49]:
train_labels

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

# Model Training

In [50]:
#creating object of random forest classifier.
rf_model = RandomForestClassifier()

#Passing training data to random forest classifier
rf_model.fit(x_train, y_train)

RandomForestClassifier()

# Model Evaluation

In [51]:
# Model Evaluation on validation set
y_val_proba = rf_model.predict_proba(x_val)[:, 1]
roc_auc_val = roc_auc_score(y_val, y_val_proba)
print("ROC-AUC Score on Validation Set:", roc_auc_val)

ROC-AUC Score on Validation Set: 0.8882882882882881
