In [217]:
# importing libraries
import pandas as pd
from tabulate import tabulate # type: ignore

#Labelling categorical data using LabelEncoder class (optional)
from sklearn.preprocessing import LabelEncoder
#for scaling x to a standard range of values
from sklearn.preprocessing import StandardScaler
# for splitting train-test dataset
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
#for logistic regression and Linear regression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
#for performance metrices
from sklearn.metrics import accuracy_score as accuracy, precision_score as precision, recall_score as recall, f1_score as f1, confusion_matrix, r2_score, root_mean_squared_error, mean_squared_error
#for knn
from sklearn.neighbors import KNeighborsClassifier
#for decision tree
from sklearn.tree import DecisionTreeClassifier
#for random forest
from sklearn.ensemble import RandomForestClassifier

In [218]:
# load the dataset
df = pd.read_csv("Titanic-Dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [219]:
# drop irrelevant columns
df = df.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin']) 
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [220]:
# handling missing values
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.2500,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.9250,S
3,1,1,female,35.0,1,0,53.1000,S
4,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S
887,1,1,female,19.0,0,0,30.0000,S
888,0,3,female,28.0,1,2,23.4500,S
889,1,1,male,26.0,0,0,30.0000,C


In [221]:


#hot encoding for non ordinal categorical variables (sex or embarked(P, Q, S))
df = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)
df


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,7.2500,True,False,True
1,1,1,38.0,1,0,71.2833,False,False,False
2,1,3,26.0,0,0,7.9250,False,False,True
3,1,1,35.0,1,0,53.1000,False,False,True
4,0,3,35.0,0,0,8.0500,True,False,True
...,...,...,...,...,...,...,...,...,...
886,0,2,27.0,0,0,13.0000,True,False,True
887,1,1,19.0,0,0,30.0000,False,False,True
888,0,3,28.0,1,2,23.4500,False,False,True
889,1,1,26.0,0,0,30.0000,True,False,False


In [222]:
# scaling the numeric columns (age, fare, sibsp, Parch)
# standardization 
stdScalar = StandardScaler()
df[['Age', 'Fare', 'Parch', 'SibSp']] = stdScalar.fit_transform(df[['Age', 'Fare', 'Parch', 'SibSp']])
df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,-0.565736,0.432793,-0.473674,-0.502445,True,False,True
1,1,1,0.663861,0.432793,-0.473674,0.786845,False,False,False
2,1,3,-0.258337,-0.474545,-0.473674,-0.488854,False,False,True
3,1,1,0.433312,0.432793,-0.473674,0.420730,False,False,True
4,0,3,0.433312,-0.474545,-0.473674,-0.486337,True,False,True
...,...,...,...,...,...,...,...,...,...
886,0,2,-0.181487,-0.474545,-0.473674,-0.386671,True,False,True
887,1,1,-0.796286,-0.474545,-0.473674,-0.044381,False,False,True
888,0,3,-0.104637,0.432793,2.008933,-0.176263,False,False,True
889,1,1,-0.258337,-0.474545,-0.473674,-0.044381,True,False,False


In [223]:
# now Target(y) = Survived or not? 
# remove Target(y) from features(x)
# Drop rows with missing target values (if any)
x = df.drop('Survived', axis=1)
y = df['Survived']


df

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,Sex_male,Embarked_Q,Embarked_S
0,0,3,-0.565736,0.432793,-0.473674,-0.502445,True,False,True
1,1,1,0.663861,0.432793,-0.473674,0.786845,False,False,False
2,1,3,-0.258337,-0.474545,-0.473674,-0.488854,False,False,True
3,1,1,0.433312,0.432793,-0.473674,0.420730,False,False,True
4,0,3,0.433312,-0.474545,-0.473674,-0.486337,True,False,True
...,...,...,...,...,...,...,...,...,...
886,0,2,-0.181487,-0.474545,-0.473674,-0.386671,True,False,True
887,1,1,-0.796286,-0.474545,-0.473674,-0.044381,False,False,True
888,0,3,-0.104637,0.432793,2.008933,-0.176263,False,False,True
889,1,1,-0.258337,-0.474545,-0.473674,-0.044381,True,False,False


In [224]:
# split the data into 80% for training and 20% for test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [225]:
# model evaluation
def evaluate(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print(tabulate(confusion_matrix(y_test, y_pred), tablefmt='pretty'))
    print("Precision: ", precision(y_test, y_pred))
    print("Recall: ", recall(y_test, y_pred))
    print("F1-Score: ", f1(y_test, y_pred))


In [231]:
logReg = LogisticRegression(max_iter=1000)
knnModel = KNeighborsClassifier()
dtModel = DecisionTreeClassifier()
linReg = LinearRegression()
rfModel = RandomForestClassifier()

logReg.fit(x_train, y_train)
linReg.fit(x_train, y_train)
knnModel.fit(x_train, y_train)
dtModel.fit(x_train, y_train)
rfModel.fit(x_train, y_train)

print("Linear Regression Evaluation: ")
y_pred = linReg.predict(x_test)
print("MSE: ", mean_squared_error(y_test, y_pred))
print("R2-Score: ", r2_score(y_test, y_pred))
print("RMSE: ", root_mean_squared_error(y_test, y_pred))

print("\nLogistic Reg Evaluation: ")
evaluate(logReg, x_test, y_test)

print("\nKN Neighbours Evaluation: ")
evaluate(knnModel, x_test, y_test)

print("\nDecision Tree Evaluation: ")
evaluate(dtModel, x_test, y_test)

print("\nRandom Forest Evaluation: ")
evaluate(rfModel, x_test, y_test)

Linear Regression Evaluation: 
MSE:  0.15085466122417443
R2-Score:  0.3631707246002934
RMSE:  0.38840013030916254

Logistic Reg Evaluation: 
+----+----+
| 98 | 12 |
| 23 | 46 |
+----+----+
Precision:  0.7931034482758621
Recall:  0.6666666666666666
F1-Score:  0.7244094488188977

KN Neighbours Evaluation: 
+----+----+
| 98 | 12 |
| 23 | 46 |
+----+----+
Precision:  0.7931034482758621
Recall:  0.6666666666666666
F1-Score:  0.7244094488188977

Decision Tree Evaluation: 
+----+----+
| 97 | 13 |
| 18 | 51 |
+----+----+
Precision:  0.796875
Recall:  0.7391304347826086
F1-Score:  0.7669172932330827

Random Forest Evaluation: 
+----+----+
| 99 | 11 |
| 20 | 49 |
+----+----+
Precision:  0.8166666666666667
Recall:  0.7101449275362319
F1-Score:  0.7596899224806202


In [None]:
# hyperparameter tuning in random forest
param_grid = {
    'n_estimators' : [50, 100, 200],    # no of trees in forest
    'max_depth' : [3, 5, 10, None],     # max depth of each tree
    'min_samples_split' : [2, 5, 10],   # min samples required to split any internal node
    'min_samples_leaf' : [1, 2, 4]      # min samples required at each leaf nodes
}

rfModel1 = RandomForestClassifier()
gridSearch = GridSearchCV(rfModel1, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
gridSearch.fit(x_train, y_train)

print("Best Parameters:", gridSearch.best_params_)
print("Best Accuracy:", gridSearch.best_score_)

# grid search tuning for knn

knn_param = {
    'n_neighbors': [3, 5, 7, 9],  # number of neighbors to use
    'weights': ['uniform', 'distance'],  # weighting strategy
    'metric': ['euclidean', 'manhattan'],  # distance metric
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']  # nearest neighbor algorithm
}

logReg_param = {
    'C': [0.1, 1, 10],  # regularization strength
    'solver': ['liblinear', 'newton-cg'],  # solver choice idk check scikit.learn docs
    'max_iter': [100, 200, 300]  # maximum number of iterations
}

# same technique as random forest but just altered parameters (define model, apply tuning using GridSearchCV and respective parameters, now fit and find the best score)



8496.34s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
8496.50s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
8496.69s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
8496.87s - pydevd: Sending message related to process being replaced timed-out after 5 seconds
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD

Best Parameters: {'max_depth': 5, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best Accuracy: 0.8371811287304244
