In [134]:
import pandas as pd
import numpy as np

## Dataset Load

In [137]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Dataset Information

In [140]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [142]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Data cleaning

In [145]:
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])


In [147]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [149]:
df['Fare'] = df['Fare'].fillna(df['Fare'].median()) # fare has one missing value so we replaced by median

In [151]:
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
dtype: int64

#### Feature Engineering

In [154]:
# FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
# SibSp = siblings/spouse
# Parch = parents/children
# +1 = passenger

In [156]:
#IsAlone
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)

In [158]:
df[['SibSp', 'Parch', 'FamilySize', 'IsAlone']].head(10)

Unnamed: 0,SibSp,Parch,FamilySize,IsAlone
0,0,0,1,1
1,1,0,2,0
2,0,0,1,1
3,0,0,1,1
4,1,1,3,0
5,0,0,1,1
6,0,0,1,1
7,1,1,3,0
8,0,0,1,1
9,2,0,3,0


In [160]:
df.groupby('IsAlone')['Survived'].mean()

IsAlone
0    0.509091
1    0.268775
Name: Survived, dtype: float64

#### Categorical Data convert to Number Data 

In [163]:
df = pd.get_dummies(df, drop_first=True)

In [165]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,IsAlone,"Name_Abelseth, Miss. Karen Marie",...,Cabin_F,Cabin_F E46,Cabin_F E57,Cabin_F G63,Cabin_F2,Cabin_F33,Cabin_F4,Cabin_G6,Embarked_Q,Embarked_S
0,892,0,3,34.5,0,0,7.8292,1,1,False,...,False,False,False,False,False,False,False,False,True,False
1,893,1,3,47.0,1,0,7.0,2,0,False,...,False,False,False,False,False,False,False,False,False,True
2,894,0,2,62.0,0,0,9.6875,1,1,False,...,False,False,False,False,False,False,False,False,True,False
3,895,0,3,27.0,0,0,8.6625,1,1,False,...,False,False,False,False,False,False,False,False,False,True
4,896,1,3,22.0,1,1,12.2875,3,0,False,...,False,False,False,False,False,False,False,False,False,True


In [167]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'FamilySize', 'IsAlone', 'Name_Abelseth, Miss. Karen Marie',
       ...
       'Cabin_F', 'Cabin_F E46', 'Cabin_F E57', 'Cabin_F G63', 'Cabin_F2',
       'Cabin_F33', 'Cabin_F4', 'Cabin_G6', 'Embarked_Q', 'Embarked_S'],
      dtype='object', length=866)

#### Train-Test Split

In [170]:
y = df['Survived'] #Target

In [172]:
X = df.drop('Survived', axis=1) 
X = X.drop('PassengerId', axis=1) #PassengerId was removed as it does not carry predictive information

In [174]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [176]:
X_train.shape, X_test.shape

((334, 864), (84, 864))

#### Model Training
##### Model Import and create

In [179]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100,random_state=42)

##### Model Train

In [182]:
model.fit(X_train, y_train)

##### Prediction & Evaluation

In [185]:
#Prediction
y_pred = model.predict(X_test)

### Task-2

In [188]:
#Check Accuracy
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

1.0

In [190]:
X.columns


Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize', 'IsAlone',
       'Name_Abelseth, Miss. Karen Marie',
       'Name_Abelseth, Mr. Olaus Jorgensen',
       'Name_Abrahamsson, Mr. Abraham August Johannes',
       ...
       'Cabin_F', 'Cabin_F E46', 'Cabin_F E57', 'Cabin_F G63', 'Cabin_F2',
       'Cabin_F33', 'Cabin_F4', 'Cabin_G6', 'Embarked_Q', 'Embarked_S'],
      dtype='object', length=864)

In [192]:
X_train.equals(X_test)

False

In [194]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(
    model,
    X,
    y,
    cv=5,
    scoring='accuracy'
)

scores, scores.mean()


(array([1., 1., 1., 1., 1.]), 1.0)

In [195]:
from sklearn.ensemble import RandomForestClassifier

model_fixed = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    min_samples_leaf=5,
    random_state=42
)

In [197]:
from sklearn.model_selection import StratifiedKFold, cross_val_score

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_fixed = cross_val_score(
    model_fixed,
    X,
    y,
    cv=cv,
    scoring='accuracy'
)

scores_fixed, scores_fixed.mean()

(array([0.64285714, 0.63095238, 0.63095238, 0.63855422, 0.63855422]),
 0.6363740676993689)

In [199]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

model_lr = LogisticRegression(max_iter=1000)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_lr = cross_val_score(
    model_lr,
    X,
    y,
    cv=cv,
    scoring='accuracy'
)

scores_lr, scores_lr.mean()


(array([1., 1., 1., 1., 1.]), 1.0)

In [200]:
X_reduced = X.drop('Sex_male', axis=1)

In [201]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

model_lr = LogisticRegression(max_iter=1000)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores_reduced = cross_val_score(
    model_lr,
    X_reduced,
    y,
    cv=cv,
    scoring='accuracy'
)

scores_reduced, scores_reduced.mean()

(array([0.61904762, 0.60714286, 0.64285714, 0.62650602, 0.65060241]),
 0.6292312105565119)

### Task-3

In [204]:
# Full features (with Sex_male)
X_full = X.copy()

# Reduced features (without Sex_male)
X_reduced = X.drop('Sex_male', axis=1)

In [205]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

model_baseline = LogisticRegression(max_iter=1000)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

baseline_scores = cross_val_score(
    model_baseline,
    X_reduced,
    y,
    cv=cv,
    scoring='accuracy'
)

print("Baseline scores:", baseline_scores)
print("Baseline mean accuracy:", baseline_scores.mean())


Baseline scores: [0.61904762 0.60714286 0.64285714 0.62650602 0.65060241]
Baseline mean accuracy: 0.6292312105565119


In [206]:
model_improved = LogisticRegression(
    max_iter=1000,
    C=0.5   # regularization
)

improved_scores = cross_val_score(
    model_improved,
    X_full,
    y,
    cv=cv,
    scoring='accuracy'
)

print("Improved scores:", improved_scores)
print("Improved mean accuracy:", improved_scores.mean())

Improved scores: [1. 1. 1. 1. 1.]
Improved mean accuracy: 1.0
