In [30]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

In [31]:
# Read the training CSV into a DataFrame and preview its first five rows
df = pd.read_csv("train.csv")
df.head(n=5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [32]:
# Preprocessing
# 1. Replace null values in Age and Fare with the median of the respective column.
#    NOTE(review): the medians are computed on the full dataset, i.e. before the
#    train/test split below, so test rows influence the imputed values. Consider
#    fitting the imputation on the training rows only.
# 2. Encode Sex as 0 (male) / 1 (female).
df['Age'] = df['Age'].fillna(df['Age'].median())
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# Only encode while the column still holds the raw labels. Mapping an already
# encoded 0/1 column through this dict would turn every value into NaN, so
# without the guard this cell could not safely be run twice.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})


In [33]:
# Build the feature matrix from six predictor columns and take Survived as the target
X = df.loc[:, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']]
y = df.loc[:, 'Survived']

In [34]:
# Sanity check: report the frame/feature/target shapes and the per-column null counts in X
print(
    f"Shape after loading: {df.shape}",
    f"Shape of X: {X.shape}",
    f"Shape of y: {y.shape}",
    "Any nulls left in X?",
    X.isna().sum(),
    sep="\n",
)

Shape after loading: (891, 12)
Shape of X: (891, 6)
Shape of y: (891,)
Any nulls left in X?
Pclass    0
Sex       0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64


In [35]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [36]:
# Fit a logistic-regression classifier on the training rows
# (fit() returns the estimator itself, so it can be chained onto the constructor)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [37]:
# Evaluate: predict on the held-out rows, then report overall accuracy and per-class metrics
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.3f}")
# Emit the blank line on its own. Passing "\n" and the report as two arguments to
# print() puts the default " " separator in front of the report, which pushes its
# header row one column out of line with the rows below it.
print()
print(classification_report(y_test, y_pred))

Accuracy: 0.810

               precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.72      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



In [38]:
# Experiment 1: fit and score on the complete dataset (nothing is held out),
# so the reported number is a pure training accuracy.
X_train_exp1 = X_test_exp1 = X
y_train_exp1 = y_test_exp1 = y

model_exp1 = LogisticRegression(max_iter=1000).fit(X_train_exp1, y_train_exp1)
y_pred_exp1 = model_exp1.predict(X_test_exp1)

print(
    "EXPERIMENT 1 - Train on ALL, Test on ALL",
    f"Accuracy: {accuracy_score(y_test_exp1, y_pred_exp1):.3f}\n",
    sep="\n",
)

EXPERIMENT 1 - Train on ALL, Test on ALL
Accuracy: 0.796



In [39]:
"""  
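My held-out test set somehow got a better accuracy (0.810) than training and testing on the full dataset (0.796).
I most likely stumbled on an easier-than-average set of rows for the first model to test with.
This experiment scores one fitted model on both its training rows and its test rows to show the real difference between the two.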
"""


# Refit on the training split, then score the same model on both splits
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predictions for the rows the model has seen (train) and has not seen (test)
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

# A positive difference means the model does better on data it was trained on
print(
    f"Training accuracy: {train_acc:.3f}",
    f"Test accuracy: {test_acc:.3f}",
    f"Difference: {train_acc - test_acc:.3f}",
    sep="\n",
)

Training accuracy: 0.802
Test accuracy: 0.810
Difference: -0.008


In [40]:
# Trying the same experiment, but with the Random Forest model
# (depth capped at 10, fixed seed so the forest is reproducible)
model_rf = RandomForestClassifier(random_state=42, max_depth=10)
model_rf.fit(X_train, y_train)

# Training accuracy
y_train_pred_rf = model_rf.predict(X_train)
print(f"RF Training accuracy: {accuracy_score(y_train, y_train_pred_rf):.3f}")

# Test accuracy
y_test_pred_rf = model_rf.predict(X_test)
print(f"RF Test accuracy: {accuracy_score(y_test, y_test_pred_rf):.3f}")

# A training score far above the test score is the signature of overfitting.

RF Training accuracy: 0.931
RF Test accuracy: 0.804


This model is **overfitting**: training accuracy (0.931) is far above test accuracy (0.804). The random forest is memorizing the training data and does not generalize as well to rows it has not seen.

In [41]:
# Feature Engineering
# "FamilySize" = SibSp + Parch, i.e. the number of relatives travelling with the passenger
# (per the Titanic data dictionary: SibSp = siblings/spouses aboard, Parch = parents/children aboard).
df['FamilySize'] = df['SibSp'].add(df['Parch'])

In [42]:
# Rebuild the feature matrix with FamilySize appended to the original six columns;
# the target is unchanged.
X = df.loc[:, ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'FamilySize']]
y = df.loc[:, 'Survived']

In [43]:
# Re-split with the updated feature matrix (same 80/20 ratio and seed as before,
# so the same rows land in the test set)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [44]:
# Fit a fresh logistic-regression classifier on the new training split
# (fit() returns the estimator itself, so it can be chained onto the constructor)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [45]:
# Score the retrained model on the held-out rows: overall accuracy, then per-class metrics
y_pred = model.predict(X_test)
print(
    f"Accuracy with FamilySize: {accuracy_score(y_test, y_pred):.3f}",
    classification_report(y_test, y_pred),
    sep="\n",
)

Accuracy with FamilySize: 0.810
              precision    recall  f1-score   support

           0       0.81      0.88      0.84       105
           1       0.80      0.72      0.76        74

    accuracy                           0.81       179
   macro avg       0.81      0.80      0.80       179
weighted avg       0.81      0.81      0.81       179



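This didn't help at all: accuracy is still 0.810 and the classification report is unchanged. That makes sense, because `FamilySize` is exactly `SibSp + Parch` and both of those columns are already features, so a linear model gains no new information from their sum. Trying a different feature.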

In [46]:
# Binary "IsAlone" flag: 1 when SibSp + Parch equals zero, otherwise 0.
# Rationale: a passenger with no family aboard to look after may have acted differently.
df['IsAlone'] = df['SibSp'].add(df['Parch']).eq(0).astype(int)

In [47]:
# New feature matrix: IsAlone stands in for SibSp, Parch and FamilySize, which are all left out
X = df.loc[:, ['Pclass', 'Sex', 'Age', 'Fare', 'IsAlone']]

In [48]:
# Re-split with the updated feature matrix (same 80/20 ratio and seed as before)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [49]:
# Fit a fresh logistic-regression classifier on the new training split
# (fit() returns the estimator itself, so it can be chained onto the constructor)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [50]:
# Score the retrained model on the held-out rows: overall accuracy, then per-class metrics
y_pred = model.predict(X_test)
print(
    f"Accuracy with IsAlone: {accuracy_score(y_test, y_pred):.3f}",
    classification_report(y_test, y_pred),
    sep="\n",
)

Accuracy with IsAlone: 0.799
              precision    recall  f1-score   support

           0       0.81      0.86      0.83       105
           1       0.78      0.72      0.75        74

    accuracy                           0.80       179
   macro avg       0.80      0.79      0.79       179
weighted avg       0.80      0.80      0.80       179



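This actually made accuracy slightly worse (0.799 vs. 0.810). Note that this feature set also drops `SibSp` and `Parch`, so `IsAlone` is not adding information: it replaces two counts with a single yes/no flag.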

In [67]:
# Title from name (captures social status better than just Pclass)
# The title is the run of letters between a space and a period in Name,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
# The pattern is a raw string: in a normal string literal '\.' is an invalid
# escape sequence, which recent Python versions warn about.
df['Title'] = df['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Group rare titles into a single "Rare" bucket
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don',
               'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona']
df['Title'] = df['Title'].replace(rare_titles, 'Rare')

# Fold alternative spellings into the common titles
df['Title'] = df['Title'].replace({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

# Convert to numbers; anything not in the mapping (including names with no title match) becomes 0
# NOTE(review): these codes are arbitrary labels, but a linear model reads them as an
# ordered numeric scale (Mr < Miss < Mrs < ...). One-hot encoding may be a fairer test
# of whether Title helps.
title_mapping = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
df['Title'] = df['Title'].map(title_mapping).fillna(0)

# Add to features
X = df[['Pclass', 'Sex', 'Age', 'Fare', 'Title', 'IsAlone']]

In [68]:
# Re-split with the updated feature matrix (same 80/20 ratio and seed as before)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a fresh logistic-regression classifier on the new training split
# (fit() returns the estimator itself, so it can be chained onto the constructor)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model

In [69]:
# Score the retrained model on the held-out rows: overall accuracy, then per-class metrics
y_pred = model.predict(X_test)
print(
    f"Accuracy with IsAlone and Title: {accuracy_score(y_test, y_pred):.3f}",
    classification_report(y_test, y_pred),
    sep="\n",
)

Accuracy with IsAlone and Title: 0.782
              precision    recall  f1-score   support

           0       0.81      0.82      0.82       105
           1       0.74      0.73      0.73        74

    accuracy                           0.78       179
   macro avg       0.78      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179



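Adding `Title` lowered test accuracy further (0.782), so the original six-feature logistic regression (0.810) is still the best of these models. Sometimes, simpler actually is better...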