In [82]:
# 1. IMPORTS
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [83]:
# 2. LOAD DATA
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)
print(f"Dataset loaded: {df.shape[0]} rows, {df.shape[1]} columns")

Dataset loaded: 891 rows, 12 columns


In [84]:
# 3. DATA EXPLORATION
print("\nMissing Values")
print(df.isnull().sum())

print("\nSurvival Rate")
print(df['Survived'].value_counts(normalize=True))


Missing Values
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

Survival Rate
Survived
0    0.616162
1    0.383838
Name: proportion, dtype: float64


In [85]:
# 4. DATA CLEANING

# Fill missing Age with median
df['Age'].fillna(df['Age'].median(), inplace=True)

# Drop Cabin (77% missing)
df = df.drop(['Cabin'], axis=1)

# Drop rows with missing Embarked
df = df.dropna(subset=['Embarked'])

print(f"\nAfter cleaning: {df.shape[0]} rows")



After cleaning: 889 rows


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)


In [86]:
# 5. FEATURE ENGINEERING

# Select features
X = df[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = df['Survived']

# Encode categorical variables
X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})
X['Embarked'] = X['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Sex'] = X['Sex'].map({'male': 0, 'female': 1})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Embarked'] = X['Embarked'].map({'C': 0, 'Q': 1, 'S': 2})


In [87]:
# 6. TRAIN/TEST SPLIT
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [88]:
# 7. MODEL TRAINING & EVALUATION

models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

print("\nMODEL COMPARISON")
for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)

    # Evaluate
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n{name}:")
    print(f"  Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"  Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


MODEL COMPARISON

Logistic Regression:
  Accuracy: 0.7753 (77.53%)
  Confusion Matrix:
[[84 25]
 [15 54]]

Decision Tree:
  Accuracy: 0.7416 (74.16%)
  Confusion Matrix:
[[80 29]
 [17 52]]

Random Forest:
  Accuracy: 0.7809 (78.09%)
  Confusion Matrix:
[[88 21]
 [18 51]]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [89]:
# 8. BEST MODEL ANALYSIS

best_model = RandomForestClassifier(random_state=42)
best_model.fit(X_train, y_train)
y_pred_best = best_model.predict(X_test)

print("\nBEST MODEL: RANDOM FOREST")
print(classification_report(y_test, y_pred_best,
                          target_names=['Died', 'Survived']))

# Feature importance
feature_imp = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=False)

print("\nFEATURE IMPORTANCE")
print(feature_imp)


BEST MODEL: RANDOM FOREST
              precision    recall  f1-score   support

        Died       0.83      0.81      0.82       109
    Survived       0.71      0.74      0.72        69

    accuracy                           0.78       178
   macro avg       0.77      0.77      0.77       178
weighted avg       0.78      0.78      0.78       178


FEATURE IMPORTANCE
    Feature  Importance
5      Fare    0.270131
1       Sex    0.262782
2       Age    0.244866
0    Pclass    0.090056
3     SibSp    0.052715
4     Parch    0.040872
6  Embarked    0.038578
