# 1. IMPORTS & SETUP

In [4]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 2. LOAD DATA

In [5]:
data = pd.read_csv('../datasets/train.csv')

# 3. EXPLORATORY DATA ANALYSIS (EDA)

In [6]:
print(data.head())
print(data.info())
print(data.describe())
print("Missing values per column:\n", data.isnull().sum())

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
<c

# 4. PREPROCESSING

In [8]:
titanic = data.drop(['Cabin', 'Ticket', 'Name'], axis=1)  # Drop high-missing and less useful columns
titanic = titanic.dropna()  #drop rows with missing values

# Separate features and target
X = titanic.drop('Survived', axis=1)
y = titanic['Survived']

# Identify types
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_features)
    ])

# 5. TRAIN-TEST SPLIT

In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 6. MODEL PIPELINE AND TRAINING

In [10]:
model = Pipeline([
    ('prep', preprocessor),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])

model.fit(X_train, y_train)

# 7. PREDICTION AND EVALUATION

In [11]:
y_pred = model.predict(X_test)

print("Accuracy on test set:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy on test set: 0.7902097902097902

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.82      0.82        85
           1       0.74      0.74      0.74        58

    accuracy                           0.79       143
   macro avg       0.78      0.78      0.78       143
weighted avg       0.79      0.79      0.79       143


Confusion Matrix:
 [[70 15]
 [15 43]]


# 8. FEATURE IMPORTANCE (OPTIONAL)

In [12]:
# Logistic Regression coefficients for interpretation
coefs = model.named_steps['clf'].coef_[0]
feature_names = (
    numeric_features + 
    list(model.named_steps['prep'].named_transformers_['cat'].get_feature_names_out(categorical_features))
)
coef_df = pd.DataFrame({'feature': feature_names, 'coef': coefs})
print("\nTop features by absolute coefficient:\n", 
      coef_df.reindex(coef_df.coef.abs().sort_values(ascending=False).index).head(10))


Top features by absolute coefficient:
        feature      coef
6     Sex_male -2.483945
1       Pclass -1.019355
2          Age -0.599936
8   Embarked_S -0.478041
7   Embarked_Q -0.361805
3        SibSp -0.229601
0  PassengerId  0.126794
5         Fare -0.038044
4        Parch -0.035144
