# Introduction to Machine Learning – Titanic Dataset

This notebook introduces basic supervised learning with:
- Preprocessing (missing values, encoding)
- Feature scaling
- Pipeline creation with Scikit-learn
- Model training & evaluation
- Model saving and serving with FastAPI

In [1]:
# 📦 Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import joblib

In [2]:
# 📥 Load Titanic Dataset
url = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(url)
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# 🧹 Select Features and Target
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked']
target = 'Survived'

X = df[features]
y = df[target]

In [4]:
# 🔧 Define Preprocessing Pipeline
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

In [5]:
# 🔁 Full Pipeline with Model
clf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000))
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
clf_pipeline.fit(X_train, y_train)

# Evaluate
y_pred = clf_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.84      0.82       105
           1       0.76      0.72      0.74        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179



## Save the Trained Pipeline

In [6]:
joblib.dump(clf_pipeline, "titanic_pipeline.pkl")

['titanic_pipeline.pkl']

## Exercise 1: Try a Different Classifier
Replace the logistic regression model in the pipeline with another classifier, such as `RandomForestClassifier`, and compare the results.

```python
from sklearn.ensemble import RandomForestClassifier
# Replace the classifier in clf_pipeline
```

*What changes do you observe in precision and recall?*

In [7]:
# Exercise 1: RandomForestClassifier Implementation
from sklearn.ensemble import RandomForestClassifier

# Create a new pipeline with RandomForestClassifier
rf_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the RandomForest model
rf_pipeline.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf_pipeline.predict(X_test)

# Print results for comparison
print("=== LOGISTIC REGRESSION RESULTS ===")
print(classification_report(y_test, y_pred))

print("\n=== RANDOM FOREST RESULTS ===")
print(classification_report(y_test, y_pred_rf))

=== LOGISTIC REGRESSION RESULTS ===
              precision    recall  f1-score   support

           0       0.81      0.84      0.82       105
           1       0.76      0.72      0.74        74

    accuracy                           0.79       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.79      0.79      0.79       179


=== RANDOM FOREST RESULTS ===
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       105
           1       0.73      0.76      0.74        74

    accuracy                           0.78       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.78      0.78      0.78       179



Exercice 1

In [None]:
# Detailed Comparison Analysis
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate metrics for both models
lr_accuracy = accuracy_score(y_test, y_pred)
lr_precision = precision_score(y_test, y_pred)
lr_recall = recall_score(y_test, y_pred)
lr_f1 = f1_score(y_test, y_pred)

rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)

# Create comparison table
comparison_df = pd.DataFrame({
    'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-Score'],
    'Logistic Regression': [lr_accuracy, lr_precision, lr_recall, lr_f1],
    'Random Forest': [rf_accuracy, rf_precision, rf_recall, rf_f1],
    'Difference (RF - LR)': [rf_accuracy - lr_accuracy, rf_precision - lr_precision, 
                            rf_recall - lr_recall, rf_f1 - lr_f1]
})

print("=== COMPARISON TABLE ===")
print(comparison_df.round(4))

print(f"\n=== ANALYSIS ===")
print(f"• Random Forest Accuracy: {rf_accuracy:.4f} vs Logistic Regression: {lr_accuracy:.4f}")
print(f"• Random Forest Precision: {rf_precision:.4f} vs Logistic Regression: {lr_precision:.4f}")
print(f"• Random Forest Recall: {rf_recall:.4f} vs Logistic Regression: {lr_recall:.4f}")

if rf_accuracy > lr_accuracy:
    print(f" Random Forest performs better with {(rf_accuracy - lr_accuracy)*100:.2f}% higher accuracy")
else:
    print(f" Logistic Regression performs better with {(lr_accuracy - rf_accuracy)*100:.2f}% higher accuracy")

=== COMPARISON TABLE ===
      Metric  Logistic Regression  Random Forest  Difference (RF - LR)
0   Accuracy               0.7877         0.7821               -0.0056
1  Precision               0.7571         0.7273               -0.0299
2     Recall               0.7162         0.7568                0.0405
3   F1-Score               0.7361         0.7417                0.0056

=== ANALYSIS ===
• Random Forest Accuracy: 0.7821 vs Logistic Regression: 0.7877
• Random Forest Precision: 0.7273 vs Logistic Regression: 0.7571
• Random Forest Recall: 0.7568 vs Logistic Regression: 0.7162
❌ Logistic Regression performs better with 0.56% higher accuracy


## Exercise 2: Use Cross-Validation
Apply cross-validation on the pipeline instead of a single train/test split.

```python
from sklearn.model_selection import cross_val_score
```

*Is the model stable across folds?*

In [9]:
# Exercise 2: Cross-Validation Implementation
from sklearn.model_selection import cross_val_score, cross_validate
import numpy as np

# Use the entire dataset (X, y) instead of train/test split for cross-validation
print("=== CROSS-VALIDATION ANALYSIS ===")

# Test both Logistic Regression and Random Forest with cross-validation
models = {
    'Logistic Regression': clf_pipeline,
    'Random Forest': rf_pipeline
}

cv_results = {}

for model_name, pipeline in models.items():
    print(f"\n--- {model_name} ---")
    
    # Perform 5-fold cross-validation with multiple metrics
    cv_scores = cross_validate(
        pipeline, X, y, 
        cv=5, 
        scoring=['accuracy', 'precision', 'recall', 'f1'],
        return_train_score=True
    )
    
    # Store results
    cv_results[model_name] = cv_scores
    
    # Print detailed results
    print(f"Accuracy:  {cv_scores['test_accuracy'].mean():.4f} (±{cv_scores['test_accuracy'].std()*2:.4f})")
    print(f"Precision: {cv_scores['test_precision'].mean():.4f} (±{cv_scores['test_precision'].std()*2:.4f})")
    print(f"Recall:    {cv_scores['test_recall'].mean():.4f} (±{cv_scores['test_recall'].std()*2:.4f})")
    print(f"F1-Score:  {cv_scores['test_f1'].mean():.4f} (±{cv_scores['test_f1'].std()*2:.4f})")
    
    # Individual fold scores
    print(f"Individual Accuracy Scores: {[f'{score:.3f}' for score in cv_scores['test_accuracy']]}")

=== CROSS-VALIDATION ANALYSIS ===

--- Logistic Regression ---
Accuracy:  0.7890 (±0.0296)
Precision: 0.7399 (±0.0426)
Recall:    0.6958 (±0.0933)
F1-Score:  0.7161 (±0.0547)
Individual Accuracy Scores: ['0.782', '0.809', '0.781', '0.770', '0.803']

--- Random Forest ---
Accuracy:  0.8081 (±0.0429)
Precision: 0.7591 (±0.0525)
Recall:    0.7338 (±0.1133)
F1-Score:  0.7450 (±0.0680)
Individual Accuracy Scores: ['0.782', '0.803', '0.843', '0.792', '0.820']
Accuracy:  0.8081 (±0.0429)
Precision: 0.7591 (±0.0525)
Recall:    0.7338 (±0.1133)
F1-Score:  0.7450 (±0.0680)
Individual Accuracy Scores: ['0.782', '0.803', '0.843', '0.792', '0.820']


In [None]:
# Model Stability Analysis
print("\n=== MODEL STABILITY ANALYSIS ===")

# Calculate coefficient of variation (CV) for stability assessment
for model_name, scores in cv_results.items():
    print(f"\n--- {model_name} Stability ---")
    
    accuracy_cv = scores['test_accuracy'].std() / scores['test_accuracy'].mean()
    precision_cv = scores['test_precision'].std() / scores['test_precision'].mean()
    recall_cv = scores['test_recall'].std() / scores['test_recall'].mean()
    f1_cv = scores['test_f1'].std() / scores['test_f1'].mean()
    
    print(f"Coefficient of Variation (lower = more stable):")
    print(f"  • Accuracy:  {accuracy_cv:.4f}")
    print(f"  • Precision: {precision_cv:.4f}")
    print(f"  • Recall:    {recall_cv:.4f}")
    print(f"  • F1-Score:  {f1_cv:.4f}")
    
    # Stability assessment
    avg_cv = np.mean([accuracy_cv, precision_cv, recall_cv, f1_cv])
    if avg_cv < 0.05:
        stability = "Very Stable "
    elif avg_cv < 0.10:
        stability = "Stable "
    elif avg_cv < 0.15:
        stability = "Moderately Stable "
    else:
        stability = "Unstable "
    
    print(f"  Overall Stability: {stability} (Avg CV: {avg_cv:.4f})")

# Compare cross-validation vs single split results
print(f"\n=== COMPARISON: Cross-Validation vs Single Split ===")
print(f"Logistic Regression:")
print(f"  • Single Split Accuracy: {lr_accuracy:.4f}")
print(f"  • Cross-Val Accuracy:   {cv_results['Logistic Regression']['test_accuracy'].mean():.4f}")

print(f"Random Forest:")
print(f"  • Single Split Accuracy: {rf_accuracy:.4f}")
print(f"  • Cross-Val Accuracy:   {cv_results['Random Forest']['test_accuracy'].mean():.4f}")

print(f"\n💡 Conclusion: Cross-validation provides a more robust estimate of model performance!")


=== MODEL STABILITY ANALYSIS ===

--- Logistic Regression Stability ---
Coefficient of Variation (lower = more stable):
  • Accuracy:  0.0187
  • Precision: 0.0288
  • Recall:    0.0671
  • F1-Score:  0.0382
  Overall Stability: Very Stable ✅ (Avg CV: 0.0382)

--- Random Forest Stability ---
Coefficient of Variation (lower = more stable):
  • Accuracy:  0.0265
  • Precision: 0.0345
  • Recall:    0.0772
  • F1-Score:  0.0456
  Overall Stability: Very Stable ✅ (Avg CV: 0.0460)

=== COMPARISON: Cross-Validation vs Single Split ===
Logistic Regression:
  • Single Split Accuracy: 0.7877
  • Cross-Val Accuracy:   0.7890
Random Forest:
  • Single Split Accuracy: 0.7821
  • Cross-Val Accuracy:   0.8081

💡 Conclusion: Cross-validation provides a more robust estimate of model performance!


## Exercise 3: Add Feature Engineering
Add a new column to the Titanic data, such as `FamilySize = SibSp + Parch`, and evaluate if this feature improves the model.

```python
df['FamilySize'] = df['SibSp'] + df['Parch']
# Then include it in the feature list and re-run the pipeline
```

*Does the new feature improve the prediction metrics?*

In [11]:
# 1. Ajout de la nouvelle feature FamilySize
df['FamilySize'] = df['SibSp'] + df['Parch']

# 2. Mise à jour des features
features = ['Pclass', 'Sex', 'Age', 'Fare', 'Embarked', 'FamilySize']
target = 'Survived'

X = df[features]
y = df[target]

# 3. Mise à jour du pipeline de prétraitement
numeric_features = ['Age', 'Fare', 'FamilySize']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_features = ['Pclass', 'Sex', 'Embarked']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)
])

# 4. Pipeline complet avec RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# 5. Évaluation avec validation croisée
from sklearn.model_selection import cross_validate

print("=== EVALUATION AVEC LA FEATURE 'FamilySize' ===")
cv_scores = cross_validate(
    pipeline, X, y,
    cv=5,
    scoring=['accuracy', 'precision', 'recall', 'f1']
)

# 6. Résultats
print(f"Accuracy :  {cv_scores['test_accuracy'].mean():.4f}")
print(f"Precision:  {cv_scores['test_precision'].mean():.4f}")
print(f"Recall   :  {cv_scores['test_recall'].mean():.4f}")
print(f"F1-Score :  {cv_scores['test_f1'].mean():.4f}")


=== EVALUATION AVEC LA FEATURE 'FamilySize' ===
Accuracy :  0.8014
Precision:  0.7488
Recall   :  0.7337
F1-Score :  0.7383


## Exercise 4 (Bonus): Create a Streamlit Interface
Build a simple Streamlit UI to load the trained model and predict survival based on user input.

```python
# Example streamlit interface
import streamlit as st
import joblib
import pandas as pd

model = joblib.load("titanic_pipeline.pkl")
Pclass = st.selectbox("Pclass", [1, 2, 3])
Sex = st.selectbox("Sex", ["male", "female"])
Age = st.slider("Age", 0, 100, 25)
Fare = st.slider("Fare", 0.0, 500.0, 32.0)
Embarked = st.selectbox("Embarked", ["S", "C", "Q"])

if st.button("Predict"):
    X_new = pd.DataFrame([[Pclass, Sex, Age, Fare, Embarked]],
                         columns=["Pclass", "Sex", "Age", "Fare", "Embarked"])
    pred = model.predict(X_new)
    st.write("Prediction:", "Survived" if pred[0] == 1 else "Did not survive")
```

👉 *Try running your Streamlit app locally.*

## L'interface streamlit est ajouté dans le fichier exercice_1-titanic_streamlit_app.py et testé et ça marche bien

In [None]:
import streamlit as st
import pandas as pd
import joblib

#  Chargement du modèle
model = joblib.load("titanic_pipeline.pkl")

# Titre
st.title(" Titanic Survival Predictor")
st.markdown("Prédisez vos chances de survie à bord du Titanic.")

# Collecte des entrées utilisateur
st.header(" Entrez vos informations :")

col1, col2 = st.columns(2)

with col1:
    Pclass = st.selectbox("Classe (Pclass)", [1, 2, 3])
    Sex = st.selectbox("Sexe", ["male", "female"])
    Embarked = st.selectbox("Port d'embarquement", ["S", "C", "Q"])

with col2:
    Age = st.slider("Âge", 0, 100, 25)
    Fare = st.slider("Prix du billet (Fare)", 0.0, 600.0, 32.0)
    SibSp = st.number_input("Nombre de frères/soeurs ou conjoint(s) à bord (SibSp)", 0, 10, 0)
    Parch = st.number_input("Nombre de parents/enfants à bord (Parch)", 0, 10, 0)

# Calcul de la feature ingénierée
FamilySize = SibSp + Parch

# Affichage résumé
st.subheader(" Résumé des données saisies :")
st.write({
    "Pclass": Pclass,
    "Sex": Sex,
    "Age": Age,
    "Fare": Fare,
    "Embarked": Embarked,
    "FamilySize": FamilySize
})

#  Prédiction
if st.button(" Prédire la survie"):
    input_df = pd.DataFrame([{
        "Pclass": Pclass,
        "Sex": Sex,
        "Age": Age,
        "Fare": Fare,
        "Embarked": Embarked,
        "FamilySize": FamilySize
    }])
    
    pred = model.predict(input_df)[0]
    proba = model.predict_proba(input_df)[0][pred]

    if pred == 1:
        st.success(f" Vous auriez survécu ! (probabilité : {proba:.2%})")
    else:
        st.error(f" Vous n'auriez pas survécu. (probabilité : {proba:.2%})")

# Footer
st.markdown("---")
st.markdown(" *Modèle entraîné sur les données publiques du Titanic avec sklearn.*")
