In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
import logging
import mlflow
import mlflow.sklearn
import dagshub

# Configure the root logger at INFO level; each record is rendered as
# "<timestamp> - <LEVEL> - <message>".
log_format = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=log_format, level=logging.INFO)

# Module-scoped logger used by every step of the script below.
logger = logging.getLogger(__name__)


class CFG:
    """Static settings for the script: CSV locations, target column, drop list."""

    # Absolute, machine-specific directory that holds the CSV files.
    _data_dir = "C:/Users/ARKO BERA/OneDrive/Desktop/MLOPS/Titanic_Survival_Prediction/data"

    # Input files, all located in the same data directory.
    train_path = _data_dir + "/train.csv"
    test_path = _data_dir + "/test.csv"
    sub_path = _data_dir + "/gender_submission.csv"

    # Name of the label column.
    target = "Survived"

    # Column names listed for removal (not referenced in the visible script).
    drop_col = ["Name", "Ticket", "PassengerId"]

# Single configuration object through which the file paths are read.
cfg = CFG()

logger.info("Reading training and test data...")
# Train CSV is read first, then the test CSV.
data, test_data = (
    pd.read_csv(csv_path) for csv_path in (cfg.train_path, cfg.test_path)
)

logger.info("Combining datasets for feature engineering...")
# Stack train on top of test with a fresh 0..n-1 index so the same feature
# engineering is applied to both.
frames = [data, test_data]
combined = pd.concat(frames, axis=0, ignore_index=True)
# Rows with no 'Survived' value are flagged as test rows
# (presumably the test CSV has no 'Survived' column -- confirm).
combined['IsTest'] = combined['Survived'].isna()

logger.info("Extracting titles from names...")
# Capture the word that sits between a space and a period in 'Name'.
# The pattern is a raw string so that "\." reaches the regex engine as an
# escaped literal dot; in a normal string literal "\." is an invalid escape
# sequence and Python emits a warning for it.
combined['Title'] = combined['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Collapse infrequent titles into 'Rare' and fold spelling variants into
# 'Miss' / 'Mrs'. No replacement value is itself a key, so one mapping is
# equivalent to applying the replacements one after another.
rare_titles = [
    'Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr',
    'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
]
title_aliases = {
    **dict.fromkeys(rare_titles, 'Rare'),
    'Mlle': 'Miss',
    'Ms': 'Miss',
    'Mme': 'Mrs',
}
combined['Title'] = combined['Title'].replace(title_aliases)

logger.info("Creating family size and travel-alone features...")
# Siblings/spouses + parents/children + the passenger themself.
combined['FamilySize'] = combined['SibSp'].add(combined['Parch']).add(1)
# 1 when the passenger is the only member of their family on board, else 0.
combined['IsAlone'] = combined['FamilySize'].eq(1).astype('int64')

logger.info("Extracting cabin class...")
# Missing cabins become the placeholder 'X'; the filled column is written
# back so 'Cabin' itself no longer contains NaN.
cabin_filled = combined['Cabin'].fillna('X')
combined['Cabin'] = cabin_filled
# First character of the cabin string ('X' for the placeholder).
combined['CabinClass'] = cabin_filled.str[0]

logger.info("Filling missing Fare values and engineering FarePerPerson...")
# Within each Pclass group, replace missing fares by that group's median fare.
fare_by_class = combined.groupby('Pclass')['Fare']
combined['Fare'] = fare_by_class.transform(
    lambda fares: fares.fillna(fares.median())
)
# Fare divided by the number of family members on board.
combined['FarePerPerson'] = combined['Fare'].div(combined['FamilySize'])

logger.info("Filling missing Age values with grouped medians...")
# Fallback value: median of the ages observed before any imputation.
overall_age_median = combined['Age'].median()

# Per-row median age of the row's (Title, Pclass, Sex) group, aligned to
# combined's index. The entry is NaN when every age in the group is missing.
group_age_median = (
    combined.groupby(['Title', 'Pclass', 'Sex'])['Age'].transform('median')
)

# Fill from the group median first, then fall back to the overall median.
# The second fillna matters: without it, a group with no observed ages would
# write NaN back and leave Age missing for the later binning and model fit.
combined['Age'] = (
    combined['Age']
    .fillna(group_age_median)
    .fillna(overall_age_median)
)

logger.info("Creating AgeGroup bins...")
# Five consecutive age bands; pd.cut's default intervals are closed on the
# right, i.e. (0, 12], (12, 18], (18, 30], (30, 50], (50, 100].
age_edges = [0, 12, 18, 30, 50, 100]
age_labels = ['Child', 'Teen', 'YoungAdult', 'Adult', 'Senior']
combined['AgeGroup'] = pd.cut(combined['Age'], bins=age_edges, labels=age_labels)

logger.info("Filling missing Embarked values...")
# Use the most frequent embarkation value (first entry of mode()) as the filler.
most_common_port = combined['Embarked'].mode()[0]
combined['Embarked'] = combined['Embarked'].fillna(most_common_port)

logger.info("One-hot encoding categorical features...")
# Each listed column is replaced by indicator columns named "<column>_<value>".
categorical_features = [
    'Title',
    'CabinClass',
    'AgeGroup',
    'Sex',
    'Embarked',
]
combined = pd.get_dummies(data=combined, columns=categorical_features)

logger.info("Selecting final features...")
# Ordered list of model input columns: numeric features first, then the
# indicator columns grouped by the categorical feature they came from.
selected_features = [
    # Raw and engineered numeric columns.
    'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'FarePerPerson',
    'FamilySize', 'IsAlone',
    # Indicator columns, generated as "<prefix>_<value>".
    *(f'Title_{title}' for title in ('Master', 'Miss', 'Mr', 'Mrs', 'Rare')),
    *(f'CabinClass_{deck}' for deck in 'ABCDEFGTX'),
    *(
        f'AgeGroup_{band}'
        for band in ('Child', 'Teen', 'YoungAdult', 'Adult', 'Senior')
    ),
    'Sex_female', 'Sex_male',
    *(f'Embarked_{port}' for port in 'CQS'),
]

logger.info("Splitting combined data into train and test sets...")
# Undo the earlier concatenation using the boolean 'IsTest' flag; copies are
# taken so the two frames are independent of `combined`.
is_test_row = combined['IsTest']
train_data = combined.loc[~is_test_row].copy()
test_data = combined.loc[is_test_row].copy()

logger.info("Preparing feature matrix and target variable...")
# Training features and integer labels.
X = train_data[selected_features]
y = train_data['Survived'].astype(int)
# Same feature columns for the rows to be predicted.
X_test = test_data[selected_features]

logger.info("Creating logistic regression pipeline with standard scaling...")
# Two-step pipeline: standardise the features, then fit a logistic regression.
# make_pipeline names each step after its lower-cased class name, which is
# what the "logisticregression__" prefixes in the parameter grid refer to.
scaler = StandardScaler()
classifier = LogisticRegression(max_iter=1000, random_state=42)
model = make_pipeline(scaler, classifier)


# Model tuning - a deliberately small parameter grid to keep training fast.
# Keys address the 'logisticregression' step of the pipeline.
param_grid = dict(
    logisticregression__C=[1, 10],
    # Simplified to use only the l2 penalty.
    logisticregression__penalty=['l2'],
    # Simplified to use only the liblinear solver.
    logisticregression__solver=['liblinear'],
    logisticregression__class_weight=[None, 'balanced'],
)


# Select (or create) the MLflow experiment that the run below is recorded in.
mlflow.set_experiment("Logistic Regression Grid Search")

with mlflow.start_run(run_name="logreg_pipeline_run"):
    logger.info("Starting logistic regression training with preprocessing")

    # 5-fold cross-validated grid search over param_grid, scored by accuracy
    # and using all available cores.
    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X, y)

    # Results for the best parameter combination.
    best_model, best_score, best_params = (
        grid_search.best_estimator_,
        grid_search.best_score_,
        grid_search.best_params_,
    )

    logger.info(f"Best Parameters: {best_params}")
    logger.info(f"Best Cross-Validation Score: {best_score:.4f}")

    # Record the winning hyper-parameters and the CV score on the run.
    mlflow.log_params(best_params)
    mlflow.log_metric("cv_accuracy", best_score)

    # Store the fitted pipeline with the run.
    mlflow.sklearn.log_model(best_model, "logreg_model")

    # Predict the held-out rows and write a PassengerId/Survived CSV, which is
    # also attached to the run as an artifact.
    predictions = best_model.predict(X_test).astype(int)
    submission = test_data[["PassengerId"]].assign(Survived=predictions)
    submission_path = "submission_logreg.csv"
    submission.to_csv(submission_path, index=False)
    mlflow.log_artifact(submission_path)

    logger.info("Pipeline completed and submission saved.")


  combined['Title'] = combined['Name'].str.extract(' ([A-Za-z]+)\.', expand=False)
2025-06-19 19:32:28,107 - INFO - Reading training and test data...
2025-06-19 19:32:28,112 - INFO - Combining datasets for feature engineering...
2025-06-19 19:32:28,117 - INFO - Extracting titles from names...
2025-06-19 19:32:28,120 - INFO - Creating family size and travel-alone features...
2025-06-19 19:32:28,122 - INFO - Extracting cabin class...
2025-06-19 19:32:28,125 - INFO - Filling missing Fare values and engineering FarePerPerson...
2025-06-19 19:32:28,130 - INFO - Filling missing Age values with grouped medians...
2025-06-19 19:32:28,372 - INFO - Creating AgeGroup bins...
2025-06-19 19:32:28,373 - INFO - Filling missing Embarked values...
2025-06-19 19:32:28,375 - INFO - One-hot encoding categorical features...
2025-06-19 19:32:28,378 - INFO - Selecting final features...
2025-06-19 19:32:28,379 - INFO - Splitting combined data into train and test sets...
2025-06-19 19:32:28,380 - INFO - Prepar

2025-06-19 19:32:29,215 - INFO - Accessing as arkobera
2025-06-19 19:32:29,844 - INFO - HTTP Request: GET https://dagshub.com/api/v1/repos/arkobera/Titanic_Survival_Prediction "HTTP/1.1 200 OK"
2025-06-19 19:32:30,419 - INFO - HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"


2025-06-19 19:32:30,426 - INFO - Initialized MLflow to track repo "arkobera/Titanic_Survival_Prediction"


2025-06-19 19:32:30,427 - INFO - Repository arkobera/Titanic_Survival_Prediction initialized!
2025/06/19 19:32:31 INFO mlflow.tracking.fluent: Experiment with name 'Logistic Regression Grid Search' does not exist. Creating a new experiment.
2025-06-19 19:32:32,037 - INFO - Starting logistic regression training with preprocessing
2025-06-19 19:32:34,402 - INFO - Best Parameters: {'logisticregression__C': 1, 'logisticregression__class_weight': None, 'logisticregression__penalty': 'l2', 'logisticregression__solver': 'liblinear'}
2025-06-19 19:32:34,404 - INFO - Best Cross-Validation Score: 0.8216
2025-06-19 19:32:44,341 - INFO - Pipeline completed and submission saved.


🏃 View run logreg_pipeline_run at: https://dagshub.com/arkobera/Titanic_Survival_Prediction.mlflow/#/experiments/2/runs/d21c64a5795546ce83f0fbb5ee2258a6
🧪 View experiment at: https://dagshub.com/arkobera/Titanic_Survival_Prediction.mlflow/#/experiments/2
