In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [3]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import numpy as np
import dagshub

# 1. Set up MLflow experiment
dagshub.init(repo_owner='ashar-22', repo_name='hw02ml', mlflow=True)
experiment_name = 'logreg_experiment'
mlflow.set_experiment(experiment_name)

train_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
train_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')
# test_identity = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_identity.csv')
# test_transaction = pd.read_csv('/kaggle/input/ieee-fraud-detection/test_transaction.csv')

train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
# test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')

features = [col for col in train.columns if col not in ["TransactionID", "isFraud"]]
X = train[features]
y = train["isFraud"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Data Cleaning

In [7]:
with mlflow.start_run(run_name="LogReg_Cleaning"):
    threshold = 0.75
    
    missing_ratio = X_train.isnull().mean()
    cols_to_drop = missing_ratio[missing_ratio > threshold].index.tolist()
    X_train.drop(columns=cols_to_drop, inplace=True)
    X_valid.drop(columns=cols_to_drop, inplace=True)
    
    mlflow.log_metric("initial_missing_ratio", missing_ratio.mean())
    mlflow.log_param("drop_threshold", threshold)
    mlflow.log_param("num_cols_dropped", len(cols_to_drop))

🏃 View run LogReg_Cleaning at: https://dagshub.com/ashar-22/hw02ml.mlflow/#/experiments/0/runs/414d7d037287495382d182a301e4e686
🧪 View experiment at: https://dagshub.com/ashar-22/hw02ml.mlflow/#/experiments/0


# Feature Engineering

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FunctionTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

numerical_cols = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()

with mlflow.start_run(run_name="LogReg_Feature_Engineering"):
    def add_transaction_amt_log(X):
        X = X.copy()
        if "TransactionAmt" in X.columns:
            X["TransactionAmt_log"] = np.log1p(X["TransactionAmt"])
        return X

    log_transformer = FunctionTransformer(add_transaction_amt_log, validate=False)

    numerical_cols = [col for col in X_train.columns if X_train[col].dtype in ['int64', 'float64']]
    categorical_cols = [col for col in X_train.columns if X_train[col].dtype == 'object']
    
    numerical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),  # Impute missing values with the median for numerical features
        ("scaler", StandardScaler())  # Apply scaling
    ])
    
    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),  # Impute missing values with the most frequent category
        ("encoder", OneHotEncoder(handle_unknown="ignore", sparse_output=False))  # Apply one-hot encoding
    ])


    if "TransactionAmt" in numerical_cols:
        mlflow.log_param("new_features", "TransactionAmt_log")

🏃 View run LogReg_Feature_Engineering at: https://dagshub.com/ashar-22/hw02ml.mlflow/#/experiments/0/runs/0cf6aa7359d54a22b161320983049cc6
🧪 View experiment at: https://dagshub.com/ashar-22/hw02ml.mlflow/#/experiments/0


# Feature Selection

In [9]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
import numpy as np
import mlflow

with mlflow.start_run(run_name="LogReg_Feature_Selection"):
    variance_threshold = VarianceThreshold(threshold=0.01)  # Adjust threshold as needed
    X_train_variance_filtered = variance_threshold.fit_transform(X_train[numerical_cols].fillna(0))

    remaining_numerical_cols = np.array(numerical_cols)[variance_threshold.get_support()]

    k_best = 90
    selector = SelectKBest(score_func=f_classif, k=k_best)
    selector.fit(X_train_variance_filtered, y_train)

    selected_features = remaining_numerical_cols[selector.get_support()].tolist()

    if "TransactionAmt" in selected_features:
        selected_features.append("TransactionAmt_log")

    mlflow.log_param("num_selected_features", len(selected_features))
    mlflow.log_param("selected_features", ", ".join(selected_features))


🏃 View run LogReg_Feature_Selection at: https://dagshub.com/ashar-22/hw02ml.mlflow/#/experiments/0/runs/9a2e486473f444d1bfd0a16caa553d66
🧪 View experiment at: https://dagshub.com/ashar-22/hw02ml.mlflow/#/experiments/0


# Training

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
import mlflow

preprocessor = ColumnTransformer([
    ("num", numerical_transformer, numerical_cols),
    ("cat", categorical_transformer, categorical_cols)
])

lr = LogisticRegression(random_state=42, max_iter=1000)

pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("classifier", lr)
])

with mlflow.start_run(run_name="logReg_Training"):
    pipeline.fit(X_train, y_train)

    y_train_pred = pipeline.predict_proba(X_train)[:, 1]
    y_valid_pred = pipeline.predict_proba(X_valid)[:, 1]

    train_auc = roc_auc_score(y_train, y_train_pred)
    valid_auc = roc_auc_score(y_valid, y_valid_pred)

    mlflow.log_metrics({"train_auc": train_auc, "valid_auc": valid_auc})

    mlflow.sklearn.log_model(pipeline, "fraud_pipeline")

    print(f"Train AUC: {train_auc:.4f}")
    print(f"Validation AUC: {valid_auc:.4f}")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train AUC: 0.8452
Validation AUC: 0.8409
🏃 View run logReg_Training at: https://dagshub.com/ashar-22/hw02ml.mlflow/#/experiments/0/runs/c8df10b9d6f54d87a305d55ad6066f5d
🧪 View experiment at: https://dagshub.com/ashar-22/hw02ml.mlflow/#/experiments/0
