In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
# Display configuration: show every column and never wrap wide frames onto extra lines.
pd.set_option(
    'display.max_columns', None,
    'display.width', None,
    'display.expand_frame_repr', False,
)

In [3]:
# Load the training transaction table; every later split (train/val/test) is carved out of df.
df = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

# DATA SPLIT

In [4]:
from sklearn.model_selection import train_test_split

# Split by card rather than by row, so a given card1 value lands in exactly one partition.
unique_cards = df['card1'].unique()

# 60% of the cards go to train; the remaining 40% are halved into validation and test.
card_train, card_val_test = train_test_split(unique_cards, random_state=42, test_size=0.4)
card_val, card_test = train_test_split(card_val_test, random_state=42, test_size=0.5)

# Route every transaction to the partition that owns its card.
train, val, test = (
    df[df['card1'].isin(card_group)]
    for card_group in (card_train, card_val, card_test)
)

In [5]:
# Fraud rate of each partition, printed on one line in the order train, val, test.
print(*(part['isFraud'].mean() for part in (train, val, test)))

# Count of card1 values that train shares with each held-out partition (the split is by card).
train_cards = set(train['card1'])
for held_out in (val, test):
    print(len(train_cards.intersection(held_out['card1'])))

0.03513108235162511 0.03890206572198909 0.03122020767664969
0
0


In [6]:
# Separate identifiers, labels and features for each partition.
#
# The selections below are deliberately non-mutating (no DataFrame.pop): train/val/test keep
# all of their columns, so this cell - and the sanity-check cell that reads train['isFraud'] -
# can be re-run in any order without a KeyError. The cost is one extra copy of the feature
# columns per partition.
_NON_FEATURE_COLS = ['TransactionID', 'isFraud']

train_ids = train['TransactionID']
y_train = train['isFraud']
x_train = train.drop(columns=_NON_FEATURE_COLS)

val_ids = val['TransactionID']
y_val = val['isFraud']
x_val = val.drop(columns=_NON_FEATURE_COLS)

test_ids = test['TransactionID']
y_test = test['isFraud']
x_test = test.drop(columns=_NON_FEATURE_COLS)


# DATA CLEAN


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin
class SmartMissingHandler(BaseEstimator, TransformerMixin):
    """Drop mostly-empty columns and impute the rest, flagging fraud-informative gaps.

    Everything that drives ``transform`` is learned in ``fit`` from the training
    frame, so validation/test frames (or a single row at prediction time) are
    imputed with the same values the model was trained on.

    Parameters
    ----------
    nan_threshold : float, default=0.8
        Columns whose NaN rate is >= this value are dropped.
    fraud_ratio_threshold : float, default=2.5
        A column is "high risk" when the mean of ``y`` over rows where the column
        is missing is at least this many times the mean over rows where it is present.
    verbose : bool, default=True
        When true, progress messages are printed.

    Attributes
    ----------
    cols_to_drop_ : list
        Columns removed because of their NaN rate.
    high_risk_cols_ : list or None
        High-risk columns ordered by descending fraud ratio; ``None`` when ``fit``
        was called without ``y``.
    mean_fraud_rate_ : float or None
        Mean of ``y`` seen during ``fit``; ``None`` when ``fit`` was called without ``y``.
    fill_values_ : dict or None
        Per-column fill value (training mode, or a default when the column had no
        observed values) for every kept column that is not high risk.
    """

    def __init__(self, nan_threshold=0.8, fraud_ratio_threshold=2.5, verbose=True):
        self.nan_threshold = nan_threshold
        self.fraud_ratio_threshold = fraud_ratio_threshold
        self.verbose = verbose
        self.high_risk_cols_ = None
        self.cols_to_drop_ = None
        self.mean_fraud_rate_ = None
        self.fill_values_ = None

    def _print(self, message):
        """Print ``message`` only when ``verbose`` is enabled."""
        if self.verbose:
            print(message)

    @staticmethod
    def _mode_or_default(series):
        """Return the first mode of ``series``, or 0 / 'MISSING' when it has no observed values."""
        modes = series.mode()
        if not modes.empty:
            return modes.iloc[0]
        return 0 if pd.api.types.is_numeric_dtype(series) else 'MISSING'

    def fit(self, X, y=None):
        """Learn which columns to drop, which are high risk, and the fill values.

        Parameters
        ----------
        X : pandas.DataFrame
            Training features.
        y : array-like, optional
            Binary target aligned with the rows of ``X`` by position. Without it
            no high-risk columns are identified.

        Returns
        -------
        self
        """
        self._print("\n=== Starting Fit ===")

        nan_rates = X.isnull().mean()
        self.cols_to_drop_ = nan_rates[nan_rates >= self.nan_threshold].index.tolist()
        # drop() returns a new frame, so the caller's X is never modified.
        X = X.drop(columns=self.cols_to_drop_)

        # Reset so a refit without y does not keep results from an earlier fit.
        self.high_risk_cols_ = None
        self.mean_fraud_rate_ = None

        if y is not None:
            # Align the target to X by position so the boolean masks built from X
            # select the right labels whatever index (or array type) y came with.
            y = pd.Series(np.asarray(y), index=X.index)
            self.mean_fraud_rate_ = y.mean()

            ratios = []
            for col in X.columns:
                is_null = X[col].isnull()
                if not is_null.any():
                    continue
                fraud_missing = y[is_null].mean()
                fraud_not_missing = y[~is_null].mean()
                # The floor avoids dividing by zero when no fraud is seen among present values.
                ratio = fraud_missing / max(fraud_not_missing, 1e-6)
                if ratio >= self.fraud_ratio_threshold:
                    ratios.append((col, ratio))

            # Stable sort: highest ratio first, ties keep column order.
            self.high_risk_cols_ = [col for col, _ in
                                    sorted(ratios, key=lambda item: -item[1])]

        # Learn the imputation value of every remaining column from the training data.
        high_risk = set(self.high_risk_cols_ or [])
        self.fill_values_ = {
            col: self._mode_or_default(X[col])
            for col in X.columns
            if col not in high_risk
        }

        return self

    def transform(self, X):
        """Drop, flag and impute columns using what ``fit`` learned.

        High-risk columns get a ``<col>_MISSING`` 0/1 indicator and are filled with
        -999 (numeric) or 'MISSING_CAT' (otherwise). Every other column containing
        NaN is filled with its training-time fill value; a column that was not seen
        during ``fit`` falls back to the mode of the frame being transformed.

        Parameters
        ----------
        X : pandas.DataFrame

        Returns
        -------
        pandas.DataFrame
            A new frame; the input is not modified.
        """
        # drop() already returns a new frame, so no explicit copy is needed.
        X = X.drop(columns=self.cols_to_drop_, errors='ignore')

        high_risk = [col for col in (self.high_risk_cols_ or []) if col in X.columns]
        high_risk_set = set(self.high_risk_cols_ or [])
        learned_fills = getattr(self, 'fill_values_', None) or {}

        # Collect all new/replaced columns first and attach them in one go to
        # avoid DataFrame fragmentation.
        flag_data = {}
        impute_data = {}

        for col in high_risk:
            flag_data[f'{col}_MISSING'] = X[col].isnull().astype(int)
            if pd.api.types.is_numeric_dtype(X[col]):
                impute_data[col] = X[col].fillna(-999)
            else:
                impute_data[col] = X[col].fillna('MISSING_CAT')

        for col in X.columns:
            if col in high_risk_set or not X[col].isnull().any():
                continue
            if col in learned_fills:
                fill_value = learned_fills[col]
            else:
                fill_value = self._mode_or_default(X[col])
            impute_data[col] = X[col].fillna(fill_value)

        X = X.assign(**impute_data)
        if flag_data:
            X = pd.concat([X, pd.DataFrame(flag_data, index=X.index)], axis=1)

        return X

# FEATURE SELECTION

In [8]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
import numpy as np

class CorrelationRemover(BaseEstimator, TransformerMixin):
    """Drop one feature out of every highly correlated pair of numeric features.

    ``fit`` computes the absolute correlation matrix of the numeric columns and
    marks a column for removal when its correlation with any *earlier* column
    (upper triangle, diagonal excluded) is >= ``threshold``.

    Parameters
    ----------
    threshold : float, default=0.9
        Absolute correlation at or above which the later column of a pair is dropped.
    method : str, default='pearson'
        Passed to ``DataFrame.corr``.
    verbose : bool, default=True
        When true, the number of removed features is printed after fitting.

    Attributes
    ----------
    cols_to_drop_ : list
        Column labels selected for removal by the last ``fit``.
    """

    def __init__(self, threshold=0.9, method='pearson', verbose=True):
        self.threshold = threshold
        self.method = method
        self.verbose = verbose
        self.cols_to_drop_ = []

    def fit(self, X, y=None):
        """Select the columns to drop.

        Parameters
        ----------
        X : array-like or pandas.DataFrame
            Input features; wrapped in a DataFrame (arrays get integer column labels).
        y : ignored

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If the numeric data contains NaN or +/-inf, or if the correlation
            computation fails.
        """
        # Nothing below mutates X, so no defensive copy is required.
        X = pd.DataFrame(X)

        num_cols = X.select_dtypes(include=np.number).columns
        # Treat infinities as missing so they are caught by the NA check below.
        X_num = X[num_cols].replace([np.inf, -np.inf], np.nan)

        if X_num.isna().any().any():
            raise ValueError("Found NA values in supposedly clean numeric data")

        try:
            corr_matrix = X_num.corr(method=self.method).abs()
        except Exception as e:
            print("Correlation failed. Checking problematic columns...")
            problematic = []
            for col in num_cols:
                try:
                    X_num[col].astype(float)
                except (TypeError, ValueError):
                    problematic.append(col)
            # Chain the original error so the real cause stays in the traceback.
            raise ValueError(f"Non-numeric values found in columns: {problematic}") from e

        # Keep only the strict upper triangle so each pair is considered once and
        # the earlier column of a correlated pair survives.
        upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
        upper = corr_matrix.where(upper_mask)
        self.cols_to_drop_ = upper.columns[(upper >= self.threshold).any()].tolist()

        if self.verbose:
            print(f"Removing {len(self.cols_to_drop_)} features with {self.method} corr ≥ {self.threshold}")
        return self

    def transform(self, X):
        """Return ``X`` as a DataFrame without the columns selected in ``fit``.

        Labels that are not present in ``X`` are ignored.
        """
        return pd.DataFrame(X).drop(columns=self.cols_to_drop_, errors='ignore')

In [None]:
# Pin scikit-learn and imbalanced-learn to a pair of specific versions.
# NOTE(review): sklearn has already been imported by earlier cells, so a version installed
# here presumably only takes effect after a kernel restart - confirm before relying on the pin.
!pip install -U scikit-learn==1.3.2 imbalanced-learn==0.11.0

# TRAINING

In [None]:
import pandas as pd
from sklearn.model_selection import PredefinedSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
from category_encoders import TargetEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, RocCurveDisplay, classification_report
import matplotlib.pyplot as plt


# Column groups by dtype. Both stay plain lists of column names: later cells pass
# categorical_cols to a ColumnTransformer and log it, which needs names and not a DataFrame.
categorical_cols = [col for col in x_train.columns if x_train[col].dtype == 'object']
numerical_cols = [col for col in x_train.columns if x_train[col].dtype != 'object']

# Target-encode only the categoricals that are not mostly missing. The NaN rate is measured
# on x_train (not on the full df) so validation/test rows do not influence the choice.
filtered_cats = [col for col in categorical_cols
                 if x_train[col].isnull().mean() < 0.8]

encoder = TargetEncoder(
    cols=filtered_cats,
    smoothing=10.0
)

# NOTE(review): the handler drops columns at a NaN rate >= 0.85 while filtered_cats uses
# < 0.8, so a categorical column with a rate in [0.8, 0.85) would be kept but not encoded -
# confirm whether the two thresholds are meant to differ.
missing_handler = SmartMissingHandler(
    nan_threshold=0.85,
    fraud_ratio_threshold=2
)

param_grid = {
    'clf__C': [0.01, 0.1]
}

# Single predefined fold: rows marked -1 are only ever used for training, rows marked 0
# form the one validation fold. The order matches the concat order (x_train, then x_val).
X_full = pd.concat([x_train, x_val], axis=0)
y_full = pd.concat([y_train, y_val], axis=0)
split_index = [-1] * len(x_train) + [0] * len(x_val)
ps = PredefinedSplit(test_fold=split_index)


# NOTE: full_pipeline and param_grid are both rebound by the next cell, which builds a
# ColumnTransformer-based pipeline; that later definition is the one the grid search uses.
full_pipeline = ImbPipeline([
    ('missing', missing_handler),
    ('encoder', encoder),
    ('scaler', StandardScaler()),
    ('selector', VarianceThreshold(threshold=0.01)),
    ('correlation_remover', CorrelationRemover(threshold=0.95)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('clf', LogisticRegression(max_iter=1000, solver='liblinear', random_state=42))
])




In [None]:

# Numeric branch: fill gaps with each column's most frequent value.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# Categorical branch: fill gaps the same way, then target-encode.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', TargetEncoder(smoothing=10.0))
])

# ColumnTransformer selects columns by name. list(...) yields the names whether
# categorical_cols is already a list of names or a DataFrame holding the categorical
# columns (iterating a DataFrame yields its column labels); a DataFrame itself is not a
# valid column specification.
categorical_col_names = list(categorical_cols)

preprocessor = ColumnTransformer([
    ('num', num_pipeline, numerical_cols),
    ('cat', cat_pipeline, categorical_col_names)
])

# NOTE(review): 'selector' runs after 'scaler'. Once standardised, every non-constant column
# has unit variance, so a 0.01 variance threshold can only remove constant columns here -
# confirm this ordering is intended.
full_pipeline = ImbPipeline([
    ('preprocess', preprocessor),
    ('scaler', StandardScaler()),
    ('selector', VarianceThreshold(threshold=0.01)),
    ('correlation_remover', CorrelationRemover(threshold=0.95)),
    ('undersample', RandomUnderSampler(random_state=42)),
    ('clf', LogisticRegression(max_iter=1000, solver='liblinear', random_state=42))
])

# Inverse regularisation strengths tried by the grid search.
param_grid = {
    'clf__C': [0.1, 1]
}


In [None]:
# Grid search scored by ROC-AUC on the split defined by ps; any failing fit raises
# immediately instead of being recorded as a NaN score.
grid = GridSearchCV(
    full_pipeline,
    param_grid,
    scoring='roc_auc',
    cv=ps,
    error_score='raise',
    verbose=3,
    n_jobs=-1,
)
grid.fit(X_full, y_full)

best_model = grid.best_estimator_
print(f"Best params: {grid.best_params_}")
print(f"Validation roc-auc: {grid.best_score_:.4f}")

# Second column of predict_proba is taken as the score for ROC-AUC on train and test.
y_train_proba = best_model.predict_proba(x_train)[:, 1]
y_test_proba = best_model.predict_proba(x_test)[:, 1]
train_roc_auc = roc_auc_score(y_train, y_train_proba)
test_roc_auc = roc_auc_score(y_test, y_test_proba)

print(f"Train ROC-AUC: {train_roc_auc:.4f}")
print(f"Test ROC-AUC: {test_roc_auc:.4f}")

# Hard labels at a 0.5 cut-off for the per-class precision/recall report.
y_test_pred = (y_test_proba > 0.5).astype(int)
print(classification_report(y_test, y_test_pred))

# ROC curve of the test predictions.
RocCurveDisplay.from_predictions(y_test, y_test_proba)
plt.title("Test ROC Curve")
plt.grid(True)
plt.show()


# MLFLOW LOGGING

In [None]:
# Install the MLflow client and the DagsHub helper imported by the logging cells that follow.
# NOTE(review): no versions are pinned here - consider pinning them for reproducible runs.
!pip install mlflow dagshub

In [None]:
import dagshub
# Initialise DagsHub for the abarb22/IEEE-fraud-detection repository with the mlflow flag on
# (presumably this points MLflow tracking at that repo - confirm against the DagsHub docs).
dagshub.init(repo_owner='abarb22', repo_name='IEEE-fraud-detection', mlflow=True)

In [None]:
import mlflow
from sklearn.metrics import roc_auc_score, classification_report, RocCurveDisplay, precision_score, recall_score, f1_score
import mlflow.sklearn
import os
import matplotlib.pyplot as plt



mlflow.set_experiment("LogisticRegression_Training")

# One parent run per trial, with a nested child run for each stage of the workflow.
with mlflow.start_run(run_name="LogReg_Trial_2") as parent_run:
    with mlflow.start_run(run_name="Preprocessing", nested=True):
        # Normalise both column groups to plain lists of names; list(...) also copes with
        # categorical_cols being a DataFrame (iterating it yields the column labels).
        numerical_col_names = [str(col) for col in numerical_cols]
        categorical_col_names = [str(col) for col in categorical_cols]

        # Params carry only the counts; the full name lists go into a JSON artifact because
        # hundreds of column names can exceed the tracking backend's param length limit.
        mlflow.log_param("n_numerical_columns", len(numerical_col_names))
        mlflow.log_param("n_categorical_columns", len(categorical_col_names))
        mlflow.log_dict(
            {
                "numerical_columns": numerical_col_names,
                "categorical_columns": categorical_col_names,
            },
            "preprocessing_columns.json",
        )

        # Log TargetEncoder parameter
        mlflow.log_param("target_encoder_smoothing", 10.0)
        mlflow.log_metric("n_missing_train", int(x_train.isnull().sum().sum()))

    with mlflow.start_run(run_name="LR_FeatureSelection", nested=True):
        mlflow.log_param("variance_threshold", 0.01)
        mlflow.log_param("correlation_threshold", 0.95)

    with mlflow.start_run(run_name="LR_CV_Training",  nested=True):
        # Log search space and best hyperparameters
        mlflow.log_params({f"grid__{k}": v for k, v in param_grid.items()})
        mlflow.log_param("model", "LogisticRegression")
        mlflow.log_param("solver", "liblinear")
        mlflow.log_param("max_iter", 1000)
        mlflow.log_param("cv_type", "PredefinedSplit")
        mlflow.log_param("undersampling", "RandomUnderSampler")
        mlflow.log_param("C_values_grid", param_grid['clf__C'])
        mlflow.log_metric("val_roc_auc", grid.best_score_)
        # Log the selected C itself rather than the whole best_params_ dict.
        mlflow.log_param("best_C", grid.best_params_['clf__C'])

    with mlflow.start_run(run_name="LR_FinalModel" , nested=True):
        mlflow.log_metric("train_roc_auc", train_roc_auc)
        mlflow.log_metric("test_roc_auc", test_roc_auc)
        mlflow.log_metric("val_roc_auc", grid.best_score_)

        # Hard labels at a 0.5 cut-off for the threshold-dependent metrics.
        y_test_pred = (y_test_proba > 0.5).astype(int)
        mlflow.log_metric("test_precision", precision_score(y_test, y_test_pred))
        mlflow.log_metric("test_recall", recall_score(y_test, y_test_pred))
        mlflow.log_metric("test_f1", f1_score(y_test, y_test_pred))

        # Log classification report as text
        report = classification_report(y_test, y_test_pred)
        mlflow.log_text(report, "classification_report.txt")

        # Log model
        mlflow.sklearn.log_model(best_model, "final_model")

        # Log ROC curve plot
        RocCurveDisplay.from_predictions(y_test, y_test_proba)
        plt.title("Test ROC Curve")
        plt.grid(True)
        plt.savefig("roc_curve.png")
        mlflow.log_artifact("roc_curve.png")
        

