In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


In [2]:
!pip install mlflow dagshub

Collecting mlflow
  Downloading mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting mlflow-skinny==2.22.0 (from mlflow)
  Downloading mlflow_skinny-2.22.0-py3-none-any.whl.metadata (31 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.22.0->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.22.0->mlflow)
  Downloading uvicorn-0.34.2-py3-none-any.whl.metadata (6.5 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1

In [3]:
# Read the two training tables (identity first, then transaction) from the competition input.
df_train_id, df_train_tr = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/ieee-fraud-detection/train_identity.csv',
        '/kaggle/input/ieee-fraud-detection/train_transaction.csv',
    )
)

In [4]:
# Read the two test tables (identity first, then transaction) from the competition input.
test_csv_paths = (
    '/kaggle/input/ieee-fraud-detection/test_identity.csv',
    '/kaggle/input/ieee-fraud-detection/test_transaction.csv',
)
df_test_id, df_test_tr = map(pd.read_csv, test_csv_paths)

# Swap hyphens for underscores in the identity column names.
# NOTE(review): presumably this aligns them with the train identity column names — confirm.
df_test_id.columns = [column.replace('-', '_') for column in df_test_id.columns]

In [5]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline

In [6]:
# Display settings: no column limit, auto-detected width, and no wrapping of wide frames.
display_options = {
    'display.max_columns': None,
    'display.width': None,
    'display.expand_frame_repr': False,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [7]:
# Left-join identity data onto the transactions so every transaction row is kept,
# whether or not it has a matching identity record.
df_train = df_train_tr.merge(df_train_id, how='left', on='TransactionID')
df_test = df_test_tr.merge(df_test_id, how='left', on='TransactionID')

In [8]:
# Target vector: the fraud label.
y = df_train['isFraud']

# Feature matrix: every column except the label, TransactionID and TransactionDT.
X = df_train.drop(['isFraud', 'TransactionID', 'TransactionDT'], axis=1)

**Cleaning**

In [9]:
from sklearn.base import BaseEstimator, TransformerMixin

class CustomNullCleaner(BaseEstimator, TransformerMixin):
    """Drop mostly-empty columns and fill the missing values that remain.

    Parameters
    ----------
    threshold : float, default=0.8
        A column is dropped when the fraction of missing values seen during
        ``fit`` is greater than or equal to this value.

    Attributes
    ----------
    cols_to_drop : list
        Column labels chosen for removal by the most recent ``fit`` call
        (an empty list until ``fit`` has been called).
    """

    def __init__(self, threshold=0.8):
        self.threshold = threshold
        self.cols_to_drop = []

    def fit(self, X, y=None):
        """Record which columns of ``X`` have a null fraction at or above the threshold.

        ``y`` is accepted for pipeline compatibility and ignored. Returns ``self``.
        """
        null_fraction = X.isnull().mean()
        self.cols_to_drop = null_fraction[null_fraction >= self.threshold].index.tolist()
        return self

    def transform(self, X):
        """Return a new frame without the recorded columns and with nulls filled.

        Numeric columns are filled with ``0`` and object columns with ``"NO"``.
        The input frame is not modified.
        """
        # ``drop`` returns a new frame, so no explicit copy or in-place drop is needed;
        # ``errors='ignore'`` tolerates frames that lack some of the recorded columns.
        df = X.drop(columns=self.cols_to_drop, errors='ignore')

        # Match every numeric dtype, not only int64/float64, so that e.g. float32
        # columns do not keep their NaNs.
        numeric_cols = df.select_dtypes(include='number').columns
        categoric_cols = df.select_dtypes(include=['object']).columns

        df[numeric_cols] = df[numeric_cols].fillna(0)
        df[categoric_cols] = df[categoric_cols].fillna("NO")

        return df

In [10]:
# Learn which columns are at least 80% null on X, then produce the cleaned frame.
cleaner = CustomNullCleaner(threshold=0.8)
cleaner.fit(X)
X_cleaned = cleaner.transform(X)

In [11]:
# Sanity check: total number of missing values left anywhere in the cleaned frame.
X_cleaned.isna().sum().sum()

0

**Feature Selection**

In [12]:
class CorrelationFilter(BaseEstimator, TransformerMixin):
    """Remove numeric features that are highly correlated with an earlier feature.

    Parameters
    ----------
    threshold : float, default=0.95
        A column is marked for removal when its absolute correlation with any
        column positioned before it is strictly greater than this value.
    """

    def __init__(self, threshold=0.95):
        self.threshold = threshold
        self.features_to_drop = []

    def fit(self, X, y=None):
        """Work out which columns of ``X`` to drop; ``y`` is ignored. Returns ``self``."""
        numeric = X.select_dtypes(include=[np.number])

        # Constant columns have no defined correlation, so leave them out.
        has_spread = numeric.std() > 0
        numeric = numeric.loc[:, has_spread]

        # Columns that are entirely NaN are left out as well.
        numeric = numeric.dropna(axis=1, how='all')

        abs_corr = numeric.corr().abs()

        # Keep only the entries strictly above the diagonal so each pair is looked at
        # once and a column is never compared with itself.
        above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
        exceeds = abs_corr.where(above_diagonal) > self.threshold

        self.features_to_drop = [name for name in exceeds.columns if exceeds[name].any()]

        return self

    def transform(self, X):
        """Return ``X`` without the columns selected during ``fit``."""
        return X.drop(self.features_to_drop, axis=1, errors='ignore')


In [13]:
class VarianceFilter(BaseEstimator, TransformerMixin):
    """Drop low-variance numeric columns while passing other columns through.

    Parameters
    ----------
    threshold : float, default=0.01
        Passed to ``VarianceThreshold`` as its ``threshold`` when ``fit`` is called.

    Attributes
    ----------
    selector : VarianceThreshold or None
        The selector fitted by the most recent ``fit`` call (``None`` before that).
    numeric_cols : list
        Numeric column labels seen during the most recent ``fit`` call.
    """

    def __init__(self, threshold=0.01):
        self.threshold = threshold
        self.selector = None
        self.numeric_cols = []

    def fit(self, X, y=None):
        """Fit the variance selector on the numeric columns of ``X``; ``y`` is ignored.

        Returns ``self``.
        """
        # Imported here so the class does not depend on an import made in a later cell.
        from sklearn.feature_selection import VarianceThreshold

        # Select numeric columns
        self.numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()

        # Build the selector at fit time so it always uses the current ``threshold``,
        # including values assigned through ``set_params`` after construction.
        self.selector = VarianceThreshold(threshold=self.threshold)
        self.selector.fit(X[self.numeric_cols])
        return self

    def transform(self, X):
        """Return a new frame: surviving numeric columns first, then non-numeric ones.

        The input frame is not modified.
        """
        # Apply variance filtering only on numeric columns
        numeric_filtered = self.selector.transform(X[self.numeric_cols])
        selected_numeric_cols = np.array(self.numeric_cols)[self.selector.get_support()]

        # Create a DataFrame for selected numeric features
        X_numeric = pd.DataFrame(numeric_filtered, columns=selected_numeric_cols, index=X.index)

        # Select categorical columns (unchanged)
        categorical_cols = X.select_dtypes(exclude=[np.number]).columns
        X_categorical = X[categorical_cols]

        # Combine numeric and categorical features back together
        return pd.concat([X_numeric, X_categorical], axis=1)


In [14]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import VarianceThreshold

# Variance filter: fit on the cleaned frame, then drop the low-variance numeric columns.
var_filter = VarianceFilter(threshold=0.01)
var_filter.fit(X_cleaned)
X_var_filtered = var_filter.transform(X_cleaned)

# 3. Correlation filter
corr_filter = CorrelationFilter(threshold=0.95)
corr_filter.fit(X_var_filtered)
X_selected = corr_filter.transform(X_var_filtered)

  return op(a, b)


**Feature Engineering**

In [15]:
!pip install category_encoders
import category_encoders as ce


# Step 2: Detect feature types AFTER cleaning
categorical_features = list(X_cleaned.select_dtypes(include='object').columns)
numerical_features = list(X_cleaned.select_dtypes(include=np.number).columns)

# Step 3: Define your pipelines
# Numeric branch: median imputation followed by standardisation.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_pipeline = Pipeline(numeric_steps)

# Categorical branch: most-frequent imputation followed by target encoding.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('target_encoder', ce.TargetEncoder()),
]
categorical_pipeline = Pipeline(categorical_steps)

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
])



In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-1 decision trees (stumps).
# ``estimator=`` replaces the deprecated ``base_estimator=`` keyword, which newer
# scikit-learn releases no longer accept.
# NOTE(review): newer scikit-learn releases also deprecate algorithm='SAMME.R';
# it is kept here so results stay comparable — confirm against the installed version.
model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),
    n_estimators=100,
    learning_rate=1.0,
    algorithm='SAMME.R',
    random_state=42
)

# Full pipeline: preprocessing -> model-based feature selection -> classifier.
# With threshold=-np.inf the selection is governed by max_features alone.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selector', SelectFromModel(
        estimator=model,
        max_features=250,
        threshold=-np.inf
    )),
    ('model', model)
])


In [17]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation, stratified on the label.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts

# Fit the pipeline
pipeline.fit(X_train, y_train)

# Predict & evaluate
# Column 1 of predict_proba is used as the score (presumably the isFraud == 1 class — confirm).
val_proba = pipeline.predict_proba(X_val)
val_preds = val_proba[:, 1]
auc = roc_auc_score(y_val, val_preds)
print(f"Validation AUC: {auc:.4f}")



Validation AUC: 0.8839


In [18]:
# X_test = df_test.drop(columns=['TransactionID', 'TransactionDT'])


# test_preds = pipeline.predict_proba(X_test)[:, 1]
# submission = pd.DataFrame({
#     'TransactionID': df_test['TransactionID'],
#     'isFraud': test_preds
# })
# submission.to_csv('submission.csv', index=False)


In [19]:
# submission.head()

In [21]:
import dagshub
# Connect this session to the DagsHub repository with its MLflow integration enabled.
dagshub.init(
    repo_owner='agasi22',
    repo_name='ml-2',
    mlflow=True,
)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=98263521-4ff3-4750-b881-bfe02bc629b3&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=d7068a3657d6d6c4fafd40a3a92d5f82cf3f0998987659a26d2beab255f912e8




In [23]:
import mlflow
import mlflow.sklearn
from sklearn.metrics import mean_squared_error
import numpy as np

# Initialize MLflow
mlflow.set_experiment('AdaBoost')

run_name = 'AdaBoost_hyperparameters_change'
with mlflow.start_run(run_name=run_name):
    # Read the selector setting from the pipeline itself so the logged value cannot
    # drift from what was actually trained.
    feature_selector = pipeline.named_steps['feature_selector']

    # Log model hyperparameters
    mlflow.log_params({
        "estimator": "AdaBoost",
        "scaler": "StandardScaler",
        "max_features_in_selector": feature_selector.max_features,
        "n_estimators": model.n_estimators,
        "learning_rate": model.learning_rate,
        "algorithm": model.algorithm,
    })

    # Log evaluation metric
    mlflow.log_metric("auc", auc)

    # Log the full pipeline model
    mlflow.sklearn.log_model(pipeline, "model")



🏃 View run AdaBoost_hyperparameters_change at: https://dagshub.com/agasi22/ml-2.mlflow/#/experiments/4/runs/faeb147c0025494a88675674ce6e85b7
🧪 View experiment at: https://dagshub.com/agasi22/ml-2.mlflow/#/experiments/4
