In [1]:
!pip install --upgrade scikit-learn imbalanced-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.14.0-py3-none-any.whl.metadata (8.8 kB)
Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m55.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imbalanced_learn-0.14.0-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn, imbalanced-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
  Attempting uninstall: imbalanced-learn
    Found existing installation: imbalanced-learn 0.13.0
    

In [2]:
import sklearn
import imblearn

print(f"Scikit-Learn Version: {sklearn.__version__}")
print(f"Imbalanced-Learn Version: {imblearn.__version__}")

# If Scikit-Learn is 1.3 or higher, the error will disappear.


Scikit-Learn Version: 1.7.2
Imbalanced-Learn Version: 0.14.0


In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE 
import joblib
import os
print("Files in input folder:")
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


Files in input folder:
/kaggle/input/manthan-ai-2025-ai-summit/CONTESTANT_GUIDE.txt
/kaggle/input/manthan-ai-2025-ai-summit/telecom_churn_test.csv
/kaggle/input/manthan-ai-2025-ai-summit/industrial_maintenance_test.csv
/kaggle/input/manthan-ai-2025-ai-summit/industrial_maintenance_train.csv
/kaggle/input/manthan-ai-2025-ai-summit/ecommerce_fraud_test.csv
/kaggle/input/manthan-ai-2025-ai-summit/ecommerce_fraud_train.csv
/kaggle/input/manthan-ai-2025-ai-summit/telecom_churn_train.csv


In [4]:
import pandas as pd

# Load the specific Fraud Training file
file_path = '/kaggle/input/manthan-ai-2025-ai-summit/ecommerce_fraud_train.csv'
df = pd.read_csv(file_path)

# Show the column names and the first 5 rows
print("Dataset Loaded Successfully!")
print("Columns in the table:", df.columns.tolist())
display(df.head())

Dataset Loaded Successfully!
Columns in the table: ['hour_of_day', 'day_of_week', 'month', 'time_since_last_tx', 'tx_velocity', 'tx_amount', 'account_age_days', 'lifetime_spent', 'num_failed_tx', 'credit_score', 'network_centrality', 'shared_devices', 'shared_ips', 'desc_length', 'special_chars', 'num_urls', 'sentiment', 'isolation_score', 'lof_score', 'ae_recon_error', 'fraud_prob_hidden', 'user_id', 'device_type', 'browser', 'ip_country', 'vpn_detected', 'category_l1', 'category_l2', 'category_l3', 'language', 'is_fraud']


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,hour_of_day,day_of_week,month,time_since_last_tx,tx_velocity,tx_amount,account_age_days,lifetime_spent,num_failed_tx,credit_score,...,user_id,device_type,browser,ip_country,vpn_detected,category_l1,category_l2,category_l3,language,is_fraud
0,17.0,2.0,5.0,12.524688,2.0,105.580627,125.907932,23.98815,0.0,454.505232,...,1361,mobile,chrome,40,0,6,48,189,16,1
1,17.0,2.0,6.0,35.688835,2.0,883.643222,121.819486,213.938693,1.0,702.969631,...,462,mobile,chrome,8,0,9,26,189,17,0
2,16.0,5.0,10.0,0.91396,6.0,2.293489,130.658257,278.252111,3.0,442.101162,...,494,mobile,safari,30,0,0,33,163,7,0
3,22.0,5.0,8.0,6.117373,1.0,5.651249,68.144708,5212.387041,2.0,,...,470,mobile,safari,19,0,4,36,172,2,0
4,22.0,2.0,6.0,35.148541,2.0,30.612697,323.83348,1.838082,0.0,696.196839,...,1530,tablet,other,30,0,0,45,74,16,0


In [5]:
# 1. Prepare the Data
# Drop 'is_fraud' (the answer) and 'user_id' (useless for prediction)
X = df.drop(['is_fraud', 'user_id'], axis=1)
y = df['is_fraud']

# 2. Clean the Data
# Fill missing values (like empty credit scores) with 0
X = X.fillna(0)

# Convert text columns (like 'mobile', 'chrome') into numbers
X = pd.get_dummies(X, drop_first=True)

print("Starting SMOTE (This handles the imbalance)...")
# 3. Apply SMOTE to balance the data (The B.Tech requirement)
from imblearn.over_sampling import SMOTE 
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X, y)

print(f"Data Balanced! Fraud cases increased from {sum(y==1)} to {sum(y_res==1)}")

# 4. Train the Model
print("Training Random Forest (This might take 1 minute)...")
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=50, random_state=42)
model.fit(X_res, y_res)

# 5. Save the Model & Column Names (Important for the App!)
import joblib
joblib.dump(model, 'fraud_model.pkl')
joblib.dump(X.columns, 'model_columns.pkl')

print("SUCCESS! 'fraud_model.pkl' has been saved.")

Starting SMOTE (This handles the imbalance)...
Data Balanced! Fraud cases increased from 87 to 7913
Training Random Forest (This might take 1 minute)...
SUCCESS! 'fraud_model.pkl' has been saved.
