In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found,
# one path per line, in the order os.walk yields them.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/credit-debit dataset.csv
/kaggle/input/dataset/TransactionDataset1.csv


In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder, MultiLabelBinarizer
from sklearn.impute import SimpleImputer
import joblib
import os



In [5]:
# Function to preprocess the data
def preprocess_data(data, return_column_encoders=False):
    """Select the model columns, drop incomplete rows and label-encode text columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column listed in ``selected_features`` below
        plus the ``'fraud_indicator'`` target. The caller's frame is not
        modified.
    return_column_encoders : bool, default False
        When False (legacy behaviour) the second return value is a single
        ``LabelEncoder``: the one fitted on the *last* object-dtype column, or
        an unfitted encoder when there are no object-dtype columns. It cannot
        be used to encode or decode any other column.
        When True the second return value is a dict mapping each encoded
        column name to its own fitted ``LabelEncoder``, which is what is
        needed to apply the same encoding to new data.

    Returns
    -------
    (pandas.DataFrame, LabelEncoder | dict[str, LabelEncoder])
        The preprocessed frame (selected features + target, rows with any
        missing value removed, object columns replaced by integer codes) and
        the encoder(s) as described above.

    Raises
    ------
    KeyError
        Raised by pandas if any required column is absent from ``data``.
    """
    # Select relevant features
    selected_features = ['age', 'kyc_status', 'days_since_kyc_incomplete', 'transaction_amount',
                         'home_branch', 'transaction_location', 'transaction_method',
                         'transaction_category', 'transaction_merchant', 'transaction_time',
                         'average_expenditure', 'comparison_with_avg_expenditure',
                         'transaction_count_7_days', 'suspicion_indicator',
                         'Total Credit Amount', 'Transaction Amount']

    # Keep only the needed columns and complete rows. The explicit .copy()
    # gives an independent frame, so the column assignments below neither
    # trigger SettingWithCopyWarning nor risk writing into the caller's data.
    data = data[selected_features + ['fraud_indicator']].dropna().copy()

    # Label encoding for categorical variables: one freshly fitted encoder per
    # column, so each column's class list stays available after the loop.
    categorical_columns = data.select_dtypes(include=['object']).columns
    column_encoders = {}
    label_encoder = LabelEncoder()  # stays unfitted if nothing needs encoding
    for column in categorical_columns:
        encoder = LabelEncoder()
        data[column] = encoder.fit_transform(data[column].astype(str))
        column_encoders[column] = encoder
        # Legacy return value: mirrors the state a single shared encoder would
        # end in, i.e. fitted on the last processed column only.
        label_encoder = encoder

    if return_column_encoders:
        return data, column_encoders
    return data, label_encoder

# Load datasets
data1_path = '/kaggle/input/dataset/TransactionDataset1.csv'
data2_path = '/kaggle/input/dataset/credit-debit dataset.csv'

# Fail fast if an input file is missing. Only printing a message here would let
# execution continue and crash later with an unrelated NameError on data1/data2.
for csv_path in (data1_path, data2_path):
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(f"Error: File not found - {csv_path}")

data1 = pd.read_csv(data1_path)
data2 = pd.read_csv(data2_path)

# Preprocess data
# NOTE(review): axis=1 places the two files side by side, aligned on the row
# index. This assumes row i of both files describes the same transaction -
# confirm against the dataset description.
preprocessed_data, label_encoder = preprocess_data(pd.concat([data1, data2], axis=1))

# Target column and feature matrix (everything except the target)
y = preprocessed_data['fraud_indicator']
X = preprocessed_data.drop(columns='fraud_indicator')

# Hold out 20% of the rows for testing; fixed seed for a reproducible split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Mean-imputation fitted on the training rows only, then applied to both splits.
# preprocess_data already drops rows with missing values, so this is expected
# to act as a safeguard rather than change anything.
imputer = SimpleImputer(strategy='mean').fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardisation statistics also come from the training rows only
scaler = StandardScaler().fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Random Forest with 2000 trees; verbose=1 makes fit/predict log their progress
model = RandomForestClassifier(
    n_estimators=2000,
    random_state=42,
    verbose=1,
).fit(X_train_scaled, y_train)

# List the input columns the model was trained on, one per line
print("Features of the RandomForest model:")
for feature_name in X.columns:
    print(feature_name)

# Predict the held-out rows, then summarise the errors against the true labels
y_pred = model.predict(X_test_scaled)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Each heading is followed by its table on the next line
print("Confusion Matrix:", conf_matrix, sep="\n")
print("\nClassification Report:", classification_rep, sep="\n")

# Save the model together with every fitted preprocessing step, so the exact
# training-time transform chain (impute -> scale -> predict) can be replayed
# when the bundle is loaded elsewhere.
model_filename = '/kaggle/working/random_forest_fraud.pkl'
joblib.dump({
    'label_encoder': label_encoder,
    'imputer': imputer,  # fitted before the scaler; previously missing from the bundle
    'scaler': scaler,
    'model': model,
    'features': X.columns.tolist()  # Save the features used for training
}, model_filename)

print(f'Model, scaler, and label encoder saved as {model_filename}')


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.6s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    6.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:   14.5s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:   25.7s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:   40.0s
[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:   57.6s


Features of the RandomForest model:
age
kyc_status
days_since_kyc_incomplete
transaction_amount
home_branch
transaction_location
transaction_method
transaction_category
transaction_merchant
transaction_time
average_expenditure
comparison_with_avg_expenditure
transaction_count_7_days
suspicion_indicator
Total Credit Amount
Transaction Amount


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 1249 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 1799 tasks      | elapsed:    0.8s


Confusion Matrix:
[[2562    0]
 [ 118 1320]]

Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2562
           1       1.00      0.92      0.96      1438

    accuracy                           0.97      4000
   macro avg       0.98      0.96      0.97      4000
weighted avg       0.97      0.97      0.97      4000

Model, scaler, and label encoder saved as /kaggle/working/random_forest_fraud.pkl
