In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/dataset/credit-debit dataset.csv
/kaggle/input/dataset/TransactionDataset1.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer

# Load the first dataset
data = pd.read_csv('/kaggle/input/dataset/TransactionDataset1.csv')

# Drop unnecessary columns
drop_columns = ['user_id', 'name', 'addresses', 'email_address', 'transaction_id', 'transaction_date']
data = data.drop(drop_columns, axis=1)

# Convert categorical variables to numerical using Label Encoding
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Split the data into features (X) and target variable (y)
X = data.drop('fraud_indicator', axis=1)
y = data['fraud_indicator']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Choose a model (Random Forest)
model = RandomForestClassifier(n_estimators=1000, random_state=42, verbose=1)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rsquared = r2_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results for the first model
print(f'First Model Results:')
print(f'Accuracy: {accuracy:.2f}')
print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {rsquared:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)

# Read the second dataset into df2
df2 = pd.read_csv('/kaggle/input/dataset/credit-debit dataset.csv')  

# Feature columns for the second dataset
features_second = ['Total Credit Amount', 'Transaction Amount']

# Additional feature columns for money sources and transaction accounts
money_sources = 'Money Sources'
transaction_accounts = 'Transfer Accounts'

# Target variable for the second dataset
target_second = 'Fraud Indicator'

# Drop rows with missing values
df2 = df2.dropna()

# Convert categorical variables to dummy/indicator variables
df2 = pd.get_dummies(df2, columns=['Employment Status', 'Education Level', 'Marital Status'], drop_first=True)

# Use MultiLabelBinarizer for one-hot encoding of accounts
mlb = MultiLabelBinarizer()

# Transform 'money_sources' and 'transaction_accounts' into binary features
money_sources_encoded = pd.DataFrame(mlb.fit_transform(df2['Money Sources']), columns=mlb.classes_, index=df2.index)
transaction_accounts_encoded = pd.DataFrame(mlb.fit_transform(df2['Transfer Accounts']), columns=mlb.classes_, index=df2.index)

# Concatenate the encoded features with the original DataFrame
df2 = pd.concat([df2, money_sources_encoded, transaction_accounts_encoded], axis=1)

# Feature columns for the second dataset including the new binary account features
features_second += list(money_sources_encoded.columns) + list(transaction_accounts_encoded.columns)

# Drop the original account columns
df2 = df2.drop(['Money Sources', 'Transfer Accounts'], axis=1)

# Split the data into training and testing sets
X_train_second, X_test_second, y_train_second, y_test_second = train_test_split(df2[features_second], df2[target_second], test_size=0.2, random_state=42)

# Train a Random Forest classifier for the second dataset
model_second = RandomForestClassifier(n_estimators=1000, random_state=42, verbose=1)
model_second.fit(X_train_second, y_train_second)

# Make predictions on the test set for the second model
predictions_second = model_second.predict(X_test_second)

# Evaluate the second model
accuracy_second = accuracy_score(y_test_second, predictions_second)
mse_second = mean_squared_error(y_test_second, predictions_second)
rsquared_second = r2_score(y_test_second, predictions_second)
conf_matrix_second = confusion_matrix(y_test_second, predictions_second)
classification_rep_second = classification_report(y_test_second, predictions_second)

# Display results for the second model
print(f'Second Model Results:')
print(f'Accuracy: {accuracy_second:.2f}')
print(f'Mean Squared Error: {mse_second:.2f}')
print(f'R-squared: {rsquared_second:.2f}')
print('Confusion Matrix:')
print(conf_matrix_second)
print('Classification Report:')
print(classification_rep_second)

# Combine outcomes of both models
combined_predictions = (y_pred + predictions_second) >= 1

# Evaluate the combined model
combined_accuracy = accuracy_score(y_test, combined_predictions)
combined_mse = mean_squared_error(y_test, combined_predictions)
combined_rsquared = r2_score(y_test, combined_predictions)
combined_conf_matrix = confusion_matrix(y_test, combined_predictions)
combined_classification_rep = classification_report(y_test, combined_predictions)

# Display results for the combined model
print(f'Combined Model Results:')
print(f'Accuracy: {combined_accuracy:.2f}')
print(f'Mean Squared Error: {combined_mse:.2f}')
print(f'R-squared: {combined_rsquared:.2f}')
print('Confusion Matrix:')
print(combined_conf_matrix)
print('Classification Report:')
print(combined_classification_rep)


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    4.4s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    9.9s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:   17.5s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.5s


First Model Results:
Accuracy: 0.97
Mean Squared Error: 0.03
R-squared: 0.87
Confusion Matrix:
[[2562    0]
 [ 118 1320]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      2562
           1       1.00      0.92      0.96      1438

    accuracy                           0.97      4000
   macro avg       0.98      0.96      0.97      4000
weighted avg       0.97      0.97      0.97      4000



[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.8s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    3.3s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    7.6s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:   13.6s
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 199 tasks      | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done 449 tasks      | elapsed:    0.5s
[Parallel(n_jobs=1)]: Done 799 tasks      | elapsed:    0.9s


Second Model Results:
Accuracy: 0.86
Mean Squared Error: 0.14
R-squared: 0.17
Confusion Matrix:
[[3010  141]
 [ 415  434]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.96      0.92      3151
           1       0.75      0.51      0.61       849

    accuracy                           0.86      4000
   macro avg       0.82      0.73      0.76      4000
weighted avg       0.85      0.86      0.85      4000

Combined Model Results:
Accuracy: 0.89
Mean Squared Error: 0.11
R-squared: 0.50
Confusion Matrix:
[[2208  354]
 [ 105 1333]]
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.86      0.91      2562
           1       0.79      0.93      0.85      1438

    accuracy                           0.89      4000
   macro avg       0.87      0.89      0.88      4000
weighted avg       0.90      0.89      0.89      4000



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the first dataset
data = pd.read_csv('/kaggle/input/dataset/TransactionDataset1.csv')

# Drop unnecessary columns
drop_columns = ['user_id', 'name', 'addresses', 'email_address', 'transaction_id', 'transaction_date']
data = data.drop(drop_columns, axis=1)

# Convert categorical variables to numerical using Label Encoding
label_encoder = LabelEncoder()
for column in data.select_dtypes(include=['object']).columns:
    data[column] = label_encoder.fit_transform(data[column])

# Split the data into features (X) and target variable (y)
X = data.drop('fraud_indicator', axis=1)
y = data['fraud_indicator']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Choose a model (Feedforward Neural Network)
# Perform GridSearchCV for hyperparameter tuning
param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [500, 1000, 1500]
}

grid_search = GridSearchCV(MLPClassifier(random_state=42), param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters for the first model:", grid_search.best_params_)

# Apply best parameters to the first model
model = grid_search.best_estimator_

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rsquared = r2_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

# Display results for the first model
print(f'First Model Results:')
print(f'Accuracy: {accuracy:.2f}')
print(f'Mean Squared Error: {mse:.2f}')
print(f'R-squared: {rsquared:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(classification_rep)




In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Read the second dataset into df2
df2 = pd.read_csv('/kaggle/input/dataset/credit-debit dataset.csv')  

# Feature columns for the second dataset
features_second = ['Total Credit Amount', 'Transaction Amount']

# Additional feature columns for money sources and transaction accounts
money_sources = 'Money Sources'
transaction_accounts = 'Transfer Accounts'

# Target variable for the second dataset
target_second = 'Fraud Indicator'

# Drop rows with missing values
df2 = df2.dropna()

# Convert categorical variables to dummy/indicator variables
df2 = pd.get_dummies(df2, columns=['Employment Status', 'Education Level', 'Marital Status'], drop_first=True)

# Use MultiLabelBinarizer for one-hot encoding of accounts
mlb = MultiLabelBinarizer()

# Transform 'money_sources' and 'transaction_accounts' into binary features
money_sources_encoded = pd.DataFrame(mlb.fit_transform(df2['Money Sources']), columns=mlb.classes_, index=df2.index)
transaction_accounts_encoded = pd.DataFrame(mlb.fit_transform(df2['Transfer Accounts']), columns=mlb.classes_, index=df2.index)

# Concatenate the encoded features with the original DataFrame
df2 = pd.concat([df2, money_sources_encoded, transaction_accounts_encoded], axis=1)

# Feature columns for the second dataset including the new binary account features
features_second += list(money_sources_encoded.columns) + list(transaction_accounts_encoded.columns)

# Drop the original account columns
df2 = df2.drop(['Money Sources', 'Transfer Accounts'], axis=1)

# Split the data into training and testing sets
X_train_second, X_test_second, y_train_second, y_test_second = train_test_split(df2[features_second], df2[target_second], test_size=0.2, random_state=42)

# Perform GridSearchCV for hyperparameter tuning for the second model
param_grid_second = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50)],
    'alpha': [0.0001, 0.001, 0.01],
    'max_iter': [500, 1000, 1500]
}

grid_search_second = GridSearchCV(MLPClassifier(random_state=42), param_grid_second, cv=5)
grid_search_second.fit(X_train_second, y_train_second)

# Print best parameters
print("Best parameters for the second model:", grid_search_second.best_params_)

# Apply best parameters to the second model
model_second = grid_search_second.best_estimator_

# Train the second model
model_second.fit(X_train_second, y_train_second)

# Make predictions on the test set for the second model
predictions_second = model_second.predict(X_test_second)

# Evaluate the second model
accuracy_second = accuracy_score(y_test_second, predictions_second)
mse_second = mean_squared_error(y_test_second, predictions_second)
rsquared_second = r2_score(y_test_second, predictions_second)
conf_matrix_second = confusion_matrix(y_test_second, predictions_second)
classification_rep_second = classification_report(y_test_second, predictions_second)

# Display results for the second model
print(f'Second Model Results:')
print(f'Accuracy: {accuracy_second:.2f}')
print(f'Mean Squared Error: {mse_second:.2f}')
print(f'R-squared: {rsquared_second:.2f}')
print('Confusion Matrix:')
print(conf_matrix_second)
print('Classification Report:')
print(classification_rep_second)


In [None]:
# Combine outcomes of both models
combined_predictions = (y_pred + predictions_second) >= 1

# Evaluate the combined model
combined_accuracy = accuracy_score(y_test, combined_predictions)
combined_mse = mean_squared_error(y_test, combined_predictions)
combined_rsquared = r2_score(y_test, combined_predictions)
combined_conf_matrix = confusion_matrix(y_test, combined_predictions)
combined_classification_rep = classification_report(y_test, combined_predictions)

# Display results for the combined model
print(f'Combined Model Results:')
print(f'Accuracy: {combined_accuracy:.2f}')
print(f'Mean Squared Error: {combined_mse:.2f}')
print(f'R-squared: {combined_rsquared:.2f}')
print('Confusion Matrix:')
print(combined_conf_matrix)
print('Classification Report:')
print(combined_classification_rep)
