NAME: AVINASH

 CREDIT CARD FRAUD DETECTION:


 Build a model to detect fraudulent credit card transactions. Use a dataset containing information about credit card transactions, and experiment with algorithms like Logistic Regression, Decision Trees,
 or Random Forests to classify transactions as fraudulent or legitimate.


 DATA SET LINK: https://www.kaggle.com/datasets/kartik2112/fraud-detection

In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE
import joblib
import seaborn as sns
import matplotlib.pyplot as plt

In [47]:
fraud_train = pd.read_csv('fraudTrain.csv')
fraud_test = pd.read_csv('fraudTest.csv')

In [49]:
print(fraud_train.columns)

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [50]:
fraud_train['trans_date_trans_time'] = pd.to_datetime(fraud_train['trans_date_trans_time'])
fraud_test['trans_date_trans_time'] = pd.to_datetime(fraud_test['trans_date_trans_time'])

In [51]:
numeric_cols_train = fraud_train.select_dtypes(include=['float64', 'int64']).columns
numeric_cols_test = fraud_test.select_dtypes(include=['float64', 'int64']).columns

In [52]:
fraud_train[numeric_cols_train] = fraud_train[numeric_cols_train].fillna(fraud_train[numeric_cols_train].median())
fraud_test[numeric_cols_test] = fraud_test[numeric_cols_test].fillna(fraud_test[numeric_cols_test].median())

In [53]:
fraud_train['transaction_year'] = fraud_train['trans_date_trans_time'].dt.year
fraud_train['transaction_month'] = fraud_train['trans_date_trans_time'].dt.month
fraud_train['transaction_day'] = fraud_train['trans_date_trans_time'].dt.day
fraud_train['transaction_hour'] = fraud_train['trans_date_trans_time'].dt.hour

fraud_test['transaction_year'] = fraud_test['trans_date_trans_time'].dt.year
fraud_test['transaction_month'] = fraud_test['trans_date_trans_time'].dt.month
fraud_test['transaction_day'] = fraud_test['trans_date_trans_time'].dt.day
fraud_test['transaction_hour'] = fraud_test['trans_date_trans_time'].dt.hour

In [54]:
fraud_train.drop('trans_date_trans_time', axis=1, inplace=True)
fraud_test.drop('trans_date_trans_time', axis=1, inplace=True)

In [57]:
print(fraud_train.columns)


Index(['Unnamed: 0', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last',
       'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop',
       'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long',
       'is_fraud', 'transaction_year', 'transaction_month', 'transaction_day',
       'transaction_hour'],
      dtype='object')


In [59]:
features = ['amt', 'city_pop', 'merchant', 'transaction_year', 'transaction_month', 'transaction_day', 'transaction_hour']
X_train = fraud_train[features]
y_train = fraud_train['is_fraud']

X_test = fraud_test[features]
y_test = fraud_test['is_fraud']

In [61]:
X_train = pd.get_dummies(X_train, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)

In [62]:
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [63]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [64]:
log_model = LogisticRegression(random_state=42)
log_model.fit(X_train, y_train)
y_pred_log = log_model.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [65]:
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

In [66]:
forest_model = RandomForestClassifier(random_state=42)
forest_model.fit(X_train, y_train)
y_pred_forest = forest_model.predict(X_test)

In [67]:
print("Logistic Regression Model:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_log):.2f}')
print(classification_report(y_test, y_pred_log))

print("\nDecision Tree Model:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_tree):.2f}')
print(classification_report(y_test, y_pred_tree))

print("\nRandom Forest Model:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_forest):.2f}')
print(classification_report(y_test, y_pred_forest))


Logistic Regression Model:
Accuracy: 0.99
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     42599
         1.0       0.30      0.23      0.26       184

    accuracy                           0.99     42783
   macro avg       0.65      0.62      0.63     42783
weighted avg       0.99      0.99      0.99     42783


Decision Tree Model:
Accuracy: 0.99
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     42599
         1.0       0.30      0.37      0.33       184

    accuracy                           0.99     42783
   macro avg       0.65      0.68      0.66     42783
weighted avg       0.99      0.99      0.99     42783


Random Forest Model:
Accuracy: 1.00
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     42599
         1.0       0.74      0.29      0.42       184

    accuracy                           1.00     42783
   macro avg

In [68]:
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

forest_model_smote = RandomForestClassifier(random_state=42)
forest_model_smote.fit(X_train_smote, y_train_smote)
y_pred_forest_smote = forest_model_smote.predict(X_test)

print("\nRandom Forest Model with SMOTE:")
print(f'Accuracy: {accuracy_score(y_test, y_pred_forest_smote):.2f}')
print(classification_report(y_test, y_pred_forest_smote))



Random Forest Model with SMOTE:
Accuracy: 1.00
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     42599
         1.0       0.62      0.26      0.37       184

    accuracy                           1.00     42783
   macro avg       0.81      0.63      0.68     42783
weighted avg       1.00      1.00      1.00     42783



In [69]:
joblib.dump(forest_model, 'credit_card_fraud_model.pkl')


['credit_card_fraud_model.pkl']