In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, roc_auc_score, ConfusionMatrixDisplay


In [None]:
# Colab-only step: mount Google Drive at /content/drive so the CSV paths used
# by the loading cells resolve.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
# Both CSV files sit in the same Drive folder; derive the two paths from it.
DATA_DIR = "/content/drive/My Drive/Machine"
train_path = DATA_DIR + "/fraudTrain.csv"
test_path = DATA_DIR + "/fraudTest.csv"

In [None]:
# Load the training CSV into the DataFrame that the rest of the notebook uses.
data1= pd.read_csv(train_path)

In [None]:
# Load the test CSV. The modelling cells visible in this notebook train and
# evaluate on a split of `data1`, not on this frame.
data=pd.read_csv(test_path)

In [None]:
# Preview only the first rows rather than rendering the whole training frame
# as the cell output.
data1.head()

In [None]:
# (rows, columns) of the training frame.
data1.shape

In [None]:
# Column labels of the training frame.
data1.columns

In [None]:
# Per-column dtype and non-null count summary.
data1.info()

In [None]:
# Summary statistics of the training frame (pandas' default column selection).
data1.describe()

In [None]:
# Number of missing values in each column of the training frame.
data1.isna().sum()

In [None]:
# Columns excluded from the modelling frame.
columns_to_drop = [
    'trans_date_trans_time',
    'cc_num',
    'merchant',
    'first',
    'last',
    'street',
    'city',
    'state',
    'zip',
    'lat',
    'long',
    'job',
    'dob',
    'unix_time',
    'merch_lat',
    'merch_long',
]
# `columns=` already selects the column axis, so no separate axis argument is needed.
data1 = data1.drop(columns=columns_to_drop)


In [None]:
# Integer-encode the `category` column of the training frame.
# The encoding is applied to `data1` because that is the frame every later
# cell analyses and models; encoding the separately loaded test-file frame
# (`data`) here would leave the modelling data untouched.
encoder = LabelEncoder()
data1['category'] = encoder.fit_transform(data1['category'])

In [None]:
# Per-gender totals: number of fraud rows ('sum'), number of rows ('count'),
# and their ratio ('fraud_rate').
gender_groups = data1.groupby('gender')['is_fraud']
fraud_by_gender = gender_groups.agg(['sum', 'count'])
fraud_by_gender['fraud_rate'] = fraud_by_gender['sum'].div(fraud_by_gender['count'])

print(fraud_by_gender)

# Bar chart of the fraud rate for each gender value.
fig, ax = plt.subplots(figsize=(4, 3))
ax.bar(fraud_by_gender.index, fraud_by_gender['fraud_rate'])
ax.set_xlabel('Gender')
ax.set_ylabel('Fraud Rate')
ax.set_title('Fraud Rate by Gender')
plt.show()


In [None]:
# Build the feature matrix and the target from rows that have a label.
# The missing-label filter is applied here, before X and y are taken, so that
# no NaN target can reach the stratified split or the classifiers; filtering
# `data1` after this point would not change X or y.
labelled_rows = data1.dropna(subset=['is_fraud'])
X = labelled_rows.drop('is_fraud', axis=1)
y = labelled_rows['is_fraud']

In [None]:
# Keep only the rows of `data1` whose `is_fraud` label is present.
# NOTE(review): this rebinds `data1` only; X and y built in the preceding
# cell are not modified by it.
data1 = data1[data1['is_fraud'].notna()]


In [None]:
# Step 1: integer-encode every object-dtype feature column.
non_numeric_cols = X.select_dtypes(include=['object']).columns
print("Non-numeric columns:", non_numeric_cols)

# One freshly created encoder per column, stored by column name. A single
# shared encoder would be refit on every iteration and keep only the last
# column's classes, and would also make this cell depend on an encoder object
# created in an earlier cell.
label_encoders = {}
for col in non_numeric_cols:
    label_encoders[col] = LabelEncoder()
    X[col] = label_encoders[col].fit_transform(X[col])

# Step 2: standardise the numeric columns (this now includes the columns
# encoded above) and put them back together with any remaining columns.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed in a later cell, so held-out rows contribute to the mean/std used
# for scaling. Fit on the training portion only if strict separation is needed.
numerical_features = X.select_dtypes(include=np.number).columns
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X[numerical_features])
X_scaled = pd.DataFrame(X_scaled, columns=numerical_features, index=X.index)
X = pd.concat([X_scaled, X.drop(columns=numerical_features)], axis=1)

In [None]:
# Hold out 20% of the rows for evaluation. `stratify=y` keeps the label
# proportions of y in both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [None]:
# Logistic-regression model: fit on the training split, score on the held-out split.
logistic_model = LogisticRegression(max_iter=1000, random_state=42)
logistic_model.fit(X_train, y_train)

# Hard class predictions, plus column 1 of predict_proba (probability of the
# second class) for the ROC AUC.
y_pred_logistic = logistic_model.predict(X_test)
y_prob_logistic = logistic_model.predict_proba(X_test)[:, 1]

logistic_report = classification_report(y_test, y_pred_logistic)
logistic_auc = roc_auc_score(y_test, y_prob_logistic)

print("\nLogistic Regression:")
print("Classification Report:\n", logistic_report)
print("ROC AUC Score:", logistic_auc)
print("Confusion Matrix:")
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_logistic, cmap='Blues')
plt.show()


In [None]:
# Decision-tree model: fit on the training split, score on the held-out split.
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train, y_train)

# Hard class predictions, plus column 1 of predict_proba (probability of the
# second class) for the ROC AUC.
y_pred_tree = decision_tree_model.predict(X_test)
y_prob_tree = decision_tree_model.predict_proba(X_test)[:, 1]

tree_report = classification_report(y_test, y_pred_tree)
tree_auc = roc_auc_score(y_test, y_prob_tree)

print("\nDecision Tree Classifier:")
print("Classification Report:\n", tree_report)
print("ROC AUC Score:", tree_auc)
print("Confusion Matrix:")
ConfusionMatrixDisplay.from_predictions(y_test, y_pred_tree, cmap='Blues')
plt.show()


In [None]:
# Random-forest model: fit on the training split, score on the held-out split.
# n_jobs=-1 builds the 100 trees on all available cores instead of one; the
# fixed random_state still determines how each tree is grown.
model_1 = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model_1.fit(X_train, y_train)

# Hard class predictions, plus column 1 of predict_proba (probability of the
# second class) for the ROC AUC.
y_pred = model_1.predict(X_test)
y_prob = model_1.predict_proba(X_test)[:, 1]

print("\nRandom Forest Classifier:")
print("Classification Report:\n", classification_report(y_test, y_pred))
print("ROC AUC Score:", roc_auc_score(y_test, y_prob))
print("Confusion Matrix:")
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, cmap='Blues')
plt.show()