In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt

# File paths for Colab
train_file = "/content/fraudTrain.csv"
test_file = "/content/fraudTest.csv"

# Load both CSV files and stack them into a single frame; ignore_index=True
# gives the combined frame a fresh 0..n-1 index instead of two overlapping ones.
df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in (train_file, test_file)],
    ignore_index=True,
)

# Remove the "Unnamed: 0" column when it is present (presumably a row index
# that was written out when the CSVs were saved -- verify against the files).
# errors="ignore" makes this a no-op when the column does not exist.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

# Display basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.head())

# Report the number of missing values per column (nothing is modified here;
# rows with missing values are dropped later in the script).
print("Missing values:", df.isnull().sum())

# Binarizing the 'gender' column
def gender_binarizer(x):
    """Encode a gender label as a number.

    Returns 1 when ``x`` equals 'F', 0 when it equals 'M', and ``np.nan``
    for any other value so that unexpected entries can be dropped later.
    """
    # Checked in the same order as the label/code pairs are listed.
    for label, code in (('F', 1), ('M', 0)):
        if x == label:
            return code
    return np.nan

# Encode gender as 1 ('F') / 0 ('M'); any other value becomes NaN.
df['gender'] = df['gender'].apply(gender_binarizer)

# Drop every row that still contains a NaN in any column (this also removes
# rows whose gender could not be encoded above).
df.dropna(inplace=True)

# Defining features (X) and target (y)
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

# Only numeric columns are used as model inputs; object (string) columns are
# left out rather than encoded.
# NOTE(review): this keeps every numeric column, including identifier-like
# ones such as cc_num -- confirm those are intended as features.
X_numeric = X.select_dtypes(include=np.number)

# Train-test split.
# The split happens BEFORE scaling so the scaler never sees test rows, and
# stratify=y keeps the class ratio of is_fraud the same in both partitions.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numeric,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Scaling numerical features using RobustScaler.
# The scaler is fitted on the training partition only and then applied to the
# test partition; fitting on the full data would leak test-set statistics
# into training and make the evaluation optimistic.
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Applying SMOTE to handle class imbalance.
# Only the training data is resampled; the test set is left untouched so the
# reported metrics are computed on the original class distribution.
smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

# Training and evaluating models


def _fit_and_report(title, model, X_fit, y_fit, X_eval, y_eval):
    """Fit ``model``, print its evaluation metrics and return its predictions.

    Parameters:
        title: Name printed in the "<title> Results:" header line.
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_fit, y_fit: Data the model is fitted on.
        X_eval, y_eval: Data the model is evaluated on.

    Returns:
        The predictions produced by ``model.predict(X_eval)``.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    print(f"{title} Results:")
    print(confusion_matrix(y_eval, predictions))
    print(classification_report(y_eval, predictions))
    print("Accuracy:", accuracy_score(y_eval, predictions))
    return predictions


# Every model is fitted on the SMOTE-resampled training data and evaluated on
# the untouched test split.

# Logistic Regression
lr = LogisticRegression(max_iter=1000, random_state=42)
y_pred_lr = _fit_and_report(
    "Logistic Regression", lr, X_train_sm, y_train_sm, X_test, y_test
)

# Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)
y_pred_dt = _fit_and_report(
    "Decision Tree", dt, X_train_sm, y_train_sm, X_test, y_test
)

# Random Forest Classifier
# n_jobs=-1 lets fitting and prediction use all available CPU cores; the
# forest is still seeded through random_state.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
y_pred_rf = _fit_and_report(
    "Random Forest", rf, X_train_sm, y_train_sm, X_test, y_test
)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1392708 entries, 0 to 1392707
Data columns (total 22 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1392708 non-null  object 
 1   cc_num                 1392708 non-null  int64  
 2   merchant               1392707 non-null  object 
 3   category               1392707 non-null  object 
 4   amt                    1392707 non-null  float64
 5   first                  1392707 non-null  object 
 6   last                   1392707 non-null  object 
 7   gender                 1392707 non-null  object 
 8   street                 1392707 non-null  object 
 9   city                   1392707 non-null  object 
 10  state                  1392707 non-null  object 
 11  zip                    1392707 non-null  float64
 12  lat                    1392707 non-null  float64
 13  long                   1392707 non-null  float64
 14  city_pop          