In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [6]:
from google.colab import drive
drive.mount('/content/drive')

# Set project directories
BASE_DIR = "/content/drive/MyDrive/fraud_project"
DATA_DIR = BASE_DIR + "/data"
OUTPUT_DIR = BASE_DIR + "/outputs"

import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Data directory:", DATA_DIR)
print("Output directory:", OUTPUT_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data directory: /content/drive/MyDrive/fraud_project/data
Output directory: /content/drive/MyDrive/fraud_project/outputs


In [8]:
!pip install -q pandas numpy scikit-learn matplotlib joblib pyarrow


In [9]:
import pandas as pd

tx_path = DATA_DIR + "/Transaction.csv"
cust_path = DATA_DIR + "/Customer.csv"
call_path = DATA_DIR + "/Call_history.csv"

transactions = pd.read_csv(tx_path)
customers = pd.read_csv(cust_path)
calls = pd.read_csv(call_path)

print("Transactions:", transactions.shape)
print("Customers:", customers.shape)
print("Call History:", calls.shape)

transactions.head()


Transactions: (100000, 16)
Customers: (10127, 23)
Call History: (101174, 17)


Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1


In [10]:
def quick_eda(df, name):
    print(f"\n===== {name} =====")
    print(df.shape)
    print(df.head())
    print("\nMissing values:\n", df.isna().sum())
    print("\nData types:\n", df.dtypes)

quick_eda(transactions, "Transactions")
quick_eda(customers, "Customers")
quick_eda(calls, "Call History")



===== Transactions =====
(100000, 16)
  Transaction ID       Date Day of Week  Time Type of Card Entry Mode Amount  \
0      #3577 209  14-Oct-20   Wednesday    19         Visa        Tap     £5   
1      #3039 221  14-Oct-20   Wednesday    17   MasterCard        PIN   £288   
2      #2694 780  14-Oct-20   Wednesday    14         Visa        Tap     £5   
3      #2640 960  13-Oct-20     Tuesday    14         Visa        Tap    £28   
4      #2771 031  13-Oct-20     Tuesday    23         Visa        CVC    £91   

  Type of Transaction Merchant Group Country of Transaction Shipping Address  \
0                 POS  Entertainment         United Kingdom   United Kingdom   
1                 POS       Services                    USA              USA   
2                 POS     Restaurant                  India            India   
3                 POS  Entertainment         United Kingdom            India   
4              Online    Electronics                    USA              USA   


In [13]:
import numpy as np

def preprocess(transactions, customers, calls):
    tx = transactions.copy()
    cust = customers.copy()
    call = calls.copy()

    # Convert timestamps
    if 'Date' in tx.columns:
        tx["Date"] = pd.to_datetime(tx["Date"], format='%d-%b-%y', errors="coerce")
    # No obvious datetime columns in calls or customers dataframes based on quick_eda output.
    # Removed original lines for call["CALL_TIME"] and cust["JOINING_DT"]

    # Normalize text columns for transactions
    text_cols_tx = ["Type of Card", "Entry Mode", "Type of Transaction", "Merchant Group",
                    "Country of Transaction", "Shipping Address", "Country of Residence",
                    "Gender", "Bank"]
    for col in text_cols_tx:
        if col in tx.columns:
            tx[col] = tx[col].astype(str).str.lower().str.strip()

    # Amount cleaning and missing flag for transactions
    if "Amount" in tx.columns:
        tx["Amount_missing"] = tx["Amount"].isna().astype(int)
        # Remove '£' and convert to numeric, then fill NaNs
        tx["Amount"] = tx["Amount"].astype(str).str.replace('£', '', regex=False)
        tx["Amount"] = pd.to_numeric(tx["Amount"], errors='coerce').fillna(0)

    # Customer missing fill: quick_eda shows no missing values, so this line is not strictly needed
    # and could cause issues if numeric columns unexpectedly had NaNs and were filled with a string.
    # cust = cust.fillna("unknown")

    return tx, cust, call

transactions, customers, calls = preprocess(transactions, customers, calls)

transactions.head()

Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud,Amount_missing
0,#3577 209,2020-10-14,Wednesday,19,visa,tap,5.0,pos,entertainment,united kingdom,united kingdom,united kingdom,m,25.2,rbs,0,0
1,#3039 221,2020-10-14,Wednesday,17,mastercard,pin,288.0,pos,services,usa,usa,usa,f,49.6,lloyds,0,0
2,#2694 780,2020-10-14,Wednesday,14,visa,tap,5.0,pos,restaurant,india,india,india,f,42.2,barclays,0,0
3,#2640 960,2020-10-13,Tuesday,14,visa,tap,28.0,pos,entertainment,united kingdom,india,united kingdom,f,51.0,barclays,0,0
4,#2771 031,2020-10-13,Tuesday,23,visa,cvc,91.0,online,electronics,usa,usa,united kingdom,m,38.0,halifax,1,0


In [16]:
print("Transaction columns:\n", transactions.columns.tolist())
print("\nCustomer columns:\n", customers.columns.tolist())
print("\nCall History columns:\n", calls.columns.tolist())



Transaction columns:
 ['Transaction ID', 'Date', 'Day of Week', 'Time', 'Type of Card', 'Entry Mode', 'Amount', 'Type of Transaction', 'Merchant Group', 'Country of Transaction', 'Shipping Address', 'Country of Residence', 'Gender', 'Age', 'Bank', 'Fraud', 'Amount_missing']

Customer columns:
 ['CLIENTNUM', 'Attrition_Flag', 'Customer_Age', 'Gender', 'Dependent_count', 'Education_Level', 'Marital_Status', 'Income_Category', 'Card_Category', 'Months_on_book', 'Total_Relationship_Count', 'Months_Inactive_12_mon', 'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal', 'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', 'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2']

Call History columns:


In [17]:
from google.colab import drive
drive.mount('/content/drive')

# Set your project directory
BASE_DIR = "/content/drive/MyDrive/fraud_project"
DATA_DIR = BASE_DIR + "/data"
OUTPUT_DIR = BASE_DIR + "/outputs"

import os
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("Data folder:", DATA_DIR)
print("Output folder:", OUTPUT_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Data folder: /content/drive/MyDrive/fraud_project/data
Output folder: /content/drive/MyDrive/fraud_project/outputs


In [18]:
!pip install -q pandas numpy scikit-learn matplotlib joblib pyarrow


In [19]:
import pandas as pd

tx_path = DATA_DIR + "/Transaction.csv"

transactions = pd.read_csv(tx_path)

print("Shape:", transactions.shape)
transactions.head()


Shape: (100000, 16)


Unnamed: 0,Transaction ID,Date,Day of Week,Time,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud
0,#3577 209,14-Oct-20,Wednesday,19,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0
1,#3039 221,14-Oct-20,Wednesday,17,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0
2,#2694 780,14-Oct-20,Wednesday,14,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0
3,#2640 960,13-Oct-20,Tuesday,14,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0
4,#2771 031,13-Oct-20,Tuesday,23,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1


In [20]:
print("Columns:", transactions.columns.tolist())
print("\nMissing values:\n", transactions.isna().sum())
print("\nData types:\n", transactions.dtypes)

transactions.describe()


Columns: ['Transaction ID', 'Date', 'Day of Week', 'Time', 'Type of Card', 'Entry Mode', 'Amount', 'Type of Transaction', 'Merchant Group', 'Country of Transaction', 'Shipping Address', 'Country of Residence', 'Gender', 'Age', 'Bank', 'Fraud']

Missing values:
 Transaction ID             0
Date                       0
Day of Week                0
Time                       0
Type of Card               0
Entry Mode                 0
Amount                     6
Type of Transaction        0
Merchant Group            10
Country of Transaction     0
Shipping Address           5
Country of Residence       0
Gender                     4
Age                        0
Bank                       0
Fraud                      0
dtype: int64

Data types:
 Transaction ID             object
Date                       object
Day of Week                object
Time                        int64
Type of Card               object
Entry Mode                 object
Amount                     object
Type of T

Unnamed: 0,Time,Age,Fraud
count,100000.0,100000.0,100000.0
mean,14.56287,44.99377,0.07195
std,5.308195,9.948494,0.258406
min,0.0,15.0,0.0
25%,10.0,38.2,0.0
50%,15.0,44.9,0.0
75%,19.0,51.7,0.0
max,24.0,86.1,1.0


In [21]:
import pandas as pd

df = transactions.copy()

df["Date"] = pd.to_datetime(df["Date"], errors="coerce")
df["DateTime"] = df["Date"].astype(str) + " " + df["Time"].astype(str)
df["DateTime"] = pd.to_datetime(df["DateTime"], errors="coerce")

df = df.drop(columns=["Date", "Time"])
df.head()

  df["Date"] = pd.to_datetime(df["Date"], errors="coerce")


Unnamed: 0,Transaction ID,Day of Week,Type of Card,Entry Mode,Amount,Type of Transaction,Merchant Group,Country of Transaction,Shipping Address,Country of Residence,Gender,Age,Bank,Fraud,DateTime
0,#3577 209,Wednesday,Visa,Tap,£5,POS,Entertainment,United Kingdom,United Kingdom,United Kingdom,M,25.2,RBS,0,2020-10-14 19:00:00
1,#3039 221,Wednesday,MasterCard,PIN,£288,POS,Services,USA,USA,USA,F,49.6,Lloyds,0,2020-10-14 17:00:00
2,#2694 780,Wednesday,Visa,Tap,£5,POS,Restaurant,India,India,India,F,42.2,Barclays,0,2020-10-14 14:00:00
3,#2640 960,Tuesday,Visa,Tap,£28,POS,Entertainment,United Kingdom,India,United Kingdom,F,51.0,Barclays,0,2020-10-13 14:00:00
4,#2771 031,Tuesday,Visa,CVC,£91,Online,Electronics,USA,USA,United Kingdom,M,38.0,Halifax,1,2020-10-13 23:00:00


In [23]:
df["Hour"] = df["DateTime"].dt.hour
df["Weekday"] = df["DateTime"].dt.weekday
df["IsNight"] = df["Hour"].isin([0,1,2,3,4,5,6]).astype(int)


In [35]:
import numpy as np

# Ensure 'Amount' column is numeric before applying log1p
# This re-applies the cleaning steps from the preprocess function to df's Amount column
if "Amount" in df.columns:
    df["Amount"] = df["Amount"].astype(str).str.replace('£', '', regex=False)
    df["Amount"] = pd.to_numeric(df["Amount"], errors='coerce').fillna(0)

df["LogAmount"] = np.log1p(df["Amount"])

In [37]:
categorical_cols = [
    "Type of Card", "Entry Mode", "Type of Transaction",
    "Merchant Group", "Country of Transaction", "Bank",
    "Day of Week", "Country of Residence", "Gender"
]

df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)
df.head()

KeyError: "['Type of Card', 'Entry Mode', 'Type of Transaction', 'Merchant Group', 'Country of Transaction', 'Bank'] not in index"

In [36]:
df = df.sort_values("DateTime")

# Drop rows where 'DateTime' is NaT before rolling window calculation
df.dropna(subset=['DateTime'], inplace=True)

df["TxnsLast1H"]  = df.rolling("1h", on="DateTime")["Amount"].count().shift(1).fillna(0)
df["TxnsLast24H"] = df.rolling("24h", on="DateTime")["Amount"].count().shift(1).fillna(0)

df["AmtLast1H"]  = df.rolling("1h", on="DateTime")["Amount"].sum().shift(1).fillna(0)
df["AmtLast24H"] = df.rolling("24h", on="DateTime")["Amount"].sum().shift(1).fillna(0)

df.head()

Unnamed: 0,Transaction ID,Day of Week,Amount,Shipping Address,Country of Residence,Gender,Age,Fraud,DateTime,Hour,...,Bank_HSBC,Bank_Halifax,Bank_Lloyds,Bank_Metro,Bank_Monzo,Bank_RBS,TxnsLast1H,TxnsLast24H,AmtLast1H,AmtLast24H
57097,#2650 123,Tuesday,210.0,USA,United Kingdom,M,56.5,1,2020-10-13,0.0,...,False,False,False,False,False,False,0.0,0.0,0.0,0.0
78458,#3233 260,Tuesday,8.0,China,United Kingdom,F,54.4,1,2020-10-13,0.0,...,False,False,False,False,False,False,1.0,1.0,210.0,210.0
6392,#3201 392,Tuesday,22.0,Russia,United Kingdom,F,26.5,1,2020-10-13,0.0,...,False,False,False,True,False,False,2.0,2.0,218.0,218.0
556,#3064 153,Tuesday,12.0,USA,United Kingdom,M,39.0,1,2020-10-13,0.0,...,True,False,False,False,False,False,3.0,3.0,240.0,240.0
60721,#2659 522,Tuesday,21.0,USA,United Kingdom,M,54.4,1,2020-10-13,0.0,...,False,False,False,False,False,False,4.0,4.0,252.0,252.0


In [31]:
target = "Fraud"

X = df.drop(columns=[target, "DateTime", "Transaction ID", "Shipping Address"])
y = df[target]

X.shape, y.shape


((99999, 38), (99999,))

In [32]:
cutoff = df["DateTime"].quantile(0.8)

train = df[df["DateTime"] < cutoff]
test = df[df["DateTime"] >= cutoff]

X_train = train.drop(columns=[target, "DateTime", "Transaction ID", "Shipping Address"])
y_train = train[target]

X_test = test.drop(columns=[target, "DateTime", "Transaction ID", "Shipping Address"])
y_test = test[target]

len(train), len(test)


(77433, 22566)

In [39]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=200,
    max_depth=None,
    random_state=42,
    class_weight="balanced"
)

model.fit(X_train, y_train)


ValueError: could not convert string to float: 'Tuesday'

In [40]:
X_train.dtypes


Unnamed: 0,0
Day of Week,object
Amount,float64
Country of Residence,object
Gender,object
Age,float64
Hour,float64
Weekday,float64
IsNight,int64
LogAmount,float64
Type of Card_Visa,bool


In [41]:
# Drop columns that cannot be used directly
df_model = df.drop(columns=["Transaction ID", "DateTime", "Shipping Address"])

# Convert all text/object columns to numeric dummies
df_model = pd.get_dummies(df_model, drop_first=True)

df_model.head()


Unnamed: 0,Amount,Age,Fraud,Hour,Weekday,IsNight,LogAmount,Type of Card_Visa,Entry Mode_PIN,Entry Mode_Tap,...,AmtLast1H,AmtLast24H,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday,Country of Residence_India,Country of Residence_Russia,Country of Residence_USA,Country of Residence_United Kingdom,Gender_M
57097,210.0,56.5,1,0.0,1.0,1,5.351858,True,False,False,...,0.0,0.0,False,True,False,False,False,False,True,True
78458,8.0,54.4,1,0.0,1.0,1,2.197225,True,True,False,...,210.0,210.0,False,True,False,False,False,False,True,False
6392,22.0,26.5,1,0.0,1.0,1,3.135494,True,False,True,...,218.0,218.0,False,True,False,False,False,False,True,False
556,12.0,39.0,1,0.0,1.0,1,2.564949,True,False,True,...,240.0,240.0,False,True,False,False,False,False,True,True
60721,21.0,54.4,1,0.0,1.0,1,3.091042,True,False,False,...,252.0,252.0,False,True,False,False,False,False,True,True


In [42]:
from sklearn.metrics import classification_report, confusion_matrix

pred = model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, pred))

print("\nClassification Report:")
print(classification_report(y_test, pred, digits=4))


AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'

In [44]:
# Make a clean copy
df_clean = df.copy()

# Drop columns that cannot be used directly
df_clean = df_clean.drop(columns=["Transaction ID", "DateTime", "Shipping Address"], errors='ignore')

# Convert ALL object (string) columns to numeric dummies
df_model = pd.get_dummies(df_clean, drop_first=True)

print("All columns are now numeric:", df_model.dtypes.unique())
df_model.head()


All columns are now numeric: [dtype('float64') dtype('int64') dtype('bool')]


Unnamed: 0,Amount,Age,Fraud,Hour,Weekday,IsNight,LogAmount,Type of Card_Visa,Entry Mode_PIN,Entry Mode_Tap,...,AmtLast1H,AmtLast24H,Day of Week_Thursday,Day of Week_Tuesday,Day of Week_Wednesday,Country of Residence_India,Country of Residence_Russia,Country of Residence_USA,Country of Residence_United Kingdom,Gender_M
57097,210.0,56.5,1,0.0,1.0,1,5.351858,True,False,False,...,0.0,0.0,False,True,False,False,False,False,True,True
78458,8.0,54.4,1,0.0,1.0,1,2.197225,True,True,False,...,210.0,210.0,False,True,False,False,False,False,True,False
6392,22.0,26.5,1,0.0,1.0,1,3.135494,True,False,True,...,218.0,218.0,False,True,False,False,False,False,True,False
556,12.0,39.0,1,0.0,1.0,1,2.564949,True,False,True,...,240.0,240.0,False,True,False,False,False,False,True,True
60721,21.0,54.4,1,0.0,1.0,1,3.091042,True,False,False,...,252.0,252.0,False,True,False,False,False,False,True,True


In [45]:
cutoff = df["DateTime"].quantile(0.8)

train_idx = df[df["DateTime"] < cutoff].index
test_idx  = df[df["DateTime"] >= cutoff].index

X_train = df_model.loc[train_idx]
y_train = df.loc[train_idx, "Fraud"]

X_test = df_model.loc[test_idx]
y_test = df.loc[test_idx, "Fraud"]

# Final check
print("Any NaN?", X_train.isna().sum().sum())
print("Object types?", X_train.dtypes[X_train.dtypes=='object'])


Any NaN? 0
Object types? Series([], dtype: object)


In [46]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(
    n_estimators=200,
    class_weight="balanced",
    random_state=42
)

model.fit(X_train, y_train)
print("Model trained successfully!")


Model trained successfully!


In [47]:
has_trees = hasattr(model, "estimators_")
print("Model trained?", has_trees)


Model trained? True


In [48]:
from sklearn.metrics import classification_report, confusion_matrix

pred = model.predict(X_test)

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))


[[21753     0]
 [    0   813]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     21753
           1       1.00      1.00      1.00       813

    accuracy                           1.00     22566
   macro avg       1.00      1.00      1.00     22566
weighted avg       1.00      1.00      1.00     22566



In [49]:
from sklearn.metrics import classification_report, confusion_matrix

pred = model.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, pred))

print("\nClassification Report:")
print(classification_report(y_test, pred, digits=4))


Confusion Matrix:
[[21753     0]
 [    0   813]]

Classification Report:
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000     21753
           1     1.0000    1.0000    1.0000       813

    accuracy                         1.0000     22566
   macro avg     1.0000    1.0000    1.0000     22566
weighted avg     1.0000    1.0000    1.0000     22566



In [50]:
proba = model.predict_proba(X_test)[:,1]
results = test.copy()
results["Predicted_Prob"] = proba
results_sorted = results.sort_values("Predicted_Prob", ascending=False)
results_sorted.head()


Unnamed: 0,Transaction ID,Day of Week,Amount,Shipping Address,Country of Residence,Gender,Age,Fraud,DateTime,Hour,...,Bank_Halifax,Bank_Lloyds,Bank_Metro,Bank_Monzo,Bank_RBS,TxnsLast1H,TxnsLast24H,AmtLast1H,AmtLast24H,Predicted_Prob
81355,#3317 361,Wednesday,298.0,United Kingdom,United Kingdom,F,49.8,0,2020-10-14 18:00:00,18.0,...,False,True,False,False,False,1460.0,49060.0,168275.0,5552974.0,1.0
79678,#2757 406,Wednesday,191.0,India,India,M,43.9,0,2020-10-14 16:00:00,16.0,...,False,False,False,False,False,481.0,47852.0,57231.0,5406888.0,1.0
23761,#2765 578,Wednesday,82.0,United Kingdom,United Kingdom,M,58.4,0,2020-10-14 20:00:00,20.0,...,True,False,False,False,False,1008.0,48533.0,114726.0,5506092.0,1.0
18263,#3396 772,Wednesday,22.0,United Kingdom,United Kingdom,F,51.0,0,2020-10-14 18:00:00,18.0,...,False,False,False,False,False,1364.0,48964.0,159179.0,5543878.0,1.0
23120,#3394 443,Wednesday,165.0,United Kingdom,United Kingdom,F,45.2,0,2020-10-14 23:00:00,23.0,...,False,False,False,False,True,595.0,47904.0,65550.0,5438253.0,1.0


In [51]:
import joblib

joblib.dump(model, OUTPUT_DIR + "/fraud_model.joblib")
df_model.to_parquet(OUTPUT_DIR + "/final_features.parquet", index=False)

print("Model and features saved!")


Model and features saved!


In [52]:
results_sorted.to_csv(OUTPUT_DIR + "/fraud_predictions.csv", index=False)
