In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
import joblib

In [2]:
# Load the raw transactions from the CSV next to the notebook (relative path).
df = pd.read_csv(filepath_or_buffer="creditcard.csv")

In [3]:
# Peek at the first five rows to sanity-check the column layout.
df.head(n=5)

Unnamed: 0,TransactionID,TransactionDate,Amount,MerchantID,TransactionType,Location,IsFraud
0,1,2024-04-03 14:15:35.462794,4189.27,688,refund,San Antonio,0
1,2,2024-03-19 13:20:35.462824,2659.71,109,refund,Dallas,0
2,3,2024-01-08 10:08:35.462834,784.0,394,purchase,New York,0
3,4,2024-04-13 23:50:35.462850,3514.4,944,purchase,Philadelphia,0
4,5,2024-07-12 18:51:35.462858,369.07,475,purchase,Phoenix,0


In [4]:
# Summarise column dtypes and non-null counts; TransactionDate is still a
# plain object (string) column at this point and is parsed later.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   TransactionID    100000 non-null  int64  
 1   TransactionDate  100000 non-null  object 
 2   Amount           100000 non-null  float64
 3   MerchantID       100000 non-null  int64  
 4   TransactionType  100000 non-null  object 
 5   Location         100000 non-null  object 
 6   IsFraud          100000 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 5.3+ MB


In [5]:
# Descriptive statistics for the numeric columns only (IDs, Amount, IsFraud).
df.describe()

Unnamed: 0,TransactionID,Amount,MerchantID,IsFraud
count,100000.0,100000.0,100000.0,100000.0
mean,50000.5,2497.092666,501.67607,0.01
std,28867.657797,1442.415999,288.715868,0.099499
min,1.0,1.05,1.0,0.0
25%,25000.75,1247.955,252.0,0.0
50%,50000.5,2496.5,503.0,0.0
75%,75000.25,3743.5925,753.0,0.0
max,100000.0,4999.77,1000.0,1.0


In [6]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

Unnamed: 0,0
TransactionID,0
TransactionDate,0
Amount,0
MerchantID,0
TransactionType,0
Location,0
IsFraud,0


In [7]:
# Class balance of the target as proportions rather than raw counts.
df.loc[:, 'IsFraud'].value_counts(normalize=True)

Unnamed: 0_level_0,proportion
IsFraud,Unnamed: 1_level_1
0,0.99
1,0.01


In [8]:
# Derive the hour of day from the timestamp string; it is the only
# time-based feature fed to the model.
df['TransactionHour'] = pd.to_datetime(df['TransactionDate']).dt.hour

# Column groups consumed by the preprocessing ColumnTransformer.
numeric_features = ['Amount', 'TransactionHour']
categorical_features = ['MerchantID', 'TransactionType', 'Location']

# Features: everything except the row identifier, the raw timestamp and the label.
X = df.drop(columns=['TransactionID', 'TransactionDate', 'IsFraud'])
y = df['IsFraud']

# Stratified 80/20 hold-out so both splits keep the same fraud rate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. handle_unknown='ignore' makes categories that were not
# seen during fit encode as all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# imblearn's Pipeline applies the SMOTE step only while fitting, so the
# minority class is oversampled on the training data and never at predict time.
# SMOTE is seeded like the split and the forest; without a random_state the
# synthetic samples (and therefore the fitted model) change on every run.
pipeline = Pipeline([
    ('preprocess', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(n_estimators=150, random_state=42))
])

In [10]:
# Fit preprocessing, oversampling and the classifier on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [11]:
# Score the held-out split.
preds = pipeline.predict(X_test)

# classification_report returns a string. As a bare expression that is not the
# last statement of the cell its value was silently discarded, so print it.
# With roughly 1% positives, per-class precision/recall matter far more than
# overall accuracy.
print(classification_report(y_test, preds, output_dict=False))

# Rows are true classes, columns are predicted classes (label order 0, 1).
confusion_matrix(y_test, preds)

array([[19774,    26],
       [  199,     1]])

In [12]:
# Persist the whole fitted pipeline (preprocessing + model) as one artifact.
joblib.dump(value=pipeline, filename="model.joblib")

['model.joblib']

In [13]:
# Spot-check the fitted pipeline on the first five held-out rows.
sample = X_test.head(5)

# Column 1 of predict_proba is the probability of the positive (fraud) class.
sample_probs = pipeline.predict_proba(sample)[:, 1]
sample_preds = pipeline.predict(sample)

# Side-by-side view of the hard label and the underlying probability.
pd.DataFrame(
    {
        "Sample Index": sample.index,
        "Prediction (1=Fraud)": sample_preds,
        "Fraud Probability": sample_probs,
    }
)

Unnamed: 0,Sample Index,Prediction (1=Fraud),Fraud Probability
0,38609,0,0.0
1,1321,0,0.0
2,19671,0,0.0
3,76220,0,0.006667
4,70146,0,0.006667
