In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("Credit Card/fraudTrain.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [4]:
data.columns

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')

### Removing unnecessary columns from the data

In [5]:
data.drop(columns=['Unnamed: 0', 'cc_num', 'merchant', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num', 'unix_time'],inplace=True)

In [6]:
data.head()

Unnamed: 0,trans_date_trans_time,category,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud
0,2019-01-01 00:00:18,misc_net,4.97,28654,36.0788,-81.1781,3495,36.011293,-82.048315,0
1,2019-01-01 00:00:44,grocery_pos,107.23,99160,48.8878,-118.2105,149,49.159047,-118.186462,0
2,2019-01-01 00:00:51,entertainment,220.11,83252,42.1808,-112.262,4154,43.150704,-112.154481,0
3,2019-01-01 00:01:16,gas_transport,45.0,59632,46.2306,-112.1138,1939,47.034331,-112.561071,0
4,2019-01-01 00:03:06,misc_pos,41.96,24433,38.4207,-79.4629,99,38.674999,-78.632459,0


### transforming the date and time columns to hour, day of week and is_weekend

In [7]:
data['trans_date_trans_time'] = pd.to_datetime(data['trans_date_trans_time'])

In [8]:
data['hour'] = data['trans_date_trans_time'].dt.hour
data['day_of_week'] = data['trans_date_trans_time'].dt.day_of_week
data['is_weekend'] = data['day_of_week'].apply(lambda x: 1 if x == 7 or x == 6 else 0)

In [9]:
data.drop('trans_date_trans_time', axis=1, inplace=True)

In [10]:
data.tail()

Unnamed: 0,category,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,hour,day_of_week,is_weekend
1296670,entertainment,15.56,84735,37.7175,-112.4777,258,36.841266,-111.690765,0,12,6,1
1296671,food_dining,51.7,21790,39.2667,-77.5101,100,38.906881,-78.246528,0,12,6,1
1296672,food_dining,105.93,88325,32.9396,-105.8189,899,33.619513,-105.130529,0,12,6,1
1296673,food_dining,74.9,57756,43.3526,-102.5411,1126,42.78894,-103.24116,0,12,6,1
1296674,food_dining,4.3,59871,45.8433,-113.8748,218,46.565983,-114.18611,0,12,6,1


### Encoding the category column to integer

In [11]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [12]:
data['category_encoded'] = le.fit_transform(data['category'])

In [13]:
data.head()

Unnamed: 0,category,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,hour,day_of_week,is_weekend,category_encoded
0,misc_net,4.97,28654,36.0788,-81.1781,3495,36.011293,-82.048315,0,0,1,0,8
1,grocery_pos,107.23,99160,48.8878,-118.2105,149,49.159047,-118.186462,0,0,1,0,4
2,entertainment,220.11,83252,42.1808,-112.262,4154,43.150704,-112.154481,0,0,1,0,0
3,gas_transport,45.0,59632,46.2306,-112.1138,1939,47.034331,-112.561071,0,0,1,0,2
4,misc_pos,41.96,24433,38.4207,-79.4629,99,38.674999,-78.632459,0,0,1,0,9


In [14]:
## seeing what is encoded to what
mapping_dict = dict(zip(data['category'],data['category_encoded'] ))
print(mapping_dict)

{'misc_net': 8, 'grocery_pos': 4, 'entertainment': 0, 'gas_transport': 2, 'misc_pos': 9, 'grocery_net': 3, 'shopping_net': 11, 'shopping_pos': 12, 'food_dining': 1, 'personal_care': 10, 'health_fitness': 5, 'travel': 13, 'kids_pets': 7, 'home': 6}


### transforming amount column

In [15]:
data['amount'] = np.log1p(data['amt'])

In [16]:
data.head()

Unnamed: 0,category,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,hour,day_of_week,is_weekend,category_encoded,amount
0,misc_net,4.97,28654,36.0788,-81.1781,3495,36.011293,-82.048315,0,0,1,0,8,1.786747
1,grocery_pos,107.23,99160,48.8878,-118.2105,149,49.159047,-118.186462,0,0,1,0,4,4.684259
2,entertainment,220.11,83252,42.1808,-112.262,4154,43.150704,-112.154481,0,0,1,0,0,5.39866
3,gas_transport,45.0,59632,46.2306,-112.1138,1939,47.034331,-112.561071,0,0,1,0,2,3.828641
4,misc_pos,41.96,24433,38.4207,-79.4629,99,38.674999,-78.632459,0,0,1,0,9,3.760269


### operating the latitude and longitude to find distance

In [17]:
def haversine_km(lat1, lon1, lat2, lon2):
    r = 6371  # Earth radius in KM
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))
    return r * c

In [18]:
data['distance'] = haversine_km(data['lat'], data['long'],data['merch_lat'],data['merch_long'])

In [19]:
data.head()

Unnamed: 0,category,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,hour,day_of_week,is_weekend,category_encoded,amount,distance
0,misc_net,4.97,28654,36.0788,-81.1781,3495,36.011293,-82.048315,0,0,1,0,8,1.786747,78.597568
1,grocery_pos,107.23,99160,48.8878,-118.2105,149,49.159047,-118.186462,0,0,1,0,4,4.684259,30.212176
2,entertainment,220.11,83252,42.1808,-112.262,4154,43.150704,-112.154481,0,0,1,0,0,5.39866,108.206083
3,gas_transport,45.0,59632,46.2306,-112.1138,1939,47.034331,-112.561071,0,0,1,0,2,3.828641,95.673231
4,misc_pos,41.96,24433,38.4207,-79.4629,99,38.674999,-78.632459,0,0,1,0,9,3.760269,77.556744


In [20]:
df = data[['amount','distance','is_weekend','hour','day_of_week','category_encoded']]

In [21]:
df.head()

Unnamed: 0,amount,distance,is_weekend,hour,day_of_week,category_encoded
0,1.786747,78.597568,0,0,1,8
1,4.684259,30.212176,0,0,1,4
2,5.39866,108.206083,0,0,1,0
3,3.828641,95.673231,0,0,1,2
4,3.760269,77.556744,0,0,1,9


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 6 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   amount            1296675 non-null  float64
 1   distance          1296675 non-null  float64
 2   is_weekend        1296675 non-null  int64  
 3   hour              1296675 non-null  int32  
 4   day_of_week       1296675 non-null  int32  
 5   category_encoded  1296675 non-null  int64  
dtypes: float64(2), int32(2), int64(2)
memory usage: 49.5 MB


In [23]:
from sklearn.preprocessing import StandardScaler

X = data[['amount','distance','is_weekend','hour','day_of_week','category_encoded']]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

## Isolation forest for anomaly detection

In [26]:
from sklearn.ensemble import IsolationForest

In [42]:
islForest = IsolationForest(random_state=0,contamination=0.02)

In [56]:
islForest.fit(X_scaled);

In [45]:
data["is_outlier"] = islForest.predict(X_scaled)
data['anomaly_score'] = islForest.decision_function(X_scaled)

In [46]:
data['is_outlier'] = data['is_outlier'].map({-1: 1, 1: 0})

In [47]:
data.head()

Unnamed: 0,category,amt,zip,lat,long,city_pop,merch_lat,merch_long,is_fraud,hour,day_of_week,is_weekend,category_encoded,amount,distance,is_outlier,anomaly_score
0,misc_net,4.97,28654,36.0788,-81.1781,3495,36.011293,-82.048315,0,0,1,0,8,1.786747,78.597568,0,0.072763
1,grocery_pos,107.23,99160,48.8878,-118.2105,149,49.159047,-118.186462,0,0,1,0,4,4.684259,30.212176,0,0.085859
2,entertainment,220.11,83252,42.1808,-112.262,4154,43.150704,-112.154481,0,0,1,0,0,5.39866,108.206083,0,0.025808
3,gas_transport,45.0,59632,46.2306,-112.1138,1939,47.034331,-112.561071,0,0,1,0,2,3.828641,95.673231,0,0.121097
4,misc_pos,41.96,24433,38.4207,-79.4629,99,38.674999,-78.632459,0,0,1,0,9,3.760269,77.556744,0,0.087855


In [48]:
print(data[['anomaly_score', 'is_outlier']].head())
print(data['is_outlier'].value_counts())


   anomaly_score  is_outlier
0       0.072763           0
1       0.085859           0
2       0.025808           0
3       0.121097           0
4       0.087855           0
is_outlier
0    1270741
1      25934
Name: count, dtype: int64


In [49]:
pd.crosstab(data['is_outlier'], data['is_fraud'], normalize='columns')

is_fraud,0,1
is_outlier,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.980899,0.825473
1,0.019101,0.174527


## Using Ensemble Learning = XGBoost as model

In [51]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [52]:
features = [
    'amount','distance','is_weekend',
    'hour','day_of_week','category_encoded',
    'anomaly_score','is_outlier'
]

X = data[features]
y = data['is_fraud']

In [53]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
fraud_ratio = (y_train == 0).sum() / (y_train == 1).sum()

In [54]:
model = XGBClassifier(
    n_estimators=400,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=fraud_ratio,
    eval_metric='auc',
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [55]:
from sklearn.metrics import classification_report, roc_auc_score
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:,1]
print(classification_report(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    257834
           1       0.26      0.95      0.40      1501

    accuracy                           0.98    259335
   macro avg       0.63      0.97      0.70    259335
weighted avg       1.00      0.98      0.99    259335

ROC-AUC: 0.9968754085339562


## testing the new 5 lakhs rows of data

In [58]:
df = pd.read_csv("Credit Card/fraudTest.csv")

In [59]:
df.drop(columns=['Unnamed: 0', 'cc_num', 'merchant', 'first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob', 'trans_num', 'unix_time'],inplace=True)

## performing feature engineering

### modifying the date and time column

In [61]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['hour'] = df['trans_date_trans_time'].dt.hour
df['day_of_week'] = df['trans_date_trans_time'].dt.day_of_week
df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x == 7 or x == 6 else 0)
df.drop('trans_date_trans_time', axis=1, inplace=True)

### encoding the category column

In [62]:
df['category_encoded'] = le.fit_transform(df['category'])

### modifying the amount column

In [63]:
df['amount'] = np.log1p(df['amt'])

### calculating the distance the using lat and long

In [64]:
df['distance'] = haversine_km(df['lat'], df['long'],df['merch_lat'],df['merch_long'])

### standardizing the dataset for the anomaly detection

In [65]:
X_test = df[['amount','distance','is_weekend','hour','day_of_week','category_encoded']]
X_test_scaled = scaler.fit_transform(X_test)

In [66]:
islForest.fit(X_test_scaled);

df["is_outlier"] = islForest.predict(X_test_scaled)
df['anomaly_score'] = islForest.decision_function(X_test_scaled)
df['is_outlier'] = df['is_outlier'].map({-1: 1, 1: 0})

print(df[['anomaly_score', 'is_outlier']].head())
print(df['is_outlier'].value_counts())
pd.crosstab(df['is_outlier'], df['is_fraud'], normalize='columns')

   anomaly_score  is_outlier
0      -0.008297           1
1       0.069173           0
2       0.085715           0
3       0.038074           0
4       0.012473           0
is_outlier
0    544604
1     11115
Name: count, dtype: int64


is_fraud,0,1
is_outlier,Unnamed: 1_level_1,Unnamed: 2_level_1
0,0.980496,0.851748
1,0.019504,0.148252


### creating the dataset for model

In [67]:
features = [
    'amount','distance','is_weekend',
    'hour','day_of_week','category_encoded',
    'anomaly_score','is_outlier'
]
X = df[features]
y = df['is_fraud']

In [68]:
model.fit(X, y)

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


### evalauting the model performance

In [70]:
y_pred = model.predict(X)
y_prob = model.predict_proba(X)[:,1]
print(classification_report(y, y_pred))
print("ROC-AUC:", roc_auc_score(y, y_prob))

              precision    recall  f1-score   support

           0       1.00      0.99      0.99    553574
           1       0.26      1.00      0.41      2145

    accuracy                           0.99    555719
   macro avg       0.63      0.99      0.70    555719
weighted avg       1.00      0.99      0.99    555719

ROC-AUC: 0.9991856078133613


### testing some more data from GPT

In [71]:
normal_tests = [
    {"amount": 120.0,  "distance": 2.1,  "is_weekend": 0, "hour": 14, "day_of_week": 2, "category_encoded": 3},
    {"amount": 350.0,  "distance": 4.8,  "is_weekend": 0, "hour": 18, "day_of_week": 4, "category_encoded": 1},
    {"amount": 90.0,   "distance": 1.2,  "is_weekend": 1, "hour": 11, "day_of_week": 6, "category_encoded": 2},
    {"amount": 600.0,  "distance": 8.5,  "is_weekend": 1, "hour": 20, "day_of_week": 5, "category_encoded": 4},
]
fraud_tests = [
    {"amount": 85000.0, "distance": 320.0, "is_weekend": 0, "hour": 3,  "day_of_week": 1, "category_encoded": 7},
    {"amount": 120000.0,"distance": 750.0, "is_weekend": 1, "hour": 2,  "day_of_week": 6, "category_encoded": 9},
    {"amount": 45000.0, "distance": 1000.0,"is_weekend": 0, "hour": 1,  "day_of_week": 3, "category_encoded": 5},
    {"amount": 98000.0, "distance": 420.0, "is_weekend": 1, "hour": 4,  "day_of_week": 0, "category_encoded": 8},
]
test_df = pd.DataFrame(normal_tests + fraud_tests)

In [74]:
# same feature list used in Isolation Forest
iso_features = [
    'amount','distance','is_weekend',
    'hour','day_of_week','category_encoded'
]

# scale test data (VERY IMPORTANT)
test_scaled = scaler.transform(test_df[iso_features])

# compute anomaly features
test_df['anomaly_score'] = islForest.decision_function(test_scaled)
test_df['is_outlier'] = islForest.predict(test_scaled)
test_df['is_outlier'] = test_df['is_outlier'].map({-1: 1, 1: 0})


In [76]:
xgb_features = [
    'amount','distance','is_weekend',
    'hour','day_of_week','category_encoded',
    'anomaly_score','is_outlier'
]

test_df['fraud_probability'] = model.predict_proba(test_df[xgb_features])[:,1]
THRESHOLD = 0.8

test_df['prediction'] = (test_df['fraud_probability'] >= THRESHOLD).astype(int)

test_df['label'] = test_df['prediction'].map({
    0: '✅ Normal',
    1: '⚠️ Fraud'
})


test_df['label'] = test_df['prediction'].map({
    0: '✅ Normal',
    1: '⚠️ Fraud'
})

test_df[xgb_features + ['fraud_probability','label']]

Unnamed: 0,amount,distance,is_weekend,hour,day_of_week,category_encoded,anomaly_score,is_outlier,fraud_probability,label
0,120.0,2.1,0,14,2,3,-0.005186,1,0.980958,⚠️ Fraud
1,350.0,4.8,0,18,4,1,-0.019321,1,0.151695,✅ Normal
2,90.0,1.2,1,11,6,2,-0.061818,1,0.726595,✅ Normal
3,600.0,8.5,1,20,5,4,-0.055269,1,0.988545,⚠️ Fraud
4,85000.0,320.0,0,3,1,7,-0.025958,1,0.896291,⚠️ Fraud
5,120000.0,750.0,1,2,6,9,-0.076643,1,0.762631,✅ Normal
6,45000.0,1000.0,0,1,3,5,-0.02452,1,0.93676,⚠️ Fraud
7,98000.0,420.0,1,4,0,8,-0.066538,1,0.041027,✅ Normal
