In [2]:
# Load the fraudTrain.csv transactions into `df`; every later training cell works on this frame.
df = pd.read_csv(filepath_or_buffer="/fraud-detection/fraudTrain.csv")

In [3]:
# Preview the `amt` column (all rows, single column) as the cell's output.
df.loc[:, "amt"]

0            4.97
1          107.23
2          220.11
3           45.00
4           41.96
            ...  
1296670     15.56
1296671     51.70
1296672    105.93
1296673     74.90
1296674      4.30
Name: amt, Length: 1296675, dtype: float64

In [4]:
# Per-column overview: one row per feature, combining the categorical
# statistics (unique/top/freq) and the numeric ones (mean/std/quantiles)
# that `describe(include='all')` reports. Statistics that do not apply to a
# column's dtype show up as NaN.
numeric_stats = df.describe(include='all')

# Display label -> row label in the describe() output, in display order.
stat_rows = {
    'Unique': 'unique',
    'Freq': 'freq',
    'Top': 'top',
    'Mean': 'mean',
    'Std Dev': 'std',
    'Min': 'min',
    '25%': '25%',
    'Median': '50%',
    '75%': '75%',
    'Max': 'max',
}

# Non-null counts come from df.count() rather than the describe() output.
summary_data = {'Feature': df.columns, 'Count': df.count().values}
summary_data.update(
    {label: numeric_stats.loc[row].values for label, row in stat_rows.items()}
)
summary_table = pd.DataFrame(summary_data)

summary_table

Unnamed: 0,Feature,Count,Unique,Freq,Top,Mean,Std Dev,Min,25%,Median,75%,Max
0,Unnamed: 0,1296675,,,,648337.0,374317.974488,0.0,324168.5,648337.0,972505.5,1296674.0
1,trans_date_trans_time,1296675,1274791.0,4.0,2019-04-22 16:02:01,,,,,,,
2,cc_num,1296675,,,,4.1719204207972666e+17,1.3088064470002404e+18,60416207185.0,180042946491150.0,3521417320836166.0,4642255475285942.0,4.992346398065154e+18
3,merchant,1296675,693.0,4403.0,fraud_Kilback LLC,,,,,,,
4,category,1296675,14.0,131659.0,gas_transport,,,,,,,
5,amt,1296675,,,,70.351035,160.316039,1.0,9.65,47.52,83.14,28948.9
6,first,1296675,352.0,26669.0,Christopher,,,,,,,
7,last,1296675,481.0,28794.0,Smith,,,,,,,
8,gender,1296675,2.0,709863.0,F,,,,,,,
9,street,1296675,983.0,3123.0,0069 Robin Brooks Apt. 695,,,,,,,


In [5]:
# Columns excluded from the feature set; the next cell removes them from `df`.
columns_to_drop = [
    "Unnamed: 0", "cc_num",
    "first", "last", "street",
    "merchant", "trans_num",
    "unix_time", "zip",
]


In [6]:
# Remove the excluded columns from `df` in place, then report the remaining
# (rows, columns). Re-running this cell raises KeyError because the columns
# are already gone.
df.drop(labels=columns_to_drop, axis="columns", inplace=True)
print(df.shape)

(1296675, 14)


### Handling Date and Time

In [7]:
# Parse the raw timestamp strings into datetime values.
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
# Calendar date of each transaction (time-of-day set to midnight).
# `.dt.normalize()` produces the same values as formatting to '%Y-%m-%d' and
# re-parsing, without a string round-trip over every row.
df['trans_date'] = df['trans_date_trans_time'].dt.normalize()
# Parse `dob` so it can be subtracted from `trans_date`.
df['dob'] = pd.to_datetime(df['dob'])

In [8]:
# Age at the time of the transaction, measured in whole days (not years):
# the gap between the transaction date and `dob`.
df["age"] = df["trans_date"].sub(df["dob"]).dt.days

In [9]:
# Calendar month and year of each transaction, taken from `trans_date`.
df['trans_month'] = df['trans_date'].dt.month
df['trans_year'] = df['trans_date'].dt.year

In [10]:
# Distance between merchant and home location, expressed as the absolute
# difference of each coordinate rounded to 3 decimals (a per-axis offset,
# not a great-circle distance).
df['latitudinal_distance'] = (df['merch_lat'] - df['lat']).round(3).abs()
df['longitudinal_distance'] = (df['merch_long'] - df['long']).round(3).abs()

In [11]:
# Raw columns that are no longer needed once the engineered features exist;
# they are removed from `df` in place.
drop_columns = [
    'trans_date_trans_time', 'city', 'lat', 'long', 'job',
    'dob', 'merch_lat', 'merch_long', 'trans_date', 'state',
]
df.drop(labels=drop_columns, axis="columns", inplace=True)

### Columns to Categorise

In [12]:
# Binary-encode gender: "M" -> 1, anything else -> 0.
df['gender'] = (df['gender'] == "M").astype('int64')
# One-hot encode `category` into `category_<value>` indicator columns.
df = pd.get_dummies(df, prefix='category', columns=['category'])

# Start Training

In [13]:
# Split the prepared frame into the target (`is_fraud`) and the feature matrix
# (every other column).
y_train = df['is_fraud']
X_train = df.drop(columns=['is_fraud'])

### Handling Data Imbalance

In [14]:
# Show the first ten feature rows before any resampling is applied.
X_train.iloc[:10]

Unnamed: 0,amt,gender,city_pop,age,trans_month,trans_year,latitudinal_distance,longitudinal_distance,category_entertainment,category_food_dining,...,category_grocery_pos,category_health_fitness,category_home,category_kids_pets,category_misc_net,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel
0,4.97,0,3495,11255,1,2019,0.068,0.87,False,False,...,False,False,False,False,True,False,False,False,False,False
1,107.23,0,149,14804,1,2019,0.271,0.024,False,False,...,True,False,False,False,False,False,False,False,False,False
2,220.11,1,4154,20801,1,2019,0.97,0.108,True,False,...,False,False,False,False,False,False,False,False,False,False
3,45.0,1,1939,18982,1,2019,0.804,0.447,False,False,...,False,False,False,False,False,False,False,False,False,False
4,41.96,1,99,11967,1,2019,0.254,0.83,False,False,...,False,False,False,False,False,True,False,False,False,False
5,94.63,0,2158,21015,1,2019,0.278,0.948,False,False,...,False,False,False,False,False,False,False,False,False,False
6,44.54,0,2691,9269,1,2019,0.83,0.836,False,False,...,False,False,False,False,False,False,False,False,False,False
7,71.65,1,6018,26066,1,2019,0.105,0.06,False,False,...,False,False,False,False,False,False,False,False,False,False
8,4.27,0,1472,28424,1,2019,0.016,0.297,False,False,...,False,False,False,False,False,True,False,False,False,False
9,198.39,0,151785,16350,1,2019,0.657,0.136,False,False,...,True,False,False,False,False,False,False,False,False,False


In [15]:
from imblearn.over_sampling import SMOTE

# Oversample the training data with SMOTE; the fixed seed keeps the synthetic
# rows reproducible. `X_train` / `y_train` are replaced by the resampled
# versions, so re-running this cell resamples the already-resampled data.
# NOTE(review): the features here are unscaled and include one-hot category
# columns; plain SMOTE interpolates between neighbours, so SMOTENC may be a
# better fit for this mix -- confirm before relying on the synthetic rows.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X=X_train, y=y_train)

In [16]:
from sklearn.preprocessing import StandardScaler

# Learn per-feature mean/variance from the (resampled) training data and
# standardise it in one step. The fitted `scaler` is kept so the test set can
# be transformed with the same statistics later.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

## Prepare Test Set

In [17]:
# Prepare the test set with the same feature engineering as the training frame
# `df`, then scale it with the scaler fitted on the training data.

# Raw columns that are removed once the engineered features exist.
drop_columns = ['trans_date_trans_time','city','lat','long','job','dob','merch_lat','merch_long','trans_date','state']

test_df = pd.read_csv('/kaggle/input/fraud-detection/fraudTest.csv')

# Date handling: parse the timestamp, keep the calendar date (midnight), parse dob.
test_df['trans_date_trans_time'] = pd.to_datetime(test_df['trans_date_trans_time'])
test_df['trans_date'] = test_df['trans_date_trans_time'].dt.normalize()
test_df['dob'] = pd.to_datetime(test_df['dob'])

# Age at transaction time, in days (same unit as the training feature).
test_df["age"] = (test_df["trans_date"] - test_df["dob"]).dt.days

# FIX: the training frame has `trans_month` / `trans_year`, but they were never
# derived here, so the reindex below silently filled both with 0 for every
# test row. Derive them before `trans_date` is dropped.
test_df['trans_month'] = test_df['trans_date'].dt.month
test_df['trans_year'] = test_df['trans_date'].dt.year

# Absolute merchant/home coordinate differences, rounded to 3 decimals.
test_df['latitudinal_distance'] = abs(round(test_df['merch_lat'] - test_df['lat'], 3))
test_df['longitudinal_distance'] = abs(round(test_df['merch_long'] - test_df['long'], 3))

test_df.drop(columns=drop_columns, inplace=True)

# Same encodings as training: gender -> {M: 1, other: 0}, one-hot `category`.
test_df['gender'] = (test_df['gender'] == "M").astype('int64')
test_df = pd.get_dummies(test_df, columns=['category'], prefix='category')

# Only one-hot `category_*` columns may legitimately be absent from the test
# data (a category that never occurs); any other missing column would be
# zero-filled by the reindex and silently corrupt the features.
missing_features = [
    col for col in df.columns
    if col not in test_df.columns and not col.startswith('category_')
]
assert not missing_features, f"test set is missing engineered features: {missing_features}"

# Align to the training columns (order and set); extra test-only columns are
# discarded and absent category indicators become 0.
test_df = test_df.reindex(columns=df.columns, fill_value=0)

X_test = test_df.drop('is_fraud', axis=1)
y_test = test_df['is_fraud']

# Reuse the statistics learned from the training data; never refit on test data.
X_test = scaler.transform(X_test)

## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier

# Train a 100-tree random forest on the resampled, scaled training data
# (seeded for reproducibility); `fit` returns the estimator itself.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Predicted labels for the prepared test set.
y_pred = clf.predict(X_test)

In [None]:
# Imported here, like the other sklearn names in this notebook, so the cell
# runs on a fresh kernel (no earlier visible cell imports accuracy_score).
from sklearn.metrics import accuracy_score

# Fraction of test rows whose predicted label matches the true label.
# NOTE(review): accuracy alone can look high when one class dominates the test
# labels; consider reporting precision/recall for the fraud class as well.
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")