# FRAUD DETECTION 

## Phase 1 -----(step 1.1[BASIC EDA ANALYSIS])

In [1]:
import pandas as pd
import pandas.api.types
import numpy as np
import warnings
from EDA_Detection.cleaning_func import reduce_mem_usage
import gc
from file_path import TRAIN_ID,TRAIN_TRANS

warnings.filterwarnings('ignore')

C:\Users\Adminn\fraud_detection\Files\train_identity.csv


### Define the columns we want to KEEP for the first run

In [4]:
trans_cols_to_keep = [
    'TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
    'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain', 'dist1', 'dist2'
]
# Add C, D, and M columns
trans_cols_to_keep.extend([f'C{i}' for i in range(1, 15)])
trans_cols_to_keep.extend([f'D{i}' for i in range(1, 16)])
trans_cols_to_keep.extend([f'M{i}' for i in range(1, 10)])

# Define dtypes for the columns we are keeping (using float32 for compatibility)
dtypes_trans_essentials = {
    'TransactionAmt': 'float32', 'isFraud': 'int8', 'TransactionDT': 'int32',
    'TransactionID': 'int32', 'ProductCD': 'category', 'card1': 'int16',
    'card2': 'float32', 'card3': 'float32', 'card4': 'category',
    'card5': 'float32', 'card6': 'category', 'addr1': 'float32',
    'addr2': 'float32', 'dist1': 'float32', 'dist2': 'float32',
    'P_emaildomain': 'category', 'R_emaildomain': 'category'
}
dtypes_trans_essentials.update({col: 'float32' for col in [f'C{i}' for i in range(1, 15)]})
dtypes_trans_essentials.update({col: 'float32' for col in [f'D{i}' for i in range(1, 16)]})
dtypes_trans_essentials.update({col: 'category' for col in [f'M{i}' for i in range(1, 10)]})


## --- Load and Optimize Essential Data ---

In [None]:
print("Loading train_transaction.csv (ESSENTIAL COLUMNS ONLY)...")
# Use 'usecols' to restrict columns loaded
train_trans = pd.read_csv(TRAIN_TRANS, dtype=dtypes_trans_essentials, usecols=trans_cols_to_keep)
train_trans = reduce_mem_usage(train_trans)  # Optimize memory AFTER loading

print("\nLoading train_identity.csv...")
# Identity file is smaller, load it fully and then optimize
train_id = pd.read_csv(TRAIN_ID, dtype={'TransactionID': 'int32'})
train_id = reduce_mem_usage(train_id)

print("\nMerging dataframes...")
train_data = train_trans.merge(train_id, on='TransactionID', how="left")

# Clean up memory
del train_trans, train_id
gc.collect()

Loading train_transaction.csv (ESSENTIAL COLUMNS ONLY)...


## EDA ANALYSIS

In [7]:
print("\n--- EDA RESULTS ---")
print("Dataset Shape:", train_data.shape)


--- EDA RESULTS ---
Dataset Shape: (590540, 95)


In [8]:
print("\nFirst few rows:")
print(train_data.head())



First few rows:
   TransactionID  isFraud  TransactionDT  TransactionAmt ProductCD  card1  \
0        2987000        0          86400            68.5         W  13926   
1        2987001        0          86401            29.0         W   2755   
2        2987002        0          86469            59.0         W   4663   
3        2987003        0          86499            50.0         W  18132   
4        2987004        0          86506            50.0         H   4497   

   card2  card3       card4  card5  ...                id_31  id_32  \
0    NaN  150.0    discover  142.0  ...                  NaN    NaN   
1  404.0  150.0  mastercard  102.0  ...                  NaN    NaN   
2  490.0  150.0        visa  166.0  ...                  NaN    NaN   
3  567.0  150.0  mastercard  117.0  ...                  NaN    NaN   
4  514.0  150.0  mastercard  102.0  ...  samsung browser 6.2   32.0   

       id_33           id_34  id_35 id_36 id_37  id_38  DeviceType  \
0        NaN           

###  Target Variable Distribution

In [9]:
print("\nFraud Distribution:")
print(train_data['isFraud'].value_counts())
print("\nFraud Percentage:")
print(train_data['isFraud'].value_counts(normalize=True) * 100)



Fraud Distribution:
isFraud
0    569877
1     20663
Name: count, dtype: int64

Fraud Percentage:
isFraud
0    96.500999
1     3.499001
Name: proportion, dtype: float64


### Missing Values Analysis

In [10]:
print("\nMissing Values (Top 20):")
missing = train_data.isnull().sum().sort_values(ascending=False).head(20)
print(missing)


Missing Values (Top 20):
id_24    585793
id_25    585408
id_07    585385
id_08    585385
id_21    585381
id_26    585377
id_22    585371
id_23    585371
id_27    585371
dist2    552913
D7       551623
id_18    545427
D13      528588
D14      528353
D12      525823
id_03    524216
id_04    524216
D6       517353
id_33    517251
D8       515614
dtype: int64


### Data Types

In [11]:
print("\nData Types Distribution:")
print(train_data.dtypes.value_counts())


Data Types Distribution:
float16     60
category    12
category     4
int32        2
category     1
category     1
category     1
category     1
category     1
category     1
category     1
category     1
category     1
int8         1
category     1
category     1
category     1
category     1
int16        1
category     1
category     1
Name: count, dtype: int64


### Basic Statistics

In [12]:
print("\nTransaction Amount Stats:")
print(train_data['TransactionAmt'].describe())


Transaction Amount Stats:
count    590540.000000
mean               NaN
std           0.000000
min           0.250977
25%          43.312500
50%          68.750000
75%         125.000000
max       31936.000000
Name: TransactionAmt, dtype: float64
