In [None]:
import sys
import warnings
from pathlib import Path

# The `src.*` imports below are resolved through this path entry (presumably
# the repository root, one level above the notebook), so it has to be added
# BEFORE they run; otherwise a fresh kernel fails with ModuleNotFoundError.
sys.path.append('..')

import pandas as pd

from src.data_loader import load_fraud_data, load_country_mapping
from src.geolocation import add_ip_integer_columns, merge_with_country
from src.feature_engineering import create_time_features, analyze_fraud_by_country

In [None]:
# Load both raw inputs up front; every later cell works from these two frames.
print("Loading data...")
fraud_df, country_df = load_fraud_data(), load_country_mapping()

# Report (rows, columns) of each frame as a quick sanity check on the load.
print(f"Fraud data shape: {fraud_df.shape}")
print(f"Country mapping shape: {country_df.shape}")

Loading data...
Fraud data shape: (151112, 11)
Country mapping shape: (138846, 3)


In [None]:
# Step 1: attach a country to each transaction by matching its IP address
# against the IP-range lookup table in `country_df`.
print("\n=== STEP 1: GEOLOCATION INTEGRATION ===")

# Show the raw IP columns first so their dtype/format is visible before the
# conversion step runs.
print("Sample IP addresses from fraud data:")
print(fraud_df['ip_address'].head())

print("\nSample IP ranges from country mapping:")
print(country_df['lower_bound_ip_address'].head())
print(country_df['upper_bound_ip_address'].head())

print("\nConverting IP addresses to integers...")
fraud_df, country_df = add_ip_integer_columns(fraud_df, country_df)

print("\nMerging with country data...")
merged_df = merge_with_country(fraud_df, country_df)

total_transactions = len(merged_df)
matched_transactions = int(merged_df['country'].notna().sum())
# Guard the ratio so an empty frame reports 0.0% instead of dividing by zero.
match_rate = (matched_transactions / total_transactions * 100
              if total_transactions else 0.0)
print(f"After merging: {total_transactions} transactions")
print(f"Success rate: {match_rate:.1f}%")

print("\nSample of merged data with countries:")
sample_cols = ['user_id', 'ip_address', 'country', 'class']
print(merged_df[sample_cols].head(10))

# A merge that matches nothing leaves every 'country' value missing, which
# makes the country-level analysis empty and writes an empty column to the
# processed file. Surface that loudly instead of letting it pass as a 0.0% line.
if matched_transactions == 0:
    warnings.warn(
        f"Geolocation merge matched 0 of {total_transactions} transactions: "
        "every 'country' value is missing. Check the IP conversion in "
        "add_ip_integer_columns() / merge_with_country() against the dtype of "
        "the IP columns printed above before relying on country-level results.",
        RuntimeWarning,
    )


=== STEP 1: GEOLOCATION INTEGRATION ===
Sample IP addresses from fraud data:
0    7.327584e+08
1    3.503114e+08
2    2.621474e+09
3    3.840542e+09
4    4.155831e+08
Name: ip_address, dtype: float64

Sample IP ranges from country mapping:
0    16777216.0
1    16777472.0
2    16777728.0
3    16778240.0
4    16779264.0
Name: lower_bound_ip_address, dtype: float64
0    16777471
1    16777727
2    16778239
3    16779263
4    16781311
Name: upper_bound_ip_address, dtype: int64

Converting IP addresses to integers...
Converting 151112 fraud IPs to integers...
Converting 138846 country mapping IPs to integers...

Merging with country data...
Valid fraud IPs: 0/151112
Valid country mappings: 0/138846
Error: No valid IPs to merge
After merging: 151112 transactions
Success rate: 0.0%

Sample of merged data with countries:
   user_id    ip_address country  class
0    22058  7.327584e+08    None      0
1   333320  3.503114e+08    None      0
2     1359  2.621474e+09    None      1
3   150084  3.

In [None]:
# Per-country fraud statistics computed from the merged frame.
print("\n=== Fraud Analysis by Country ===")
country_stats = analyze_fraud_by_country(merged_df)

if country_stats.empty:
    # Nothing to rank; say so once rather than printing two empty frames.
    print("\nNo country statistics available: "
          "analyze_fraud_by_country() returned an empty frame.")
else:
    print("\nTop 10 countries by fraud percentage:")
    print(country_stats.head(10))

    print("\nBottom 10 countries by fraud percentage:")
    print(country_stats.tail(10))


=== Fraud Analysis by Country ===

Top 10 countries by fraud percentage:
Empty DataFrame
Columns: [country, total_transactions, fraud_count, fraud_percentage]
Index: []

Bottom 10 countries by fraud percentage:
Empty DataFrame
Columns: [country, total_transactions, fraud_count, fraud_percentage]
Index: []


In [None]:
# Step 2: derive time-based features from the signup/purchase timestamps.
print("\n=== STEP 2: FEATURE ENGINEERING ===")
print("Creating time-based features...")
merged_df = create_time_features(merged_df)

# Single source of truth for the new column names, so the printed list and the
# columns shown in the sample below cannot drift apart.
new_features = ['time_since_signup_hours', 'hour_of_day', 'day_of_week']

print("\nNew features created:")
for position, feature_name in enumerate(new_features, start=1):
    print(f"{position}. {feature_name}")

print("\nSample of new features:")
sample_cols = ['user_id', 'purchase_time', 'signup_time', *new_features,
               'country', 'class']
print(merged_df[sample_cols].head())


=== STEP 2: FEATURE ENGINEERING ===
Creating time-based features...

New features created:
1. time_since_signup_hours
2. hour_of_day
3. day_of_week

Sample of new features:
   user_id       purchase_time         signup_time  time_since_signup_hours  \
0    22058 2015-04-18 02:47:11 2015-02-24 22:55:49              1251.856111   
1   333320 2015-06-08 01:38:54 2015-06-07 20:39:50                 4.984444   
2     1359 2015-01-01 18:52:45 2015-01-01 18:52:44                 0.000278   
3   150084 2015-05-04 13:54:50 2015-04-28 21:13:25               136.690278   
4   221365 2015-09-09 18:40:53 2015-07-21 07:09:52              1211.516944   

   hour_of_day  day_of_week country  class  
0            2            5    None      0  
1            1            0    None      0  
2           18            3    None      1  
3           13            0    None      0  
4           18            2    None      0  


In [None]:
# Summary statistics of the signup-to-purchase delay, overall and split by the
# 'class' label (0 and 1).
print("\n=== Analysis of time_since_signup ===")

# Select the column once and filter it with boolean masks, rather than chaining
# df[mask]['col'] lookups for every statistic.
signup_hours = merged_df['time_since_signup_hours']
print(f"Minimum time since signup: {signup_hours.min():.2f} hours")
print(f"Maximum time since signup: {signup_hours.max():.2f} hours")
print(f"Average time since signup: {signup_hours.mean():.2f} hours")

legit_mean_hours = signup_hours[merged_df['class'] == 0].mean()
fraud_mean_hours = signup_hours[merged_df['class'] == 1].mean()
print(f"\nFor legitimate transactions: {legit_mean_hours:.2f} hours")
print(f"For fraud transactions: {fraud_mean_hours:.2f} hours")


=== Analysis of time_since_signup ===
Minimum time since signup: 0.00 hours
Maximum time since signup: 2879.99 hours
Average time since signup: 1370.01 hours

For legitimate transactions: 1441.99 hours
For fraud transactions: 673.29 hours


In [None]:
# Persist the merged frame with the engineered features.
print("\n=== Saving Processed Data ===")

# Define the destination once so the file written and the path reported are
# guaranteed to be the same.
output_path = '../data/processed/fraud_data_with_features.csv'

# to_csv does not create missing parent directories; make sure the target
# folder exists so the save also works on a fresh checkout.
Path(output_path).parent.mkdir(parents=True, exist_ok=True)

merged_df.to_csv(output_path, index=False)
print(f"Saved to: {output_path}")


=== Saving Processed Data ===
Saved to: ../data/processed/fraud_data_with_features.csv


In [None]:
# Class balance of the final frame. The sum of 'class' is used as the number of
# fraud rows (assumes the label is coded 0/1 -- TODO confirm against the data
# loader).
print("\n=== FINAL CLASS IMBALANCE CHECK ===")
fraud_count = merged_df['class'].sum()
total_count = len(merged_df)
legit_count = total_count - fraud_count
# Guard against an empty frame so the percentage is 0.0 rather than a
# division by zero.
fraud_percentage = fraud_count / total_count * 100 if total_count else 0.0

print(f"Total transactions: {total_count}")
print(f"Fraudulent transactions: {fraud_count}")
print(f"Legitimate transactions: {legit_count}")
print(f"Fraud percentage: {fraud_percentage:.4f}%")

if fraud_count:
    print(
        f"Imbalance ratio: 1 fraud per {int(legit_count / fraud_count)} legitimate transactions")
else:
    # With no fraud rows the ratio is undefined; report that instead of failing
    # on the division.
    print("Imbalance ratio: undefined (no fraudulent transactions found)")


=== FINAL CLASS IMBALANCE CHECK ===
Total transactions: 151112
Fraudulent transactions: 14151
Legitimate transactions: 136961
Fraud percentage: 9.3646%
Imbalance ratio: 1 fraud per 9 legitimate transactions


In [None]:
# Plan for handling the class imbalance in the modelling task.
# The fraud share is interpolated from `fraud_percentage`, computed in the
# class-imbalance check, so the problem statement always agrees with the
# measured figure instead of a hard-coded one.
print("\n=== STRATEGY FOR HANDLING CLASS IMBALANCE ===")
print(f"""
PROBLEM:
- Imbalanced dataset (~{fraud_percentage:.1f}% fraud)
- Standard accuracy metric would be misleading

PLAN FOR TASK 2:
1. Use SMOTE (Synthetic Minority Oversampling) ONLY on training data
2. Evaluate using AUC-PR (Precision-Recall curve) instead of accuracy
3. Use F1-score as main metric
4. Use Stratified K-Fold cross-validation

REASON FOR SMOTE:
- Creates synthetic fraud examples instead of duplicating
- Reduces overfitting risk
- More effective than simple oversampling for rare classes
""")