In [None]:
# Read the IP-range -> country lookup table used by find_country below
ip_mapping_path = '../data/raw/IpAddress_to_Country.csv'
ip_df = pd.read_csv(ip_mapping_path)
print(f"IP mapping shape: {ip_df.shape}")

# Convert IP to integer for range lookup
def ip_to_int(ip):
    """Convert an IP address to its integer encoding for range lookups.

    Accepts anything ``ipaddress.ip_address`` understands (dotted-quad or
    IPv6 strings, packed bytes, Python ints) and, in addition, real numbers
    that already hold the integer encoding -- e.g. a float read from a CSV
    column or a numpy integer. Fractional parts are truncated.

    Returns:
        The address as a Python ``int``, or ``None`` when the value is
        missing (NaN/inf/None) or is not a valid IPv4/IPv6 address.
    """
    import math
    import numbers

    if isinstance(ip, numbers.Integral):
        # Normalises numpy integer scalars, which ipaddress would otherwise
        # stringify and reject.
        ip = int(ip)
    elif isinstance(ip, numbers.Real):
        if not math.isfinite(ip):
            return None
        ip = int(ip)

    try:
        return int(ipaddress.ip_address(ip))
    except (ValueError, TypeError):
        # ip_address signals an invalid or out-of-range address with
        # ValueError; anything else is a real bug and should surface.
        return None

# Integer-encode every address; values ip_to_int cannot convert become None.
df['ip_int'] = df['ipaddress'].apply(ip_to_int)

# Function to find country from IP range
def find_country(ip_int):
    """Return the country whose IP range in ``ip_df`` contains ``ip_int``.

    The first matching row (in ``ip_df`` order) wins. Missing addresses and
    addresses that fall in no range yield ``'Unknown'``.

    Note: every call scans all of ``ip_df``, so applying this row by row
    costs O(len(df) * len(ip_df)).
    """
    if not pd.isna(ip_int):
        above_lower = ip_df['lower_bound_ip_address'].le(ip_int)
        below_upper = ip_df['upper_bound_ip_address'].ge(ip_int)
        matches = ip_df[above_lower & below_upper]
        if len(matches) > 0:
            return matches.iloc[0]['country']
    return 'Unknown'

# Geolocate every transaction, then summarise fraud per country
df['country'] = df['ip_int'].apply(find_country)
n_countries = df['country'].nunique()
print(f"Countries mapped: {n_countries}")

# Per-country transaction count, fraud rate (mean of class) and fraud count,
# ordered from the highest fraud rate down
country_fraud = (
    df.groupby('country')['class']
    .agg(['count', 'mean', 'sum'])
    .sort_values('mean', ascending=False)
)
print("\nTop 10 High Fraud Rate Countries:")
print(country_fraud.head(10))

# Visualization: horizontal bars for the 20 countries with the highest fraud rate
top_20 = country_fraud.head(20)
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(top_20.index, top_20['mean']*100, color='crimson')
ax.set_xlabel('Fraud Rate (%)')
ax.set_title('Top 20 Countries by Fraud Rate')
ax.invert_yaxis()  # highest rate at the top
fig.tight_layout()
fig.savefig('../reports/fraud_by_country.png', dpi=300)
plt.show()

In [None]:
# 1. Time-based features
df['hour_of_day'] = df['purchasetime'].dt.hour
df['day_of_week'] = df['purchasetime'].dt.dayofweek  # Monday=0 ... Sunday=6
df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

# 2. Time since signup (CRITICAL FEATURE), in hours
df['time_since_signup'] = (df['purchasetime'] - df['signuptime']).dt.total_seconds() / 3600

# 3. Transaction frequency features
# Sort by user and time: the diff and the rolling windows below both rely on
# each user's transactions being contiguous and in chronological order.
df = df.sort_values(['userid', 'purchasetime'])

# Minutes since the same user's previous transaction (NaN for a user's first one)
df['time_since_last_txn'] = df.groupby('userid')['purchasetime'].diff().dt.total_seconds() / 60

# Trailing time-window features.
# Offset windows ('24h', '7d', '1h') need a datetime index, and Series.rolling
# rejects a Series passed as `on=`, so build a purchase-time-indexed frame and
# roll on its index inside each user group.
txn_by_time = pd.DataFrame(
    {
        'userid': df['userid'].to_numpy(),
        'txn': 1.0,  # one per row, so a rolling count is a transaction count
        'purchasevalue': df['purchasevalue'].to_numpy(),
    },
    index=pd.DatetimeIndex(df['purchasetime'], name='purchasetime'),
)
user_windows = txn_by_time.groupby('userid')

# Results come back grouped by userid in chronological order, which is exactly
# the row order of the sorted df, so they are assigned positionally. A length
# mismatch (e.g. rows with a missing userid dropped by groupby) raises here
# instead of silently misaligning.
# Windows are (t - window, t], i.e. they include the current transaction.
df['txn_count_24h'] = user_windows['txn'].rolling('24h').count().to_numpy()
df['txn_count_7d'] = user_windows['txn'].rolling('7d').count().to_numpy()

# 4. Velocity features
# Total purchase value by the same user in the trailing hour
df['purchase_velocity_1h'] = user_windows['purchasevalue'].rolling('1h').sum().to_numpy()

# NOTE(review): the two aggregates below use each user's FULL history in df,
# including transactions made after the current row (look-ahead) -- confirm
# this is acceptable for the intended scoring setup.
df['avg_purchase_user'] = df.groupby('userid')['purchasevalue'].transform('mean')

# 5. Device usage features
df['unique_devices_per_user'] = df.groupby('userid')['deviceid'].transform('nunique')

print(f"Total features created: {len(df.columns)}")
print(f"New features: hour_of_day, day_of_week, is_weekend, time_since_signup, time_since_last_txn, txn_count_24h, txn_count_7d, purchase_velocity_1h, avg_purchase_user, unique_devices_per_user")

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Separate features and target
X = df.drop('class', axis=1)
y = df['class']

# Identify column types
# NOTE(review): deviceid looks like an identifier; if it is high-cardinality,
# the dense one-hot encoding below creates one column per device and may
# exhaust memory -- confirm its cardinality or drop/target-encode it.
categorical_cols = ['source', 'browser', 'sex', 'country', 'deviceid']
# The time-of-purchase features are listed explicitly: ColumnTransformer drops
# any column not named in a transformer (remainder='drop'), so leaving them
# out silently discards them.
numerical_cols = ['purchasevalue', 'age', 'time_since_signup', 'time_since_last_txn',
                  'txn_count_24h', 'txn_count_7d', 'purchase_velocity_1h',
                  'avg_purchase_user', 'unique_devices_per_user',
                  'hour_of_day', 'day_of_week', 'is_weekend']

# Remove identifiers and datetime from modeling
exclude_cols = ['userid', 'signuptime', 'purchasetime', 'ipaddress', 'ip_int', 'purchase_hour']
X_model = X.drop(columns=exclude_cols, errors='ignore')

# Update column lists after dropping
categorical_cols = [col for col in categorical_cols if col in X_model.columns]
numerical_cols = [col for col in numerical_cols if col in X_model.columns]

# Numeric branch: impute, then scale.
# time_since_last_txn is NaN for each user's first transaction; StandardScaler
# passes NaN through and the resamplers used afterwards reject NaN input.
# keep_empty_features=True keeps an all-NaN column (filled with 0) so the
# output width always matches numerical_cols.
numeric_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median', keep_empty_features=True)),
    ('scale', StandardScaler()),
])

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numerical_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'), categorical_cols)
    ])

# Fit and transform
# NOTE(review): the preprocessor is fitted on ALL rows, before the train/test
# split performed later in the notebook, so imputation medians, scaling
# statistics and encoder categories see the test rows (data leakage). Prefer
# splitting X_model first and fitting on the training part only.
X_processed = preprocessor.fit_transform(X_model)
print(f"Processed shape: {X_processed.shape}")

# Get feature names (numeric columns first, then one-hot columns, matching
# the transformer order above)
cat_encoder = preprocessor.named_transformers_['cat']
cat_features = cat_encoder.get_feature_names_out(categorical_cols)
all_features = list(numerical_cols) + list(cat_features)
print(f"Total features after encoding: {len(all_features)}")

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split

# Hold out the test set FIRST so that resampling (SMOTE / under-sampling)
# only ever touches training rows.
split_parts = train_test_split(
    X_processed, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_test, y_train, y_test = split_parts

banner_rule = "="*60
print(banner_rule)
print("CLASS IMBALANCE HANDLING STRATEGY")
print(banner_rule)

# Class counts in the untouched training split
train_fraud_count = y_train.sum()
train_fraud_pct = y_train.mean()*100
print(f"\nBefore resampling:")
print(f"Training set shape: {X_train.shape}")
print(f"Fraud cases in training: {train_fraud_count} ({train_fraud_pct:.2f}%)")
print(f"Non-fraud cases in training: {len(y_train)-train_fraud_count}")

# DECISION: Use SMOTE for moderate oversampling + RandomUnderSampling for majority
# Justification written in markdown cell:
"""
## Justification for Imbalance Handling Strategy

**Why SMOTE?**
- Our fraud rate is extremely low (~1-2% typical for fraud data)
- SMOTE creates synthetic minority samples instead of duplicating
- Helps prevent overfitting compared to random oversampling
- Preserves the information in the majority class

**Why combine with RandomUnderSampling?**
- Pure SMOTE on severe imbalance can lead to overgeneralization
- Under-sampling majority reduces computational cost
- Combination (SMOTEENN or pipeline) often gives better decision boundaries

**Alternative considered:**
- Class weighting in models: Good, but doesn't create new patterns
- ADASYN: Similar to SMOTE but focuses on difficult samples

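**Final decision:** SMOTE to a 1:10 fraud-to-non-fraud ratio (`sampling_strategy=0.1`, about 9% fraud), then under-sample the majority class to a 2:1 ratio (`sampling_strategy=0.5`, about 33% fraud)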
"""

# Apply SMOTE (oversample minority)
# A float sampling_strategy is the minority/majority RATIO after resampling:
# 0.1 means fraud : non-fraud = 1 : 10 (about 9.1% fraud), not exactly 10%.
# Assumes `class` is 0/1 with 1 = fraud and fraud the minority class.
# NOTE(review): X_train contains one-hot columns; SMOTE interpolates them into
# fractional values. imblearn's SMOTENC is designed for mixed data.
SMOTE_RATIO = 0.1
n_fraud_train = int(y_train.sum())
n_legit_train = len(y_train) - n_fraud_train

# imblearn raises ValueError if the requested ratio would not add at least one
# synthetic sample (i.e. the data is already at or above the target ratio),
# so only resample when there is something to generate.
if n_legit_train * SMOTE_RATIO - n_fraud_train >= 1:
    smote = SMOTE(sampling_strategy=SMOTE_RATIO, random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
else:
    print(f"\nSMOTE skipped: fraud/non-fraud ratio already >= {SMOTE_RATIO}")
    X_train_smote, y_train_smote = X_train, y_train

print(f"\nAfter SMOTE (10% fraud rate target):")
print(f"Training set shape: {X_train_smote.shape}")
print(f"Fraud cases: {y_train_smote.sum()} ({y_train_smote.mean()*100:.2f}%)")

# Under-sample the majority down to non-fraud : fraud = 2 : 1 (ratio 0.5)
UNDERSAMPLE_RATIO = 0.5
n_fraud_smote = int(y_train_smote.sum())
n_legit_smote = len(y_train_smote) - n_fraud_smote

# Same guard in the other direction: the under-sampler raises if reaching the
# ratio would require MORE majority samples than are available.
if int(n_fraud_smote / UNDERSAMPLE_RATIO) <= n_legit_smote:
    rus = RandomUnderSampler(sampling_strategy=UNDERSAMPLE_RATIO, random_state=42)  # 2:1 ratio
    X_train_balanced, y_train_balanced = rus.fit_resample(X_train_smote, y_train_smote)
else:
    print(f"\nUnder-sampling skipped: fraud/non-fraud ratio already >= {UNDERSAMPLE_RATIO}")
    X_train_balanced, y_train_balanced = X_train_smote, y_train_smote

print(f"\nAfter final balancing:")
print(f"Training set shape: {X_train_balanced.shape}")
print(f"Fraud rate: {y_train_balanced.mean()*100:.2f}%")
print(f"Class distribution: {pd.Series(y_train_balanced).value_counts().to_dict()}")

# Visualization: one pie per stage of the resampling pipeline
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

balance_stages = [
    ('Before Resampling', y_train),
    ('After SMOTE (10%)', y_train_smote),
    ('Final Balanced (2:1)', y_train_balanced),
]

for ax, (stage_title, stage_targets) in zip(axes, balance_stages):
    fraud_count = stage_targets.sum()
    non_fraud_count = len(stage_targets) - fraud_count
    ax.pie([non_fraud_count, fraud_count],
           labels=['Non-Fraud', 'Fraud'], autopct='%1.1f%%',
           colors=['#3498db', '#e74c3c'])
    ax.set_title(stage_title)

plt.tight_layout()
plt.savefig('../reports/class_balance_evolution.png', dpi=300)
plt.show()

# Save processed data
# Bundles the resampled training set, the untouched test set, the encoded
# feature names and the fitted preprocessor into a single artifact.
processed_data = {
    'X_train': X_train_balanced,
    'X_test': X_test,
    'y_train': y_train_balanced,
    'y_test': y_test,
    'feature_names': all_features,
    'preprocessor': preprocessor
}

import os
import joblib

processed_path = '../data/processed/train_test_data.pkl'
# joblib.dump does not create missing directories; make sure the target exists
# so the run does not fail at the very last step.
os.makedirs(os.path.dirname(processed_path), exist_ok=True)

# NOTE: the artifact is a pickle -- only joblib.load files from a trusted source.
joblib.dump(processed_data, processed_path)
print("\n✓ Processed data saved to ../data/processed/train_test_data.pkl")