In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, classification_report
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

# ... (previous imports remain the same)

class DataPreprocessor:
    """Turn raw transaction rows into a scaled feature matrix.

    Call ``preprocess(train_df, training=True)`` once to fit the per-group
    statistics, the label encoders and the scaler, then call
    ``preprocess(other_df, training=False)`` to apply the fitted state to
    new rows.

    Fitted state (set by the training call):
        encoders: dict mapping each categorical column to its LabelEncoder.
        scaler: the fitted StandardScaler.
        {category,merchant,hour}_{mean,std,risk}: per-group Series holding
            the mean / std of ``amt`` and the mean of ``is_fraud``.
    """

    # Hours of the day flagged as night-time: 23:00 through 04:59.
    NIGHT_HOURS = (23, 0, 1, 2, 3, 4)
    # Code given to categorical values that were not seen during fitting.
    UNKNOWN_CODE = -1
    # Columns for which per-group amount/fraud statistics are computed.
    _GROUP_KEYS = ('category', 'merchant', 'hour')
    # Categorical columns that are label-encoded.
    _CATEGORICAL_FEATURES = ('category', 'gender', 'state', 'job')

    def __init__(self):
        self.encoders = {}
        self.scaler = None

    def preprocess(self, df, training=True):
        """Engineer features, encode categoricals and scale the result.

        Args:
            df: DataFrame with the raw columns read below (``unix_time``,
                ``lat``, ``long``, ``merch_lat``, ``merch_long``, ``amt``,
                ``city_pop``, ``category``, ``merchant``, ``cc_num``,
                ``gender``, ``state``, ``job``) plus ``is_fraud`` when
                ``training`` is true. The input frame is not modified.
            training: When true, fit the statistics, encoders and scaler on
                ``df``; when false, reuse the previously fitted state.

        Returns:
            ``(X_scaled, y)`` when ``training`` is true, otherwise just
            ``X_scaled``. ``X_scaled`` is whatever the scaler returns and
            ``y`` is the ``is_fraud`` column.

        Raises:
            RuntimeError: If called with ``training=False`` before a
                training call has fitted the preprocessor.
        """
        print(f"Preprocessing {'training' if training else 'test'} data...")
        if not training and self.scaler is None:
            raise RuntimeError(
                "preprocess(training=False) called before the preprocessor "
                "was fitted with training=True"
            )
        df = df.copy()

        # Time features
        df['datetime'] = pd.to_datetime(df['unix_time'], unit='s')
        df['hour'] = df['datetime'].dt.hour
        df['day'] = df['datetime'].dt.day
        df['month'] = df['datetime'].dt.month
        df['day_of_week'] = df['datetime'].dt.dayofweek
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        # Only 23:00-04:59 counts as night. The previous range(23) matched
        # every hour from 0 to 22, which made the flag almost constant.
        df['is_night'] = df['hour'].isin(list(self.NIGHT_HOURS)).astype(int)
        df['is_business_hour'] = df['hour'].isin(range(9, 18)).astype(int)

        # Location features: straight-line distance in coordinate degrees
        # between the cardholder and merchant positions.
        df['distance'] = np.sqrt(
            ((df['lat'] - df['merch_lat']) ** 2) +
            ((df['long'] - df['merch_long']) ** 2)
        )

        # Amount features
        df['log_amount'] = np.log1p(df['amt'])
        df['amount_per_pop'] = df['amt'] / (df['city_pop'] + 1)

        if training:
            # Fit per-group statistics; stored as <key>_mean / _std / _risk.
            for key in self._GROUP_KEYS:
                grouped = df.groupby(key)
                setattr(self, f'{key}_mean', grouped['amt'].mean())
                setattr(self, f'{key}_std', grouped['amt'].std())
                setattr(self, f'{key}_risk', grouped['is_fraud'].mean())

        # Map the fitted statistics onto the rows. Groups without a fitted
        # value (unseen keys, or a std over a single row) fall back to 0.
        for key in self._GROUP_KEYS:
            df[f'{key}_mean_amt'] = df[key].map(getattr(self, f'{key}_mean')).fillna(0)
            df[f'{key}_std_amt'] = df[key].map(getattr(self, f'{key}_std')).fillna(0)
            df[f'{key}_risk'] = df[key].map(getattr(self, f'{key}_risk')).fillna(0)

        # Risk scores
        df['amount_risk_score'] = df['log_amount'] * df['category_risk']
        df['amount_merchant_risk'] = df['log_amount'] * df['merchant_risk']
        df['time_risk_score'] = df['hour_risk'] * df['category_risk']

        # Transaction patterns: rows per card sharing the same hour-of-day /
        # day-of-month value within the frame being processed.
        df['tx_count_hour'] = df.groupby(['cc_num', 'hour'])['datetime'].transform('count')
        df['tx_count_day'] = df.groupby(['cc_num', 'day'])['datetime'].transform('count')

        numerical_features = [
            'amt', 'log_amount', 'distance', 'amount_per_pop',
            'hour', 'day', 'month', 'day_of_week', 'is_weekend',
            'is_night', 'is_business_hour',
            'category_mean_amt', 'category_std_amt', 'category_risk',
            'merchant_mean_amt', 'merchant_std_amt', 'merchant_risk',
            'hour_mean_amt', 'hour_std_amt', 'hour_risk',
            'amount_risk_score', 'amount_merchant_risk', 'time_risk_score',
            'tx_count_hour', 'tx_count_day',
            'city_pop', 'lat', 'long', 'merch_lat', 'merch_long'
        ]

        # Encode categorical features
        encoded_features = []
        for col in self._CATEGORICAL_FEATURES:
            encoded_col = col + '_encoded'
            if training:
                le = LabelEncoder()
                df[encoded_col] = le.fit_transform(df[col])
                self.encoders[col] = le
            else:
                le = self.encoders[col]
                # LabelEncoder.transform raises on labels it has not seen, so
                # look codes up by position in classes_ (the same codes the
                # encoder assigns) and give unseen values UNKNOWN_CODE.
                codes = {label: code for code, label in enumerate(le.classes_)}
                df[encoded_col] = (
                    df[col].map(codes).fillna(self.UNKNOWN_CODE).astype(int)
                )
            encoded_features.append(encoded_col)

        X = df[numerical_features + encoded_features].fillna(0)

        if training:
            self.scaler = StandardScaler()
            X_scaled = self.scaler.fit_transform(X)
            y = df['is_fraud']
            return X_scaled, y
        return self.scaler.transform(X)

# Load the raw training and test sets.
train_df = pd.read_csv(r"C:\Users\arufa\Documents\CS506\EXTRA_CREDIT\train.csv")
test_df = pd.read_csv(r"C:\Users\arufa\Documents\CS506\EXTRA_CREDIT\test.csv")

print("\nTraining set shape:", train_df.shape)
print("Test set shape:", test_df.shape)

# Fit the preprocessor on the training data.
# NOTE: the target-derived risk features are fitted on all training rows
# before the folds are split, so the CV scores below are optimistic.
preprocessor = DataPreprocessor()
X, y = preprocessor.preprocess(train_df, training=True)

# Class-imbalance weight: negative/positive ratio, boosted by 20%.
n_negative = int((y == 0).sum())
n_positive = int((y == 1).sum())
if n_positive == 0:
    raise ValueError(
        "Training data contains no fraud rows; cannot compute scale_pos_weight"
    )

xgb_model = xgb.XGBClassifier(
    scale_pos_weight=n_negative / n_positive * 1.2,
    learning_rate=0.01,
    n_estimators=1000,
    max_depth=12,
    min_child_weight=3,
    gamma=0.1,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.08,
    reg_lambda=0.8,
    random_state=42,
    tree_method='hist'
)

# Fast cross-validation
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
scores = []

for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), 1):
    X_train, X_val = X[train_idx], X[val_idx]
    # skf.split yields positional indices, so index the Series by position
    # rather than by label.
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_val)
    score = f1_score(y_val, y_pred)
    scores.append(score)
    print(f"\nFold {fold} F1-score: {score:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_val, y_pred))

print(f"\nAverage CV F1-score: {np.mean(scores):.4f}")

# Refit on the full training set: after the loop the model only reflects the
# last fold's training split, i.e. two thirds of the available data.
xgb_model.fit(X, y)

# Generate predictions
X_test = preprocessor.preprocess(test_df, training=False)
test_predictions = xgb_model.predict(X_test)

# Create submission
submission = pd.DataFrame({
    'id': test_df['id'],
    'is_fraud': test_predictions
})

submission_path = r"C:\Users\arufa\Documents\CS506\EXTRA_CREDIT\submission5.csv"
submission.to_csv(submission_path, index=False)


Training set shape: (370703, 24)
Test set shape: (92676, 23)
Preprocessing training data...
