In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import f1_score, classification_report
import warnings
warnings.filterwarnings('ignore')

def preprocess_data(df, label_encoders=None, is_training=True):
    """Derive calendar, distance, age and log features and encode categoricals.

    Args:
        df: Raw transaction frame. It must provide ``trans_date``,
            ``trans_time``, ``dob``, ``lat``, ``long``, ``merch_lat``,
            ``merch_long``, ``amt``, ``city_pop`` and the categorical columns
            ``category``, ``gender``, ``state``, ``job`` and ``merchant``.
            The frame is copied; the caller's object is not modified.
        label_encoders: Mapping of categorical column name to an already
            fitted encoder exposing ``classes_`` and ``transform``. Required
            when ``is_training`` is False; ignored (and rebuilt) when
            ``is_training`` is True.
        is_training: When True, fit a fresh ``LabelEncoder`` per categorical
            column. When False, reuse ``label_encoders``.

    Returns:
        ``(df_processed, label_encoders)`` when ``is_training`` is True,
        otherwise just ``df_processed``.

    Raises:
        ValueError: If ``is_training`` is False and ``label_encoders`` is None.
    """
    # Fail fast with a clear message instead of a late "NoneType is not
    # subscriptable" once all the feature work has already been done.
    if not is_training and label_encoders is None:
        raise ValueError(
            "label_encoders fitted on the training data must be provided "
            "when is_training=False"
        )

    df_processed = df.copy()

    # Process dates
    for date_col in ('trans_date', 'trans_time', 'dob'):
        df_processed[date_col] = pd.to_datetime(df_processed[date_col])
    trans_date = df_processed['trans_date']
    df_processed['hour'] = df_processed['trans_time'].dt.hour
    df_processed['day'] = trans_date.dt.day
    df_processed['month'] = trans_date.dt.month
    df_processed['weekday'] = trans_date.dt.weekday
    df_processed['year'] = trans_date.dt.year

    # Time-based features
    hour = df_processed['hour']
    df_processed['is_weekend'] = df_processed['weekday'].isin([5, 6]).astype(int)
    df_processed['is_night'] = ((hour >= 22) | (hour <= 5)).astype(int)
    df_processed['is_morning'] = ((hour >= 6) & (hour <= 11)).astype(int)

    # Straight-line distance between cardholder and merchant, measured in the
    # raw lat/long coordinate units (this is not a geodesic distance).
    df_processed['distance'] = np.sqrt(
        (df_processed['lat'] - df_processed['merch_lat']) ** 2
        + (df_processed['long'] - df_processed['merch_long']) ** 2
    )

    # Age in (fractional) years at the transaction date
    df_processed['age'] = (trans_date - df_processed['dob']).dt.days / 365.25

    # log1p-compressed amount and city population
    df_processed['amount_log'] = np.log1p(df_processed['amt'])
    df_processed['city_pop_log'] = np.log1p(df_processed['city_pop'])

    # Drop raw date columns, identifiers and free-text address fields;
    # errors='ignore' tolerates inputs that lack some of them.
    columns_to_drop = ['trans_date', 'trans_time', 'cc_num', 'first', 'last',
                       'street', 'city', 'dob', 'unix_time', 'zip', 'trans_num']
    df_processed = df_processed.drop(columns=columns_to_drop, errors='ignore')

    # Categorical encoding
    cat_columns = ['category', 'gender', 'state', 'job', 'merchant']

    if is_training:
        label_encoders = {}
        for col in cat_columns:
            encoder = LabelEncoder()
            df_processed[col] = encoder.fit_transform(df_processed[col].astype(str))
            label_encoders[col] = encoder
        return df_processed, label_encoders

    for col in cat_columns:
        encoder = label_encoders[col]
        values = df_processed[col].astype(str)
        # Categories never seen during fitting would make transform() raise,
        # so they are folded into the encoder's first known class.
        values = values.where(values.isin(encoder.classes_), encoder.classes_[0])
        df_processed[col] = encoder.transform(values)
    return df_processed

def prepare_features(df_processed, is_training=True, agg_stats=None):
    """Build the model feature matrix from a preprocessed transaction frame.

    On top of the base columns this adds cyclical sin/cos encodings of
    hour/day/month/weekday, time-of-day flags, amount and distance transforms,
    ratio features, and per-group aggregates (by ``category``, ``merchant``
    and ``state``) of ``amt`` and ``distance``. Infinite and missing values in
    the result are replaced by 0. The dtypes and the feature count are printed.

    Args:
        df_processed: Frame holding every column listed in ``feature_columns``
            below (and ``is_fraud`` when ``is_training`` is True).
        is_training: When True, compute the per-group aggregates from
            ``df_processed`` itself and also return the target and the
            aggregates. When False, reuse ``agg_stats``.
        agg_stats: Dict of ``'<group>_<stat>'`` -> Series indexed by group
            value, as returned by a training call. Required when
            ``is_training`` is False; ignored (and rebuilt) otherwise.

    Returns:
        ``(df_features, target, agg_stats)`` when ``is_training`` is True,
        otherwise just ``df_features``.

    Raises:
        ValueError: If ``is_training`` is False and ``agg_stats`` is None.
    """
    if not is_training and agg_stats is None:
        raise ValueError(
            "agg_stats computed on the training data must be provided "
            "when is_training=False"
        )

    feature_columns = [
        'category', 'amt', 'lat', 'long', 'city_pop', 'job',
        'merch_lat', 'merch_long', 'hour', 'day', 'month', 'weekday',
        'is_weekend', 'is_night', 'amount_log', 'distance', 'age',
        'gender', 'state', 'merchant'
    ]
    group_columns = ['category', 'merchant', 'state']

    df_features = df_processed[feature_columns].copy()

    # Cyclical encodings, so that e.g. hour 23 and hour 0 end up close together
    for name, period in (('hour', 24), ('day', 31), ('month', 12), ('weekday', 7)):
        angle = 2 * np.pi * df_features[name] / period
        df_features[f'{name}_sin'] = np.sin(angle)
        df_features[f'{name}_cos'] = np.cos(angle)

    # Time features
    hour = df_features['hour']
    df_features['is_late_night'] = ((hour >= 23) | (hour <= 4)).astype(int)
    df_features['is_business_hour'] = ((hour >= 9) & (hour <= 17)).astype(int)
    df_features['is_evening'] = ((hour >= 18) & (hour <= 22)).astype(int)

    # Amount features
    df_features['amount_sqrt'] = np.sqrt(df_features['amt'])
    df_features['amount_squared'] = df_features['amt'] ** 2

    # Location features
    df_features['distance_log'] = np.log1p(df_features['distance'])
    df_features['city_pop_log'] = np.log1p(df_features['city_pop'])

    # Ratio features; the +1 in each denominator avoids division by zero
    df_features['amount_per_distance'] = df_features['amt'] / (df_features['distance'] + 1)
    df_features['amount_per_pop'] = df_features['amt'] / (df_features['city_pop'] + 1)
    df_features['pop_per_distance'] = df_features['city_pop'] / (df_features['distance'] + 1)

    if is_training:
        # Learn the per-group statistics from this frame only; inference
        # reuses them so both paths see identical aggregate values.
        agg_stats = {}
        for col in group_columns:
            grouped = df_features.groupby(col)
            agg_stats[f'{col}_amt_mean'] = grouped['amt'].mean()
            agg_stats[f'{col}_amt_std'] = grouped['amt'].std()
            agg_stats[f'{col}_amt_median'] = grouped['amt'].median()
            agg_stats[f'{col}_distance_mean'] = grouped['distance'].mean()
            agg_stats[f'{col}_distance_std'] = grouped['distance'].std()
            agg_stats[f'{col}_tx_count'] = grouped['amt'].count()

    # Single mapping path shared by training and inference. Group values that
    # are absent from agg_stats map to NaN and are zero-filled further down.
    for col in group_columns:
        keys = df_features[col]
        df_features[f'{col}_amt_mean'] = keys.map(agg_stats[f'{col}_amt_mean'])
        df_features[f'{col}_amt_std'] = keys.map(agg_stats[f'{col}_amt_std'])
        df_features[f'{col}_amt_median'] = keys.map(agg_stats[f'{col}_amt_median'])
        df_features[f'{col}_amt_to_mean'] = df_features['amt'] / (df_features[f'{col}_amt_mean'] + 1)
        df_features[f'{col}_amt_to_median'] = df_features['amt'] / (df_features[f'{col}_amt_median'] + 1)
        df_features[f'{col}_distance_mean'] = keys.map(agg_stats[f'{col}_distance_mean'])
        df_features[f'{col}_distance_std'] = keys.map(agg_stats[f'{col}_distance_std'])
        df_features[f'{col}_tx_count'] = keys.map(agg_stats[f'{col}_tx_count'])

    # Simple data cleaning: neutralise infinities and missing values
    df_features = df_features.replace([np.inf, -np.inf], 0).fillna(0)

    print(df_features.dtypes)
    print(f"\nTotal number of features used {len(df_features.columns)}")

    if is_training:
        target = df_processed['is_fraud']
        return df_features, target, agg_stats

    return df_features

def make_predictions_and_submit(model, X_test, test_ids, output_path):
    """Score the test matrix, binarise the scores and write a submission CSV.

    Args:
        model: Object whose ``predict`` accepts an ``xgb.DMatrix`` and returns
            one score per row.
        X_test: Feature matrix, wrapped in an ``xgb.DMatrix`` before scoring.
        test_ids: Identifiers written to the ``id`` column.
        output_path: Destination of the CSV file (written without the index).

    Returns:
        The submission DataFrame with columns ``id`` and ``is_fraud``.
    """
    scores = model.predict(xgb.DMatrix(X_test))
    # A row is flagged as fraud when its score is strictly above 0.5.
    fraud_flags = (scores > 0.5).astype(int)

    submission = pd.DataFrame({'id': test_ids, 'is_fraud': fraud_flags})
    submission.to_csv(output_path, index=False)
    return submission

def main():
    """Run the whole pipeline: load, featurise, train, evaluate and submit.

    Reads the train and test CSVs from fixed ``/content`` paths, builds
    features, holds out a stratified 20% validation split, standardises the
    features, trains an XGBoost binary classifier for 1000 rounds, prints the
    validation F1 score and classification report, and finally writes the
    test-set predictions to ``/content/finalsub.csv``.

    Returns:
        ``(model, submission)``: the trained booster and the submission frame.
    """
    train_df = pd.read_csv('/content/train (2).csv')
    test_df = pd.read_csv('/content/test (2).csv')

    print("\nPreprocessing training data.")
    train_processed, label_encoders = preprocess_data(train_df, is_training=True)
    features, target, agg_stats = prepare_features(train_processed, is_training=True)

    X_train, X_val, y_train, y_val = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # The scaler is fitted on the training split only and reused everywhere else.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Weight positives by the negative/positive ratio of the training split.
    negatives = len(y_train[y_train == 0])
    positives = len(y_train[y_train == 1])
    scale_pos_weight = negatives / positives

    # Hyper-parameters; these are the values to tune.
    tree_params = dict(
        objective='binary:logistic',
        booster='gbtree',
        scale_pos_weight=scale_pos_weight,
        max_depth=6,
        min_child_weight=1,
        gamma=0.1,
        subsample=0.8,
        colsample_bytree=0.8,
        eta=0.1,
        eval_metric='auc',
    )

    dtrain = xgb.DMatrix(X_train_scaled, label=y_train)
    dval = xgb.DMatrix(X_val_scaled, label=y_val)
    watchlist = [(dtrain, 'train'), (dval, 'val')]

    print("\n training data.")
    model = xgb.train(
        tree_params,
        dtrain,
        num_boost_round=1000,
        evals=watchlist,
        verbose_eval=100,
    )

    # Validation metrics at a 0.5 decision threshold
    val_scores = model.predict(dval)
    val_labels = (val_scores > 0.5).astype(int)

    print("\nModel Evaluation:")
    print("F1 Score:", f1_score(y_val, val_labels))
    print("\nClassification Report:")
    print(classification_report(y_val, val_labels))

    print("\nProcessing test data")
    test_processed = preprocess_data(
        test_df, label_encoders=label_encoders, is_training=False
    )
    test_features = prepare_features(
        test_processed, is_training=False, agg_stats=agg_stats
    )
    test_scaled = scaler.transform(test_features)

    submission = make_predictions_and_submit(
        model, test_scaled, test_df['id'], '/content/finalsub.csv'
    )

    return model, submission

# Script entry point: run the full pipeline and keep the trained model and the
# submission frame as module-level names for later inspection.
if __name__ == "__main__":
    model, submission = main()


Preprocessing training data.
category                 int64
amt                    float64
lat                    float64
long                   float64
city_pop                 int64
                        ...   
state_amt_to_mean      float64
state_amt_to_median    float64
state_distance_mean    float64
state_distance_std     float64
state_tx_count           int64
Length: 62, dtype: object

Total number of features used 62

 training data.
[0]	train-auc:0.98228	val-auc:0.98177
[100]	train-auc:0.99912	val-auc:0.99897
[200]	train-auc:0.99982	val-auc:0.99944
[300]	train-auc:0.99994	val-auc:0.99946
[400]	train-auc:0.99998	val-auc:0.99945
[500]	train-auc:0.99999	val-auc:0.99945
[600]	train-auc:1.00000	val-auc:0.99944
[700]	train-auc:1.00000	val-auc:0.99943
[800]	train-auc:1.00000	val-auc:0.99940
[900]	train-auc:1.00000	val-auc:0.99941
[999]	train-auc:1.00000	val-auc:0.99940

Model Evaluation:
F1 Score: 0.9775407407407407

Classification Report:
              precision    recall  f1-scor