# In [None]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import f1_score, classification_report, make_scorer
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score
from xgboost import XGBClassifier
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Function to calculate Haversine distance
def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two points on a sphere.

    Coordinates are given in degrees and may be scalars, NumPy arrays or
    pandas Series (the computation is element-wise).

    Args:
        lat1, lon1: Latitude/longitude of the first point(s), in degrees.
        lat2, lon2: Latitude/longitude of the second point(s), in degrees.
        radius: Sphere radius; the default 6371 is the Earth radius in km,
            so the result is in kilometres.

    Returns:
        The distance(s) in the same unit as ``radius``.
    """
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make arcsin return NaN; clamp to the mathematically valid range.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    return radius * c

# Features
# Label encoders fitted by the most recent training call, keyed by column name.
# They are reused for non-training data so that a given category string maps
# to the same integer code in both the train and the test features.
_FITTED_ENCODERS = {}


def preprocess_data(df, is_train=True, encoders=None):
    """Engineer model features on ``df`` and encode its categorical columns.

    NOTE: ``df`` is modified in place (columns are added, dropped and
    re-encoded); the returned objects are derived from that same frame.

    Args:
        df: Transaction frame containing the columns referenced below
            (``trans_date``, ``trans_time``, ``dob``, ``amt``, ``lat``,
            ``long``, ``merch_lat``, ``merch_long``, the columns listed in
            ``drop_cols``, ``category``, ``gender``, ``job`` and, when
            ``is_train`` is true, ``is_fraud``).
        is_train: When true, fit new label encoders on ``df`` and return the
            features and the ``is_fraud`` target separately.
        encoders: Optional dict mapping column name to a fitted
            ``LabelEncoder``. Training calls store their encoders in it and
            non-training calls read from it. Defaults to a module-level
            registry, so a training call followed by a test call yields
            consistent codes without any extra arguments.

    Returns:
        ``(features, target)`` when ``is_train`` is true, otherwise ``df``.
    """
    if encoders is None:
        encoders = _FITTED_ENCODERS

    df['trans_date'] = pd.to_datetime(df['trans_date'])
    df['dob'] = pd.to_datetime(df['dob'])
    df['trans_year'] = df['trans_date'].dt.year
    df['trans_month'] = df['trans_date'].dt.month
    df['trans_day'] = df['trans_date'].dt.day
    df['trans_hour'] = pd.to_datetime(df['trans_time'], format='%H:%M:%S').dt.hour
    df['trans_dayofweek'] = df['trans_date'].dt.dayofweek
    # dayofweek is 0 (Monday) .. 6 (Sunday); 5 and 6 are the weekend.
    df['is_weekend'] = (df['trans_dayofweek'] >= 5).astype(int)
    # Approximate age in whole years (ignores leap days).
    df['age'] = (df['trans_date'] - df['dob']).dt.days // 365
    df['amt_log'] = np.log1p(df['amt'])
    df['amt_squared'] = df['amt'] ** 2
    df['hour_amt_interaction'] = df['trans_hour'] * df['amt']
    df['distance'] = haversine(df['lat'], df['long'], df['merch_lat'], df['merch_long'])

    # Drop old columns
    drop_cols = ['trans_num', 'trans_date', 'trans_time', 'unix_time', 'first', 'last', 'street',
                 'city', 'state', 'zip', 'dob', 'merchant', 'lat', 'long', 'merch_lat', 'merch_long']
    df.drop(columns=drop_cols, inplace=True)

    # Encode categorical variables
    categorical_cols = ['category', 'gender', 'job']
    for col in categorical_cols:
        fitted = None if is_train else encoders.get(col)
        if fitted is None:
            # Training data, or no training encoder available yet: fit on
            # this frame. Only training fits are remembered for later reuse.
            fitted = LabelEncoder()
            df[col] = fitted.fit_transform(df[col])
            if is_train:
                encoders[col] = fitted
        else:
            # Reuse the training codes; labels never seen in training
            # (or missing values) get the sentinel code -1.
            codes = {label: code for code, label in enumerate(fitted.classes_)}
            df[col] = df[col].map(codes).fillna(-1).astype(int)

    if is_train:
        return df.drop(columns=['is_fraud']), df['is_fraud']
    else:
        return df

# Load Data
DATA_DIR = "/Users/kenneytran/Downloads/CS506EC"
train = pd.read_csv(f"{DATA_DIR}/train.csv")
test = pd.read_csv(f"{DATA_DIR}/test.csv")

# Capture the submission key up front so it does not depend on what
# preprocessing later does to the `test` frame.
test_ids = test[['id']].copy()

# Preprocess train and test data
X, y = preprocess_data(train)
X_test = preprocess_data(test, is_train=False)

# `id` is only a row identifier (it is the submission key below), not a
# predictive feature, so keep it out of the model inputs.
X = X.drop(columns=['id'], errors='ignore')
# Give the test features exactly the training columns, in the same order.
X_test = X_test[X.columns]

# Feature Selection: Variance Threshold
variance_threshold = 0.1
# Take the names from the frame that is actually fed to the selector, so the
# boolean support mask is guaranteed to line up with them.
feature_names = X.columns
selector = VarianceThreshold(threshold=variance_threshold)
X_selected = selector.fit_transform(X)
X_test_selected = selector.transform(X_test)

# Convert back to DataFrame after VarianceThreshold
selected_features = feature_names[selector.get_support()]
X = pd.DataFrame(X_selected, columns=selected_features)
X_test = pd.DataFrame(X_test_selected, columns=selected_features)

# Add Anomaly Scores (IsoForest and LOF)
base_features = list(selected_features)

# Apply Isolation Forest
# NOTE: despite the column name, this stores the predicted label
# (-1 = outlier, 1 = inlier), not a continuous score.
iso_forest = IsolationForest(contamination=0.01, random_state=42)
X['IsoForest_Score'] = iso_forest.fit_predict(X[base_features])
X_test['IsoForest_Score'] = iso_forest.predict(X_test[base_features])

# Apply Local Outlier Factor (fitted on the base features plus the
# Isolation Forest label)
lof_features = base_features + ['IsoForest_Score']
lof = LocalOutlierFactor(n_neighbors=20, contamination=0.01, novelty=True)
lof.fit(X[lof_features])
# decision_function() of a novelty LOF is only meant for unseen data: a
# training point passed to it is treated as its own neighbour, which skews
# the score. For the training rows use the factors computed during fit,
# shifted by the same offset that decision_function() applies.
X['LOF_Score'] = lof.negative_outlier_factor_ - lof.offset_
X_test['LOF_Score'] = lof.decision_function(X_test[lof_features])

# Define preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), X.columns.tolist())  # Scale all features including anomaly scores
    ]
)

# Define the model
model = XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42
)

# Create the full pipeline with SMOTE
# (imblearn's pipeline applies SMOTE only while fitting, never at predict time)
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', SMOTE(random_state=42)),
    ('classifier', model)
])

# RandomizedSearchCV for Hyperparameter Tuning
param_dist = {
    'classifier__n_estimators': [100, 200, 300, 500],
    'classifier__max_depth': [3, 5, 7, 9, 12],
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
    'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
}

scorer = make_scorer(f1_score)

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    scoring=scorer,
    n_iter=100,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    pre_dispatch='2*n_jobs'
)

# Perform Randomized Search
random_search.fit(X, y)

# Best model and parameters
# (with the default refit=True, best_estimator_ is already trained on all of
# X, y using the best parameters, so no separate final fit is needed)
best_pipeline = random_search.best_estimator_
print("\nBest Parameters from RandomizedSearchCV:", random_search.best_params_)

# Evaluate the model with cross-validation
# NOTE: this reuses the data the hyperparameters were selected on, so the
# resulting estimate is optimistic. cross_val_score fits clones and leaves
# `best_pipeline` itself untouched.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1_scores = cross_val_score(best_pipeline, X, y, cv=skf, scoring=scorer)

print("\nCross-Validation F1-Scores (Best Model):", cv_f1_scores)
print(f"Mean F1-Score: {np.mean(cv_f1_scores):.4f}")
print(f"Standard Deviation of F1-Score: {np.std(cv_f1_scores):.4f}")

# Feature Importance from the XGBoost model
feature_importances = best_pipeline.named_steps['classifier'].feature_importances_
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)
print("\nFeature Importance:")
print(importance_df)

# Predict on test data
y_test_pred = best_pipeline.predict(X_test)

# Save predictions to submission file
submission = test_ids.copy()
submission['is_fraud'] = y_test_pred
submission.to_csv("submission.csv", index=False)

print("\nSubmission file saved as 'submission.csv'")
