In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [27]:
# Load data
train_path = 'data/train.csv'
test_path = 'data/test.csv'
sample_submission_path = 'data/sample_submission.csv'

# Read data
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)

In [23]:
# Feature Engineering
def preprocess_and_engineer_features(df):
    # Date and time features
    df['trans_date'] = pd.to_datetime(df['trans_date'])
    df['trans_year'] = df['trans_date'].dt.year
    df['trans_month'] = df['trans_date'].dt.month
    df['trans_day'] = df['trans_date'].dt.day
    df['trans_weekday'] = df['trans_date'].dt.weekday
    df['is_weekend'] = df['trans_weekday'].isin([5, 6]).astype(int)
    
    # Calculate distance between user and merchant
    df['distance'] = np.sqrt((df['lat'] - df['merch_lat'])**2 + (df['long'] - df['merch_long'])**2)
    
    # Log-transform transaction amount
    df['log_amt'] = np.log1p(df['amt'])
    
    # Extract age from DOB
    df['dob'] = pd.to_datetime(df['dob'])
    df['age'] = (df['trans_date'] - df['dob']).dt.days // 365
    
    # Encode categorical features
    df = pd.get_dummies(df, columns=['category', 'gender', 'state'], drop_first=True)
    
    return df

In [24]:
# Preprocess datasets
train_df = preprocess_and_engineer_features(train_df)
test_df = preprocess_and_engineer_features(test_df)

In [25]:
# Separate features and target
target = 'is_fraud'
features = train_df.drop(columns=['id', 'trans_num', 'trans_date', 'trans_time', 'dob', 'is_fraud'])
test_features = test_df.drop(columns=['id', 'trans_num', 'trans_date', 'trans_time', 'dob'])

# Separate numerical and categorical columns
numerical_cols = features.select_dtypes(include=['float64', 'int64']).columns
categorical_cols = features.select_dtypes(include=['object']).columns

# Define preprocessing for numerical and categorical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# Preprocess the data
X = features
y = train_df[target]
X_test = test_features

In [26]:
# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply the preprocessor
X_train = preprocessor.fit_transform(X_train)
X_val = preprocessor.transform(X_val)
X_test = preprocessor.transform(X_test)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
model.fit(X_train, y_train)

# Evaluate the model
y_val_pred = model.predict(X_val)
val_f1 = f1_score(y_val, y_val_pred)
print(f"Validation F1-Score: {val_f1:.4f}")

# Predict on the test dataset
test_predictions = model.predict(X_test)

# Create the submission file
submission = sample_submission.copy()
submission['is_fraud'] = test_predictions
submission.to_csv('submission.csv', index=False)

print("Submission file 'submission.csv' created successfully.")

Validation F1-Score: 0.7955
Submission file 'submission.csv' created successfully.
