In [45]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

In [46]:
# Load datasets
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')

In [47]:
# Feature engineering
# 1. Convert unix_time to hour (use modulo 24 to extract the hour)
train['hour'] = pd.to_datetime(train['unix_time'], unit='s').dt.hour
test['hour'] = pd.to_datetime(test['unix_time'], unit='s').dt.hour

# 2. Combine first and last name into one feature (if they exist)
train['full_name'] = train['first'] + ' ' + train['last']
test['full_name'] = test['first'] + ' ' + test['last']

# 3. Convert dob to age based on 2024 (current year)
train['dob'] = pd.to_datetime(train['dob'], errors='coerce')
test['dob'] = pd.to_datetime(test['dob'], errors='coerce')

train['age'] = 2024 - train['dob'].dt.year
test['age'] = 2024 - test['dob'].dt.year

In [48]:
# Drop unnecessary columns
drop_columns = ['trans_num', 'trans_date', 'trans_time', 'first', 'last', 
                'street', 'long', 'lat', 'city_pop', 'merch_lat', 'merch_long']
train.drop(columns=drop_columns, inplace=True)
test.drop(columns=drop_columns, inplace=True)

# Identify categorical features
categorical_columns = ['category', 'gender', 'city', 'state', 'job', 'merchant', 'full_name']  # Add more categorical columns if needed


In [49]:
# Prepare the feature set and target variable
X = train.drop(columns=['is_fraud'])
y = train['is_fraud']
X_test = test

In [50]:
# Split train data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Initialize and train the CatBoost model
model = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='Logloss', verbose=200)

# Pass the categorical feature indices
cat_feature_indices = [X.columns.get_loc(col) for col in categorical_columns]

model.fit(X_train, y_train, eval_set=(X_val, y_val), use_best_model=True, cat_features=cat_feature_indices)


0:	learn: 0.5443317	test: 0.5439176	best: 0.5439176 (0)	total: 477ms	remaining: 7m 56s
200:	learn: 0.0284685	test: 0.0280762	best: 0.0280762 (200)	total: 1m 35s	remaining: 6m 20s


In [None]:
# Make predictions on the validation set
y_val_pred = model.predict(X_val)

In [None]:
# Calculate the F1 score
f1 = f1_score(y_val, y_val_pred)
print(f'F1 Score on Validation Set: {f1}')

# Show detailed classification report
print("Classification Report on Validation Set:")
print(classification_report(y_val, y_val_pred))

F1 Score on Validation Set: 0.9853296411948389
Classification Report on Validation Set:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65592
           1       0.99      0.98      0.99      8549

    accuracy                           1.00     74141
   macro avg       0.99      0.99      0.99     74141
weighted avg       1.00      1.00      1.00     74141



In [None]:
# Make predictions on the test set (for submission)
y_test_pred = model.predict(X_test)

In [None]:
submission = pd.DataFrame({
    'id': sample_submission['id'],
    'is_fraud': y_test_pred
})

In [None]:
# Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file saved as submission.csv")


Submission file saved as submission.csv
