In [30]:
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

In [31]:
# Read the three input CSVs: training data, test data, and the sample
# submission (its `id` column is reused when the submission is assembled).
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/train.csv',
        'data/test.csv',
        'data/sample_submission.csv',
    )
)

In [32]:
# Feature engineering
# The same transformations must be applied to train and test, so they live in
# one helper instead of two hand-maintained copies.
REFERENCE_YEAR = 2024  # year against which `age` is computed


def add_engineered_features(df, reference_year=REFERENCE_YEAR):
    """Return a copy of ``df`` with the derived model features added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``unix_time`` (epoch seconds), ``first``,
        ``last`` and ``dob``.
    reference_year : int, default ``REFERENCE_YEAR``
        Year from which the birth year is subtracted to obtain ``age``.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) in which

        * ``hour`` is the hour of day (0-23) of ``unix_time`` parsed as epoch
          seconds; no timezone conversion is applied,
        * ``full_name`` is ``first`` and ``last`` joined by a single space,
        * ``dob`` is converted to datetime, with unparseable values -> ``NaT``,
        * ``age`` is ``reference_year - birth year`` (a calendar-year
          difference, not an exact age; ``NaN`` where ``dob`` is ``NaT``).
    """
    # Parse once so both `dob` and `age` are derived from the same values.
    dob = pd.to_datetime(df['dob'], errors='coerce')
    # Keyword order matters: new columns are appended in this order, and the
    # existing `dob` column is overwritten in place.
    return df.assign(
        hour=pd.to_datetime(df['unix_time'], unit='s').dt.hour,
        full_name=df['first'] + ' ' + df['last'],
        dob=dob,
        age=reference_year - dob.dt.year,
    )


train = add_engineered_features(train)
test = add_engineered_features(test)


In [33]:
# Columns that are not passed to the model. `first` and `last` have already
# been folded into `full_name` by the feature-engineering step.
drop_columns = [
    'trans_num', 'trans_date', 'trans_time',
    'first', 'last',
    'long', 'lat', 'merch_lat', 'merch_long',
]
# Dropped in place on both frames; re-running this cell after it has succeeded
# raises KeyError because the columns are already gone.
train.drop(drop_columns, axis='columns', inplace=True)
test.drop(drop_columns, axis='columns', inplace=True)

# Columns to be handed to CatBoost as categorical features
categorical_columns = [
    'category', 'gender', 'city', 'state',
    'job', 'merchant', 'full_name', 'street',
]


In [34]:
# Split the training frame into the label (`is_fraud`) and the predictors.
y = train['is_fraud']
X = train.drop('is_fraud', axis='columns')
# X_test is just another name for `test` (same object, no copy is made).
X_test = test

In [35]:
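# Split train data into training and validation sets
# (currently disabled: the final model is fit on the full training data.
#  Uncomment the line below to create X_val / y_val, which the validation
#  cells further down need.)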
#X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
# Configure the CatBoost classifier (log-loss objective, progress logged
# every 200 iterations).
# NOTE(review): learning_rate=0.7 looks unusually aggressive for 5000
# iterations — TODO confirm this is intentional.
model = CatBoostClassifier(
    loss_function='Logloss',
    iterations=5000,
    learning_rate=0.7,
    depth=6,
    verbose=200,
)

# Positional index of every categorical column within X
cat_feature_indices = list(map(X.columns.get_loc, categorical_columns))

# Fit on all of X / y; no rows are held out for validation here.
model.fit(X, y, cat_features=cat_feature_indices)


0:	learn: 0.1373031	total: 538ms	remaining: 44m 49s
200:	learn: 0.0071388	total: 1m 38s	remaining: 39m 18s
400:	learn: 0.0050773	total: 3m 10s	remaining: 36m 20s
600:	learn: 0.0040147	total: 4m 37s	remaining: 33m 48s
800:	learn: 0.0036021	total: 5m 53s	remaining: 30m 55s
1000:	learn: 0.0032795	total: 7m 8s	remaining: 28m 32s
1200:	learn: 0.0029315	total: 8m 25s	remaining: 26m 39s
1400:	learn: 0.0026518	total: 9m 43s	remaining: 25m
1600:	learn: 0.0025728	total: 11m 1s	remaining: 23m 24s
1800:	learn: 0.0024187	total: 12m 20s	remaining: 21m 55s
2000:	learn: 0.0023541	total: 13m 33s	remaining: 20m 18s
2200:	learn: 0.0022444	total: 14m 50s	remaining: 18m 52s
2400:	learn: 0.0022035	total: 16m 4s	remaining: 17m 24s
2600:	learn: 0.0021467	total: 17m 20s	remaining: 15m 59s
2800:	learn: 0.0021173	total: 18m 41s	remaining: 14m 40s
3000:	learn: 0.0020932	total: 20m	remaining: 13m 19s
3200:	learn: 0.0020817	total: 21m 23s	remaining: 12m 1s
3400:	learn: 0.0020595	total: 22m 43s	remaining: 10m 40s
36

<catboost.core.CatBoostClassifier at 0x1afe47ff010>

In [37]:
# Make predictions on the validation set.
# X_val only exists once a train/validation split cell has actually been run.
# Without it, the bare call raised NameError on a fresh kernel and stopped
# "Run All" before the submission cells, so the call is guarded and
# y_val_pred is None when there is nothing to score.
# NOTE(review): if the model was fit on data that includes X_val, these
# predictions are not a true hold-out estimate.
if 'X_val' in globals():
    y_val_pred = model.predict(X_val)
else:
    y_val_pred = None

In [38]:
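# Calculate the F1 score
# (disabled: needs y_val / y_val_pred, which only exist when the
#  train/validation split earlier in the notebook is enabled)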
#f1 = f1_score(y_val, y_val_pred)
#print(f'F1 Score on Validation Set: {f1}')

# Show detailed classification report
#print("Classification Report on Validation Set:")
#print(classification_report(y_val, y_val_pred))

In [39]:
# Predict labels for the test frame with the fitted model; the result is
# written to the submission's `is_fraud` column further down.
y_test_pred = model.predict(data=X_test)

In [40]:
# Pair the ids from the sample submission with the test predictions.
# The pairing is purely positional: row i of sample_submission receives
# prediction i.
submission_columns = {
    'id': sample_submission['id'],
    'is_fraud': y_test_pred,
}
submission = pd.DataFrame(submission_columns)

In [41]:
# Write the submission to disk without the DataFrame index, then confirm.
submission.to_csv(path_or_buf='submission.csv', index=False)
print("Submission file saved as submission.csv")


Submission file saved as submission.csv
