In [None]:
!pip install numpy
!pip install pandas
!pip install matplotlib
!pip install seaborn



In [25]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, precision_score, f1_score, confusion_matrix

import pandas as pd

In [None]:
# Read the three raw CSV inputs (payers, seller terminals, training
# transactions) from the working directory, one DataFrame per file.
payers, seller, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'payers-v1.csv',
        'seller_terminals-v1.csv',
        'transactions_train-v1.csv',
    )
)

In [27]:
# Print the column/dtype/non-null summary of each raw table, in load order.
for raw_frame in (payers, seller, transactions):
    raw_frame.info()

In [None]:
# Switch the transaction columns to the upper-case names used by the rest of
# the notebook, then parse the timestamp column and cast the fraud flag to int.
transactions = (
    transactions
    .rename(columns={
        'transaction_id': 'TRANSACTION_ID',
        'card_id': 'CARD_ID',
        'terminal_id': 'TERMINAL_ID',
        'tx_datetime': 'DATETIME',
        'tx_amount': 'AMOUNT',
        'is_fraud': 'IS_FRAUD',
    })
    .assign(
        DATETIME=lambda tx: pd.to_datetime(tx['DATETIME']),
        IS_FRAUD=lambda tx: tx['IS_FRAUD'].astype(int),
    )
)

In [None]:
# Align the join-key names of the lookup tables with the transaction table.
payers = payers.rename(columns={'card_hash': 'CARD_ID'})
seller = seller.rename(columns={
    'terminal_id': 'TERMINAL_ID',
    'latitude': 'LATITUDE',
    'longitude': 'LONGITUDE',
})

# Left-join payer and terminal attributes onto every transaction, then order
# the combined table chronologically with a fresh 0..n-1 index.
df_full = (
    transactions
    .merge(payers, how='left', on='CARD_ID')
    .merge(seller, how='left', on='TERMINAL_ID')
    .sort_values('DATETIME')
    .reset_index(drop=True)
)

## Feature Engineering

In [None]:
# Build trailing-history features: for every transaction, the number of earlier
# transactions and their mean amount for the same card / terminal inside each
# configured time window.
df_full = df_full.fillna(0)
# Offset windows such as '7D' require a datetime index.
df_full = df_full.set_index('DATETIME')

# (grouping column, trailing windows to compute for that grouping)
GROUP_CONFIGS = [
    ('CARD_ID', ['7D', '30D']),
    ('TERMINAL_ID', ['1D', '7D'])
]

for GROUP_COL, TIME_WINDOWS in GROUP_CONFIGS:
    df_grouped = df_full.groupby(GROUP_COL)
    for window in TIME_WINDOWS:
        tx_count_col = f'{GROUP_COL}_TX_COUNT_{window}'
        avg_amount_col = f'{GROUP_COL}_AVG_AMOUNT_{window}'

        # closed='left' leaves the current transaction out of its own window,
        # so each feature only summarises earlier activity.
        #
        # `transform` hands the result back in the frame's original row order.
        # The earlier version dropped the group level of a grouped-rolling
        # result and relied on label alignment over DATETIME, which cannot be
        # reindexed when timestamps repeat. The trade-off is one Python call
        # per group, which is slower than the grouped rolling.
        #
        # The count is taken over AMOUNT: after the fillna(0) above it has no
        # missing values, so this equals the number of rows in the window
        # without requiring TRANSACTION_ID to be numeric.
        tx_count_result = df_grouped['AMOUNT'].transform(
            lambda amounts: amounts.rolling(window=window, closed='left').count()
        )
        avg_amount_result = df_grouped['AMOUNT'].transform(
            lambda amounts: amounts.rolling(window=window, closed='left').mean()
        )

        # Assign by position so no index alignment is attempted.
        df_full[tx_count_col] = tx_count_result.to_numpy()
        df_full[avg_amount_col] = avg_amount_result.to_numpy()

# Restore DATETIME as a column; empty windows yield NaN means, stored as 0.
df_full = df_full.reset_index().fillna(0)

## Dataset Splitting

In [None]:
# Chronological split by calendar month: the last month is the test set, the
# one before it is the validation set, and everything earlier is training data.
tx_month = df_full['DATETIME'].dt.to_period('M')
df_full['TX_MONTH'] = tx_month
all_months = sorted(tx_month.unique())
NUM_MONTHS = len(all_months)

TRAIN_MONTHS = all_months[:-2]
VAL_MONTH = all_months[-2]
TEST_MONTH = all_months[-1]

df_train = df_full.loc[df_full['TX_MONTH'].isin(TRAIN_MONTHS)]
df_validation = df_full.loc[df_full['TX_MONTH'].eq(VAL_MONTH)]
df_test = df_full.loc[df_full['TX_MONTH'].eq(TEST_MONTH)]

# Columns that must not be fed to the model: the target, identifiers,
# timestamps, coordinates and descriptive terminal/card fields.
COLS_TO_DROP = [
    'IS_FRAUD', 'DATETIME', 'TX_MONTH', 'TRANSACTION_ID', 'CARD_ID',
    'TERMINAL_ID', 'LATITUDE', 'LONGITUDE', 'card_first_transaction',
    'terminal_operation_start', 'terminal_soft_descriptor',
]
features = list(filter(lambda name: name not in COLS_TO_DROP, df_full.columns))

# Feature matrix (missing values replaced by 0) and target for each split.
X_train, y_train = df_train[features].fillna(0), df_train['IS_FRAUD']
X_val, y_val = df_validation[features].fillna(0), df_validation['IS_FRAUD']
X_test, y_test = df_test[features].fillna(0), df_test['IS_FRAUD']

## Model Training

In [None]:
# Standardise the features, then classify with k-nearest neighbours.
pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier()),
])

# Hyper-parameter candidates; the 'knn__' prefix routes each one to the
# 'knn' step of the pipeline.
param_grid = {
    'knn__n_neighbors': [3, 5, 7],
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan'],
}

# Exhaustive search over the grid, scored by F1 with 3-fold cross-validation,
# using every available core.
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='f1',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

print('🔹 Training model with GridSearchCV...')
grid_search.fit(X_train, y_train)

# Keep the pipeline that was refit with the best-scoring combination.
best_model = grid_search.best_estimator_
print('\n✅ Best parameter combination found:')
print(grid_search.best_params_)

## Evaluation

In [None]:
def _print_split_report(header, matrix_title, y_true, y_pred):
    """Print precision, recall, F1 and the confusion matrix for one data split.

    `header` and `matrix_title` are printed verbatim before the metrics and
    before the confusion matrix respectively.
    """
    print(header)
    print(f'Precision: {precision_score(y_true, y_pred):.4f}')
    print(f'Recall: {recall_score(y_true, y_pred):.4f}')
    print(f'F1-Score: {f1_score(y_true, y_pred):.4f}')
    print(matrix_title)
    print(confusion_matrix(y_true, y_pred))


# Score the tuned model on the validation month.
y_val_pred = best_model.predict(X_val)
_print_split_report(
    '\n🔹 Validation set evaluation:',
    '\nConfusion Matrix (Validation):',
    y_val,
    y_val_pred,
)

# Score the tuned model on the held-out test month.
y_test_pred = best_model.predict(X_test)
_print_split_report(
    '\n🔹 Test set evaluation:',
    '\nConfusion Matrix (Test):',
    y_test,
    y_test_pred,
)