<a href="https://colab.research.google.com/github/cinnData/UMDataWeek-2023/blob/main/Notebooks/fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [UM-03] Fraud in credit card transactions

### Stop warnings

In [None]:
import warnings
# No category or module filter is passed, so every warning raised later in the
# notebook is silenced, including any emitted by pandas or scikit-learn.
warnings.filterwarnings("ignore")

### Importing data

In [None]:
import pandas as pd

In [None]:
path = 'https://raw.githubusercontent.com/cinnData/UMDataWeek-2023/main/Data/'
# One zipped CSV per month; the first CSV column becomes the row index.
months = ['may', 'jun', 'jul', 'aug', 'sep']
# Read every monthly file first and concatenate once: calling pd.concat inside
# the loop re-copies the accumulated frame on each iteration.
monthly_frames = [pd.read_csv(path + 'fraud_' + m + '.csv.zip', index_col=0) for m in months]
df = pd.concat(monthly_frames)

In [None]:
# Summary of the combined frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Calendar-day key: the first 10 characters of the TX_DATETIME string
# (presumably 'YYYY-MM-DD', the format the later date filters compare against).
df['date'] = df['TX_DATETIME'].str.slice(stop=10)

### Fraud rate

In [None]:
# Total of the TX_FRAUD column (the number of fraudulent transactions if the
# column is 0/1 coded; TODO confirm).
df['TX_FRAUD'].sum()

In [None]:
# Mean of TX_FRAUD rounded to 4 decimals (the fraud rate if the column is
# 0/1 coded; TODO confirm).
df['TX_FRAUD'].mean().round(4)

### Q1a. Target vector and feature matrix

In [None]:
# Target vector: the TX_FRAUD column of the full data set.
y = df['TX_FRAUD']

In [None]:
# Columns used as model inputs, in the order they appear in the feature matrix.
feature_cols = [
  'TX_AMOUNT',
  'TX_DURING_WEEKEND',
  'TX_DURING_NIGHT',
  'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
  'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW',
  'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
  'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW',
  'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
  'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW',
  'TERMINAL_ID_NB_TX_1DAY_WINDOW',
  'TERMINAL_ID_RISK_1DAY_WINDOW',
  'TERMINAL_ID_NB_TX_7DAY_WINDOW',
  'TERMINAL_ID_RISK_7DAY_WINDOW',
  'TERMINAL_ID_NB_TX_30DAY_WINDOW',
  'TERMINAL_ID_RISK_30DAY_WINDOW',
]
# Feature matrix: the selected columns of the full data set.
X = df[feature_cols]

### Training data

In [None]:
# Rows whose day falls in the training week (both end dates included).
in_train_week = (df['date'] >= '2018-04-30') & (df['date'] <= '2018-05-06')
X_train = X[in_train_week]

In [None]:
# (rows, columns) of the training feature matrix.
X_train.shape

In [None]:
# Labels for the training week 2018-04-30 .. 2018-05-06 (both ends included).
train_days = df['date'].between('2018-04-30', '2018-05-06')
y_train = y.loc[train_days]

### Q2a. Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
# Every hyperparameter is left at its default except the solver's iteration
# cap, which is set to 1500.
# NOTE(review): presumably raised so the solver converges on the unscaled
# features; confirm.
clf = LogisticRegression(max_iter=1500)

In [None]:
# Fit on the training week; the trailing semicolon hides the estimator repr.
clf.fit(X_train, y_train);

### Q2b. Scoring

In [None]:
# Test week (both end dates included). .copy() makes df_test an independent
# frame, so the `score` column assigned to it later is written to df_test
# itself rather than to a slice of df, the pattern that triggers pandas'
# SettingWithCopyWarning.
df_test = df[df['date'].between('2018-05-14', '2018-05-20')].copy()
df_test.shape

In [None]:
# Reuse the columns of the training feature matrix X instead of repeating the
# list, so the train and test features cannot drift apart.
X_test = df_test[X.columns]

In [None]:
# Column 1 of predict_proba is the probability of the second class listed in
# clf.classes_ (the fraud class if TX_FRAUD is 0/1 coded; TODO confirm).
df_test['score'] = clf.predict_proba(X_test)[:, 1]

### Q2c. Precision top-100

In [None]:
def prec100(d, k=100, tx=None):
  """Count the fraudulent transactions among the k highest-scored ones of day d.

  Parameters
  ----------
  d : value compared with the 'date' column to select one day.
  k : number of top-scored transactions to inspect (default 100).
  tx : DataFrame with 'date', 'score' and 'TX_FRAUD' columns. When omitted,
    the notebook-level df_test is used.

  Returns
  -------
  The number of inspected rows with TX_FRAUD == 1. This is a count, not a
  ratio; divide by the number of inspected rows to obtain a precision.
  """
  if tx is None:
    tx = df_test
  day = tx[tx['date'] == d]
  # Highest scores first, then keep at most k rows.
  top = day.sort_values('score', ascending=False).head(k)
  return (top['TX_FRAUD'] == 1).sum()

In [None]:
# prec100 for every distinct day of the test set, in order of first appearance.
list(map(prec100, df_test['date'].unique()))

### Q3. Card precision top-100

In [None]:
def cprec100(d, k=100, tx=None):
  """Count the fraudulent cards among the k highest-scored cards of day d.

  Each CUSTOMER_ID is reduced to one row holding the maximum TX_FRAUD and the
  maximum score over its transactions of the day; cards are then ranked by
  that score.

  Parameters
  ----------
  d : value compared with the 'date' column to select one day.
  k : number of top-scored cards to inspect (default 100).
  tx : DataFrame with 'date', 'CUSTOMER_ID', 'score' and 'TX_FRAUD' columns.
    When omitted, the notebook-level df_test is used.

  Returns
  -------
  The number of inspected cards whose maximum TX_FRAUD equals 1. This is a
  count, not a ratio.
  """
  if tx is None:
    tx = df_test
  day = tx[tx['date'] == d]
  # One row per card: flagged if any transaction is, scored by its top score.
  cards = day.groupby('CUSTOMER_ID')[['TX_FRAUD', 'score']].max()
  top = cards.sort_values('score', ascending=False).head(k)
  return (top['TX_FRAUD'] == 1).sum()

In [None]:
# cprec100 for every distinct day of the test set, in order of first appearance.
list(map(cprec100, df_test['date'].unique()))

### Q4a. Total fraud and detected fraud

In [None]:
def total_fraud(d, tx=None):
  """Sum the TX_FRAUD column over the transactions of day d.

  Parameters
  ----------
  d : value compared with the 'date' column to select one day.
  tx : DataFrame with 'date' and 'TX_FRAUD' columns. When omitted, the
    notebook-level df_test is used.

  Returns
  -------
  The sum of TX_FRAUD over the selected rows (0 when no row matches).
  """
  if tx is None:
    tx = df_test
  return tx.loc[tx['date'] == d, 'TX_FRAUD'].sum()

In [None]:
# One row per test day, in order of first appearance in df_test.
test_days = df_test['date'].unique()
pd.DataFrame({
  'total_fraud': [total_fraud(day) for day in test_days],
  'capt_fraud': [prec100(day) for day in test_days],
})

### Q4b. Total and detected fraudulent cards

In [None]:
def total_card_fraud(d, tx=None):
  """Count the cards with at least one flagged transaction on day d.

  Only TX_FRAUD is aggregated, so the function does not require the 'score'
  column to exist.

  Parameters
  ----------
  d : value compared with the 'date' column to select one day.
  tx : DataFrame with 'date', 'CUSTOMER_ID' and 'TX_FRAUD' columns. When
    omitted, the notebook-level df_test is used.

  Returns
  -------
  The sum over cards of the per-card maximum of TX_FRAUD.
  """
  if tx is None:
    tx = df_test
  day = tx[tx['date'] == d]
  # Per-card maximum collapses repeated frauds on one card into a single flag.
  return day.groupby('CUSTOMER_ID')['TX_FRAUD'].max().sum()

In [None]:
# One row per test day, in order of first appearance in df_test.
test_days = df_test['date'].unique()
pd.DataFrame({
  'total_card_fraud': [total_card_fraud(day) for day in test_days],
  'capt_card_fraud': [cprec100(day) for day in test_days],
})