<a href="https://colab.research.google.com/github/cinnData/UMDataWeek-2023/blob/main/Notebooks/fraud2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# [UM-03] Fraud in credit card transactions (cont.)

### Stop warnings

In [None]:
import warnings
# Ignore every warning for the rest of the session. This is a blanket filter:
# it also hides any warnings that pandas or scikit-learn emit in later cells.
warnings.filterwarnings("ignore")

### Importing data

In [None]:
import pandas as pd

In [None]:
# Base URL of the folder that holds one zipped CSV file per month.
path = 'https://raw.githubusercontent.com/cinnData/UMDataWeek-2023/main/Data/'
# Read the five monthly files (May to September) and stack them with a single
# concat. Concatenating inside the loop would re-copy all previously loaded
# rows on every iteration.
months = ['may', 'jun', 'jul', 'aug', 'sep']
df = pd.concat([pd.read_csv(path + 'fraud_' + m + '.csv.zip', index_col=0) for m in months])

In [None]:
# Keep the first 10 characters of each TX_DATETIME string as the 'date' column
# (presumably the YYYY-MM-DD part of the timestamp -- TODO confirm the format).
df['date'] = df['TX_DATETIME'].str.slice(stop=10)

In [None]:
# Distinct dates, explicitly sorted. mytest() picks its training and test
# windows by position in `days`, so the array must be in chronological order;
# Series.unique() on its own only returns values in order of appearance.
# Assumes the date strings sort lexicographically in calendar order (as the
# string-based .between() filters in mytest already require) -- TODO confirm.
days = df['date'].drop_duplicates().sort_values().to_numpy()

### Model specification

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression classifier with the iteration cap set to 1500; every
# other setting is left at its default. mytest() refits this same object on
# each call.
clf = LogisticRegression(max_iter=1500)

### Target vector and feature matrix

In [None]:
# Target vector: the TX_FRAUD column (mytest() counts rows where it equals 1).
y = df['TX_FRAUD']

In [None]:
# Feature matrix: the transaction amount, the weekend/night columns, and the
# customer- and terminal-level columns for the 1-, 7- and 30-day windows.
X = df.loc[:, [
    'TX_AMOUNT',
    'TX_DURING_WEEKEND',
    'TX_DURING_NIGHT',
    'CUSTOMER_ID_NB_TX_1DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_1DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_7DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_7DAY_WINDOW',
    'CUSTOMER_ID_NB_TX_30DAY_WINDOW',
    'CUSTOMER_ID_AVG_AMOUNT_30DAY_WINDOW',
    'TERMINAL_ID_NB_TX_1DAY_WINDOW',
    'TERMINAL_ID_RISK_1DAY_WINDOW',
    'TERMINAL_ID_NB_TX_7DAY_WINDOW',
    'TERMINAL_ID_RISK_7DAY_WINDOW',
    'TERMINAL_ID_NB_TX_30DAY_WINDOW',
    'TERMINAL_ID_RISK_30DAY_WINDOW',
]]

### Q5a. Testing function

In [None]:
def mytest(w, top_n=100):
    """Train on week `w` and evaluate on the week that starts 14 days later.

    The notebook-level classifier `clf` is refit on the transactions whose
    date lies between ``days[t]`` and ``days[t + 6]``, where ``t = 7*(w - 1)``.
    The transactions dated between ``days[t + 14]`` and ``days[t + 20]`` are
    then scored with the predicted probability of the positive class (the
    seven entries of `days` in between are used for neither step). For each
    test day, the `top_n` highest-scoring transactions are selected and the
    ones with ``TX_FRAUD == 1`` are counted.

    Uses the notebook-level objects `df`, `X`, `y`, `days` and `clf`.
    Side effect: `clf` is refit in place on every call.

    Parameters
    ----------
    w : int
        1-based number of the training week.
    top_n : int, default 100
        Number of highest-scoring transactions inspected per test day.

    Returns
    -------
    pandas.DataFrame
        One row per test day, in order of appearance, with the columns
        ``total_fraud`` (sum of TX_FRAUD that day) and ``capt_fraud``
        (frauds found among the `top_n` highest scores that day).

    Raises
    ------
    ValueError
        If `w` is smaller than 1 (negative positions would silently wrap
        around to the end of `days`).
    IndexError
        If the test window runs past the last entry of `days`.
    """
    if w < 1:
        raise ValueError(f"w must be at least 1, got {w}")
    t = 7 * (w - 1)
    if t + 20 >= len(days):
        raise IndexError(
            f"week {w} needs days[{t + 20}] but only {len(days)} days are available")

    # Fit on the training week.
    train_mask = df['date'].between(days[t], days[t + 6])
    clf.fit(X[train_mask], y[train_mask])

    # Score the test week. The features come from X through the same kind of
    # mask used for training, so the column list is defined in one place only.
    # .assign() builds a new frame, so nothing is written onto a slice of df.
    test_mask = df['date'].between(days[t + 14], days[t + 20])
    df_test = df.loc[test_mask, ['date', 'TX_FRAUD']].assign(
        score=clf.predict_proba(X[test_mask])[:, 1])

    # sort=False keeps the days in order of appearance, like Series.unique().
    rows = []
    for _, day_tx in df_test.groupby('date', sort=False):
        top = day_tx.sort_values('score', ascending=False).head(top_n)
        rows.append({'total_fraud': day_tx['TX_FRAUD'].sum(),
                     'capt_fraud': (top['TX_FRAUD'] == 1).sum()})
    return pd.DataFrame(rows, columns=['total_fraud', 'capt_fraud'])

### Q5b. Packing

In [None]:
# Start the results table with the evaluation of week 1.
mytable = mytest(1)

In [None]:
# Evaluate weeks 2 to 20 and append the results to the existing table with a
# single concat, instead of re-copying the growing table on every iteration.
mytable = pd.concat([mytable] + [mytest(w) for w in range(2, 21)])

In [None]:
# Renumber the rows 0..n-1. The row count is taken from the table itself, so
# this keeps working if the number of evaluated weeks (or days) changes.
mytable = mytable.reset_index(drop=True)

### Q5c. Plotting

In [None]:
# Line plot of both columns against the row number of the results table.
# Title and axis labels are set so that the figure can be read on its own.
ax = mytable.plot(figsize=(10,5))
ax.set(title='Daily fraud: total vs. captured among the top-scored transactions',
       xlabel='Test day (row number)',
       ylabel='Number of fraudulent transactions');