# 1.) Import the Credit Card Fraud Data From CCLE

In [2]:
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np

In [3]:
drive.mount('/content/gdrive/', force_remount = True)

Mounted at /content/gdrive/


In [4]:
df = pd.read_csv("/content/gdrive/MyDrive/Econ441B/fraudTest.csv")

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


# 2.) Select four columns to use as features (one just be trans_date_trans)

In [6]:
df_select = df[["trans_date_trans_time", "category", "amt", "city_pop", "is_fraud"]]

In [7]:
df_select.head()

Unnamed: 0,trans_date_trans_time,category,amt,city_pop,is_fraud
0,2020-06-21 12:14:25,personal_care,2.86,333497,0
1,2020-06-21 12:14:33,personal_care,29.84,302,0
2,2020-06-21 12:14:53,health_fitness,41.28,34496,0
3,2020-06-21 12:15:15,misc_pos,60.05,54767,0
4,2020-06-21 12:15:17,travel,3.19,1126,0


# 3.) Create a unique variable out of trans_date.

In [8]:
#create your own variables

In [9]:
type(df['trans_date_trans_time'][0])

str

In [10]:
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])

In [11]:
df_select["time_var"] = [i.second for i in df ["trans_date_trans_time"]]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["time_var"] = [i.second for i in df ["trans_date_trans_time"]]


In [12]:
dummies = pd.get_dummies(df_select["category"])
X = pd.concat([dummies, df_select[["amt", "city_pop", "time_var"]]], axis = 1)
y = df_select["is_fraud"]

In [13]:
X.head()

Unnamed: 0,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel,amt,city_pop,time_var
0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2.86,333497,25
1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,29.84,302,33
2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,41.28,34496,53
3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,60.05,54767,15
4,0,0,0,0,0,0,0,0,0,0,0,0,0,1,3.19,1126,17


# 4.) Oversample the data (this will be your training data). Skip Q4

In [14]:
resample_X = X
resample_y = y

# 5.) Train a Logistic regression.

In [27]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
scaler = StandardScaler()
X_normalized = scaler.fit_transform(resample_X)

In [32]:
split_size = int(len(X)*0.8)
x_train, x_test = X[:split_size], X[split_size:]
y_train, y_test = y[:split_size], y[split_size:]

In [33]:
log_reg = LogisticRegression().fit(x_train, y_train)

# 6.) The company you are working for wants to target at a False Positive rate of 5% what threshold should you use? (Use oversampled data)

In [22]:
from sklearn.metrics import confusion_matrix

In [43]:
probs = log_reg.predict_proba(x_test)

threshold = 0
for t in np.arange(0, 1, 0.01):
  preds = probs[:,1] > t
  cm = confusion_matrix(y_test, preds)
  fpr = cm[0,1] / (cm[0,0] + cm [0,1])
  if fpr <= 0.05:
    thredshold = t
    break

In [42]:
# get the confusion matrix
confusion_matrix(y_test, preds)

array([[109717,   1243],
       [   100,     84]])

In [44]:
print("threshold:", thredshold)

threshold: 0.01


# 7.) If the company makes .02*amt on True transactions and loses -amt on False (Use original data)

In [45]:
preds = probs[:,1]>0.01
x_test

Unnamed: 0,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,personal_care,shopping_net,shopping_pos,travel,amt,city_pop,time_var
444575,0,0,0,0,0,0,1,0,0,0,0,0,0,0,2.01,1190,7
444576,0,1,0,0,0,0,0,0,0,0,0,0,0,0,163.80,1563,7
444577,0,0,0,0,0,0,0,0,0,0,0,0,1,0,2.33,1228,7
444578,0,0,0,0,0,0,0,0,1,0,0,0,0,0,163.97,564,52
444579,0,0,0,0,0,0,0,0,0,0,0,0,0,1,9.08,6284,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,0,0,0,0,0,1,0,0,0,0,0,0,0,0,43.77,519,7
555715,0,0,0,0,0,0,0,1,0,0,0,0,0,0,111.84,28739,9
555716,0,0,0,0,0,0,0,1,0,0,0,0,0,0,86.88,3684,15
555717,0,0,0,0,0,0,0,0,0,0,0,0,0,1,7.99,129,24


In [46]:
x_test["Preds"] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test["Preds"] = preds


In [61]:
x_test["signal"] = x_test["Preds"]* y_test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test["signal"] = x_test["Preds"]* y_test


In [62]:
df_temp = pd.concat([x_test, y_test], axis = 1 )

df_temp["Preds"] = [0 if i == False else 1 for i in df_temp["Preds"]]

df_temp["FN"] = (df_temp["is_fraud"] == 1) & (df_temp["Preds"] == 0)

df_temp 

Unnamed: 0,entertainment,food_dining,gas_transport,grocery_net,grocery_pos,health_fitness,home,kids_pets,misc_net,misc_pos,...,shopping_net,shopping_pos,travel,amt,city_pop,time_var,Preds,signal,is_fraud,FN
444575,0,0,0,0,0,0,1,0,0,0,...,0,0,0,2.01,1190,7,0,0,0,False
444576,0,1,0,0,0,0,0,0,0,0,...,0,0,0,163.80,1563,7,0,0,0,False
444577,0,0,0,0,0,0,0,0,0,0,...,0,1,0,2.33,1228,7,0,0,0,False
444578,0,0,0,0,0,0,0,0,1,0,...,0,0,0,163.97,564,52,0,0,0,False
444579,0,0,0,0,0,0,0,0,0,0,...,0,0,1,9.08,6284,3,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
555714,0,0,0,0,0,1,0,0,0,0,...,0,0,0,43.77,519,7,0,0,0,False
555715,0,0,0,0,0,0,0,1,0,0,...,0,0,0,111.84,28739,9,0,0,0,False
555716,0,0,0,0,0,0,0,1,0,0,...,0,0,0,86.88,3684,15,0,0,0,False
555717,0,0,0,0,0,0,0,0,0,0,...,0,0,1,7.99,129,24,0,0,0,False


In [64]:
df_temp ["profit"] = df_temp["signal"] * 0.02 * df_temp["amt"] + df_temp ["FN"] * (-df_temp["amt"])

In [65]:
df_temp["profit"].sum()

-20229.7552

# 8.) Using Logistic Regression Lasso to inform you. Would you use the selected features in a trusted prediction model?

In [44]:
from sklearn.linear_model import Lasso

In [68]:
model = LogisticRegression(solver = "liblinear", penalty = 'l1').fit(x_train, y_train)
model.coef_

array([[-8.76249748e-01, -9.68611932e-01, -3.38350070e-01,
        -4.48903888e-01,  7.74558132e-01, -9.42033077e-01,
        -1.16968321e+00, -1.01064828e+00,  7.90166411e-01,
        -6.56549039e-01, -6.63516177e-01,  8.01283604e-01,
        -4.66195830e-01, -5.39523385e+00,  2.37242560e-03,
        -6.37975536e-07, -6.32183074e-04]])

In [69]:
#As the output shown above, not all variables are very close to 0. Therefore, the selected features can be applied in a trusted prediction model. 