In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import datetime as dt
import pandas as pd

#  Import and read the fraudtrain.csv.
url = 'https://sf-uncc-final.s3.amazonaws.com/fraudTrain.csv'
ccf_df = pd.read_csv(url,parse_dates=['trans_date_trans_time',])


# Drop any null columns
ccf_df=ccf_df.dropna(axis='columns', how='all')
ccf_df.drop_duplicates(inplace=True)

# all null rows
ccf_df= ccf_df.dropna()

# Preview 
ccf_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [3]:
# Extract frequencies of transactions in last 1,7, & 30 days
def last1DayTransactionCount(x):
    temp = pd.Series(x.index, index = x.trans_date_trans_time, name='count_1_day').sort_index()
    count_1_day = temp.rolling('1d').count() - 1
    count_1_day.index = temp.values
    x['count_1_day'] = count_1_day.reindex(x.index)
    return x
def last7DaysTransactionCount(x):
    temp = pd.Series(x.index, index = x.trans_date_trans_time, name='count_7_days').sort_index()
    count_7_days = temp.rolling('7d').count() - 1
    count_7_days.index = temp.values
    x['count_7_days'] = count_7_days.reindex(x.index)
    return x
def last30DaysTransactionCount(x):
    temp = pd.Series(x.index, index = x.trans_date_trans_time, name='count_30_days').sort_index()
    count_30_days = temp.rolling('30d').count() - 1
    count_30_days.index = temp.values
    x['count_30_days'] = count_30_days.reindex(x.index)
    return x

In [4]:
ccf_df1 = ccf_df.groupby('cc_num').apply(last1DayTransactionCount)

In [5]:
ccf_df = ccf_df.groupby('cc_num').apply(last7DaysTransactionCount)

In [6]:
ccf_df = ccf_df.groupby('cc_num').apply(last30DaysTransactionCount)

In [7]:
ccf_df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,count_7_days,count_30_days
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0,0.0,0.0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0,0.0,0.0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0,0.0,0.0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0,0.0,0.0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0,0.0,0.0


In [8]:
# Generate our categorical variable list
ccf_cat = ccf_df.dtypes[ccf_df.dtypes == "object"].index.tolist()

In [9]:
# Check the number of unique values in each column
ccf_df[ccf_cat].nunique()

merchant         693
category          14
first            352
last             481
gender             2
street           983
city             894
state             51
job              494
dob              968
trans_num    1296675
dtype: int64

In [10]:
# Update fields to better reflect more legible items
ccf_df['age'] = dt.date.today().year-pd.to_datetime(ccf_df['dob']).dt.year
ccf_df['hour'] = pd.to_datetime(ccf_df['trans_date_trans_time']).dt.hour
ccf_df['day'] = pd.to_datetime(ccf_df['trans_date_trans_time']).dt.dayofweek
ccf_df['month'] = pd.to_datetime(ccf_df['trans_date_trans_time']).dt.month

# Build new DF with targeted features
ccf_df2 = ccf_df.drop(columns = ['merch_lat','merch_long','lat','long','unix_time','cc_num','Unnamed: 0','trans_date_trans_time','merchant','dob', 'first', 'last', 'street', 'trans_num', 'city', 'job'])

# Preview DF
ccf_df2.head()

Unnamed: 0,category,amt,gender,state,zip,city_pop,is_fraud,count_7_days,count_30_days,age,hour,day,month
0,misc_net,4.97,F,NC,28654,3495,0,0.0,0.0,34,0,1,1
1,grocery_pos,107.23,F,WA,99160,149,0,0.0,0.0,44,0,1,1
2,entertainment,220.11,M,ID,83252,4154,0,0.0,0.0,60,0,1,1
3,gas_transport,45.0,M,MT,59632,1939,0,0.0,0.0,55,0,1,1
4,misc_pos,41.96,M,VA,24433,99,0,0.0,0.0,36,0,1,1


In [11]:
# Create a function to label states by region
def label_states (row):
    if row['state'] in ['VT', 'ME', 'NH', 'MA', 'RI', 'CT','NY','PA','NJ']:
        return 'North-East'
    if row['state'] in ['WI','MI','IL','IN','OH','ND','SD','NE','KS','MN','IA','MO']:
        return 'Mid-West'
    if row['state'] in ['DE','MD','DC','VA','WV','NC','SC','GA','FL','KY','TN','MS','AL','AR','LA']:
        return 'South-East'
    if row['state'] in ['OK','TX','NM','AZ']:
        return 'South-West'
    if row['state'] in ['CA','NV','UT','CO','WY','ID','OR','WA','MT','AK','HI']:
        return 'West'
    return 'etc'

In [12]:
# Add regions to each row of data
ccf_df2['Region']=ccf_df2.apply(lambda row: label_states(row), axis=1)

In [13]:
# Drop the state column
ccf_df2 = ccf_df2.drop(columns = ['state'])

# Preview df
ccf_df2.head()

Unnamed: 0,category,amt,gender,zip,city_pop,is_fraud,count_7_days,count_30_days,age,hour,day,month,Region
0,misc_net,4.97,F,28654,3495,0,0.0,0.0,34,0,1,1,South-East
1,grocery_pos,107.23,F,99160,149,0,0.0,0.0,44,0,1,1,West
2,entertainment,220.11,M,83252,4154,0,0.0,0.0,60,0,1,1,West
3,gas_transport,45.0,M,59632,1939,0,0.0,0.0,55,0,1,1,West
4,misc_pos,41.96,M,24433,99,0,0.0,0.0,36,0,1,1,South-East


In [14]:
# Check the types of data in the columns
ccf_df2.dtypes

category          object
amt              float64
gender            object
zip                int64
city_pop           int64
is_fraud           int64
count_7_days     float64
count_30_days    float64
age                int64
hour               int64
day                int64
month              int64
Region            object
dtype: object

In [15]:
ccf_df.corr()['is_fraud'].abs().sort_values(ascending=False)

is_fraud         1.000000
amt              0.219404
count_30_days    0.046523
count_7_days     0.036301
hour             0.013799
month            0.012409
age              0.012378
unix_time        0.005078
Unnamed: 0       0.004767
zip              0.002162
city_pop         0.002136
lat              0.001894
merch_lat        0.001741
day              0.001739
merch_long       0.001721
long             0.001721
cc_num           0.000981
Name: is_fraud, dtype: float64

In [16]:
# Convert category, gender & region to dummy variables
df=pd.get_dummies(ccf_df2, drop_first=True)
df

Unnamed: 0,amt,zip,city_pop,is_fraud,count_7_days,count_30_days,age,hour,day,month,...,category_misc_pos,category_personal_care,category_shopping_net,category_shopping_pos,category_travel,gender_M,Region_North-East,Region_South-East,Region_South-West,Region_West
0,4.97,28654,3495,0,0.0,0.0,34,0,1,1,...,0,0,0,0,0,0,0,1,0,0
1,107.23,99160,149,0,0.0,0.0,44,0,1,1,...,0,0,0,0,0,0,0,0,0,1
2,220.11,83252,4154,0,0.0,0.0,60,0,1,1,...,0,0,0,0,0,1,0,0,0,1
3,45.00,59632,1939,0,0.0,0.0,55,0,1,1,...,0,0,0,0,0,1,0,0,0,1
4,41.96,24433,99,0,0.0,0.0,36,0,1,1,...,1,0,0,0,0,1,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1296670,15.56,84735,258,0,21.0,86.0,61,12,6,6,...,0,0,0,0,0,1,0,0,0,1
1296671,51.70,21790,100,0,6.0,30.0,43,12,6,6,...,0,0,0,0,0,1,0,1,0,0
1296672,105.93,88325,899,0,36.0,139.0,55,12,6,6,...,0,0,0,0,0,1,0,0,1,0
1296673,74.90,57756,1126,0,24.0,133.0,42,12,6,6,...,0,0,0,0,0,1,0,0,0,0


In [17]:
# Split our preprocessed data into our features and target arrays
y = df["is_fraud"].values
X = df.drop(["is_fraud"], axis='columns').values

#Scale the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=13)

## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report

# Create the random forest classifier
model2 = RandomForestClassifier(random_state=13)

# Fit the model
model2.fit(X_train,y_train)

# Make predictions using the model
y_pred=model2.predict(X_test)

# Calculate and display the classification report and Confustion Matrix
print("Classification Report")
report = classification_report(y_test, y_pred)
print(report)
print("Confusion Matrix")
matrix = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(
    matrix, index=["True Valid", "True Fraud"], columns=["Predicted Valid", "Predicted Fraud"])
cm_df

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    322272
           1       0.98      0.76      0.86      1897

    accuracy                           1.00    324169
   macro avg       0.99      0.88      0.93    324169
weighted avg       1.00      1.00      1.00    324169

Confusion Matrix


Unnamed: 0,Predicted Valid,Predicted Fraud
True Valid,322242,30
True Fraud,449,1448
