# Introduction to Catching Fraud

In [1]:
# import python libraries
import folium
import pandas as pd
import numpy as np

### Load data from file into dataframe

In [97]:
# Read the transaction dataset from the CSV file in the working directory.
data_pd = pd.read_csv('gwc-risk-dataset.csv')
# Newer pandas releases include the literal text "None" in read_csv's default NA markers
# (see the read_csv `na_values` documentation). That would turn the "no fraud" label into NaN,
# and every `fraud_type != 'None'` comparison in this notebook would then count those rows as
# fraud. Restoring the label keeps the results the same regardless of pandas version.
# NOTE(review): this also maps genuinely blank fraud_type cells to 'None' - confirm that is intended.
data_pd['fraud_type'] = data_pd['fraud_type'].fillna('None')
print(data_pd.shape)

(100000, 9)


In [98]:
# Preview the first rows to check the column names and value formats.
data_pd.head()

Unnamed: 0,transaction_id,payment_type,disbursement_type,recipient_country,send_amount,application,sender_device_latitude,sender_device_longitude,fraud_type
0,1,credit card,cash delivery,CR,7833.77,mobile app,29.584452,-81.20787,
1,2,credit card,cash delivery,CR,3160.53,web,36.095692,-79.437799,
2,3,ach,cash pickup,HU,13533.96,web,39.896182,-104.981147,
3,4,credit card,bill payment,SE,83.36,web,30.627977,-96.334407,unauthorized payment source
4,5,ach,cash pickup,RO,11617.6,mobile app,39.016951,-94.281615,


In [99]:
# Distinct values present in the payment_type column.
data_pd['payment_type'].unique()

array(['credit card', 'ach'], dtype=object)

In [100]:
# Distinct values present in the disbursement_type column.
data_pd['disbursement_type'].unique()

array(['cash delivery', 'cash pickup', 'bill payment', 'bank deposit'],
      dtype=object)

In [101]:
# Distinct values present in the application column.
data_pd['application'].unique()

array(['mobile app', 'web'], dtype=object)

In [102]:
# Distinct country codes present in the recipient_country column.
data_pd['recipient_country'].unique()

array(['CR', 'HU', 'SE', 'RO', 'BO', 'SI', 'NZ', 'VN', 'ES', 'PY', 'BG',
       'SK', 'CY', 'LV', 'EG', 'MX', 'CH', 'NP', 'HN', 'HR', 'LT', 'HK',
       'EC', 'UY', 'LU', 'BD', 'PH', 'GT', 'NO', 'PK', 'MT', 'PA', 'FI',
       'BR', 'SG', 'DE', 'DO', 'IN', 'PL', 'MA', 'GB', 'SV', 'TT', 'IT',
       'GH', 'ZA', 'JM', 'JP', 'DK', 'NI', 'KE', 'CA', 'CZ', 'AU', 'GY',
       'CO', 'PT', 'BE', 'CN', 'TN', 'HT', 'NG', 'EE', 'AT', 'PE', 'FR',
       'ID', 'CL', 'NL', 'IE', 'GR', 'AR', 'LK'], dtype=object)

In [103]:
# Distinct fraud labels; the string 'None' is the label compared against when flagging fraud.
data_pd['fraud_type'].unique()

array(['None', 'unauthorized payment source', 'account take over',
       'not sufficient funds'], dtype=object)

### Data manipulation

In [104]:
# One row per distinct (latitude, longitude) pair seen in the dataset.
# After selecting just the two coordinate columns, drop_duplicates() compares
# exactly those columns, so no explicit subset is needed.
coordinate_columns = ['sender_device_latitude', 'sender_device_longitude']
locations = data_pd[coordinate_columns].drop_duplicates()
locations.head()

Unnamed: 0,sender_device_latitude,sender_device_longitude
0,29.584452,-81.20787
1,36.095692,-79.437799
2,39.896182,-104.981147
3,30.627977,-96.334407
4,39.016951,-94.281615


In [105]:
# How many locations are there?
len(locations)

1097

## Calculate fraud rate for every location and plot on a map

In [106]:
# Fraud rate per sender location, computed in a single grouped pass instead of
# re-filtering the whole dataframe once per location.
#
# A transaction counts as fraudulent when its fraud_type is anything other than 'None'.
# The mean of that 0/1 indicator within a location equals
# (fraudulent transactions) / (all transactions) for that location.
#
# sort=False keeps locations in order of first appearance in data_pd.
# Rows with a missing latitude or longitude are left out of the grouping (pandas' default),
# rather than producing an empty selection and a division by zero.
fraud_rate_by_location = (
    data_pd['fraud_type'].ne('None').astype(float)
    .groupby(
        [data_pd['sender_device_latitude'], data_pd['sender_device_longitude']],
        sort=False,
    )
    .mean()
)

# Same shape as before: a list of [latitude, longitude, fraud_rate] entries,
# where fraud_rate is a fraction between 0 and 1.
location_stats = [
    [latitude, longitude, fraud_rate]
    for (latitude, longitude), fraud_rate in fraud_rate_by_location.items()
]

In [107]:
# Tile set names to choose from; only the first entry is used below.
# NOTE(review): whether the other names are accepted depends on the installed folium version - confirm before switching.
style = [
    'OpenStreetMap',
    'Stamen Terrain',
    'Stamen Toner',
    'Mapbox Bright',
]
# [latitude, longitude] used as the initial map centre.
Lat_Long_US = [45.7832641, -108.5709021]
m = folium.Map(
    location=Lat_Long_US,
    tiles=style[0],
    zoom_start=3,
)
# Display the empty base map.
m

In [108]:
# For every location, plot a circle on the map, coloured by its fraud rate.
# fraud_rate is a fraction between 0 and 1 (fraudulent / total transactions):
#   < 0.1 green, < 0.5 yellow, < 0.8 orange, otherwise red.
# Note: markers are added to the existing map `m`, so re-running this cell
# without re-creating `m` stacks a second set of markers on top of the first.
for latitude, longitude, fraud_rate in location_stats:
    if fraud_rate < 0.1:
        color = 'green'
    elif fraud_rate < 0.5:
        color = 'yellow'
    elif fraud_rate < 0.8:
        color = 'orange'
    else:
        color = 'red'
    folium.CircleMarker(
        location=[latitude, longitude],
        # The label says "percent", so scale the 0-1 fraction to 0-100
        # (previously a 12% rate was shown as "0.12 percent").
        popup='%.2f percent fraud rate' % (fraud_rate * 100),
        color=color,
        radius=10,
        fill=True,
        fill_color=color
    ).add_to(m)

# Display the map with all markers.
m

## How much fraud is there?

In [89]:
# Number of transactions for each fraud_type label.
pd.crosstab(index=data_pd['fraud_type'], columns='Count')

col_0,Count
fraud_type,Unnamed: 1_level_1
,79238
account take over,6805
not sufficient funds,195
unauthorized payment source,13762


In [90]:
# Transaction counts by application (rows) and fraud_type (columns).
pd.crosstab(index=data_pd['application'], columns=data_pd['fraud_type'])

fraud_type,None,account take over,not sufficient funds,unauthorized payment source
application,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
mobile app,39619,3389,96,1867
web,39619,3416,99,11895


In [91]:
# Transaction counts by payment_type (rows) and fraud_type (columns).
pd.crosstab(index=data_pd['payment_type'], columns=data_pd['fraud_type'])

fraud_type,None,account take over,not sufficient funds,unauthorized payment source
payment_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
ach,39598,3441,102,1905
credit card,39640,3364,93,11857


In [92]:
# Transaction counts by disbursement_type (rows) and fraud_type (columns).
pd.crosstab(index=data_pd['disbursement_type'], columns=data_pd['fraud_type'])

fraud_type,None,account take over,not sufficient funds,unauthorized payment source
disbursement_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank deposit,19597,82,36,97
bill payment,20037,77,53,10052
cash delivery,19744,76,31,98
cash pickup,19860,6570,75,3515


In [93]:
# Binary fraud flag: 1 for any fraud_type other than 'None', otherwise 0.
# Uses a vectorized comparison rather than calling a Python lambda once per row;
# the resulting values and integer dtype are the same.
data_pd['is_fraud'] = data_pd['fraud_type'].ne('None').astype('int64')
# Fraud vs. non-fraud transaction counts for each disbursement type.
pd.crosstab(index=data_pd['disbursement_type'], columns=data_pd['is_fraud'])

is_fraud,0,1
disbursement_type,Unnamed: 1_level_1,Unnamed: 2_level_1
bank deposit,19597,215
bill payment,20037,10182
cash delivery,19744,205
cash pickup,19860,10160


In [94]:
# Per-country counts of non-fraud (column 0) and fraud (column 1) transactions,
# indexed by recipient country code.
country_data = pd.crosstab(
    index=data_pd['recipient_country'],
    columns=data_pd['is_fraud'],
)
country_data.head()

is_fraud,0,1
recipient_country,Unnamed: 1_level_1,Unnamed: 2_level_1
AR,1165,140
AT,1118,140
AU,1121,155
BD,1077,153
BE,1061,137


### What country has the most fraudulent transactions?

In [95]:
# Column 1 holds the fraud counts; idxmax() gives the country code with the largest
# count, and .loc returns that country's row (both non-fraud and fraud counts).
country_data.loc[country_data[1].idxmax()]

is_fraud
0     1024
1    10012
Name: SI, dtype: int64

In [96]:
# Countries ordered from most to fewest fraudulent transactions (column 1 = fraud count).
country_data.sort_values(by=1, ascending=False)

is_fraud,0,1
recipient_country,Unnamed: 1_level_1,Unnamed: 2_level_1
SI,1024,10012
DO,1052,262
JM,1126,186
MX,1098,176
BR,1116,176
HU,1098,176
ID,1074,173
CR,1068,172
NI,1095,168
CN,1047,168


### Can you narrow down the fraud trend for SI?

In [None]:
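# Hint: first filter data_pd down to the rows where recipient_country is 'SI', then try
# running similar lines from above (like crosstab and group by) on that subset to see its
# fraud trends. Store the subset in a new variable: country_data already holds the
# per-country crosstab.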