In [None]:
## General Structure for all Notebooks 

## 1) Describe the problem being tackled, source of data, output 
## 2) Import libraries/set up wd; general working setup
## 3) Examine the quality of the data 
## 4) Build the right dataset 
## 5) Apply the structure

## Problem 
### What are we solving for? 

Build a ML model that predicts the probability that the first transaction of a new user is fraudulent.
You only have information about the user first transaction on the site and based on that
you have to make your classification

* **I suppose you would be looking for whether the IP address is within the right bounds for a country.**

For each user, determine their country based on the IP address


Build a model to predict whether an activity is fraudulent or not. 
Explain how different assumptions about the cost of false positives vs false negatives would impact the model


Your boss is a bit worried about using a model she doesn’t understand for something as important as fraud
detection. How would you explain her how the model is making the predictions? Not from a mathematical perspective
(she couldn’t care less about that), but from a user perspective. What kinds of users are more likely to be
classified as at risk? What are their characteristics?


Let’s say you now have this model which can be used live to predict in real time if an activity is fraudulent
or not. From a product perspective, how would you use it? That is, what kind of different user experiences
would you build based on the model output?

# Solution 

Random Forest model 

In [44]:
import pandas as pd
import numpy as np 
import os 
from datetime import datetime

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns
from termcolor import colored

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score

np.random.seed(4684)

wd = os.getcwd()
print(wd)

## Solution: https://product-data-science.datamasked.com/courses/496549/lectures/9194598 
## Github with Answers: https://github.com/JifuZhao/DS-Take-Home/blob/master/04.%20Identifying%20Fraudulent%20Activities.ipynb

/Users/annadudek/00_DataMasked


In [2]:
txns_raw=pd.read_csv("https://drive.google.com/uc?export=download&id=18RLruiMU8rM-IQPLdwL6wNEc8Kks2JZQ")
ips_raw=pd.read_csv("https://drive.google.com/uc?export=download&id=1wbKys6YI-IvE-b-C0_4xR4zz2YnpOL1d")


In [82]:
def describe_data(data):
    print(f" shape: {data.shape}")
    print('')
    print(data.dtypes)

    
    ## assume first id column in the dataset (in terms of left to right order)
    id_column = data.filter(like='id').columns.to_list()[0]
    if len(id_column) != 0:
        print('')
        print(f"Column taken as id: {id_column}")
        print(f"If {len(data)} = {len(data[id_column].unique())}, then dataframe is at this level")  
    else:    
        print('no id column')
        print(colored('############  Accuracy Metrics  ############','blue', attrs=['bold']))
    
    print('')
    return data.head(3)

In [84]:
def describe_data(data):
    print(f" shape: {data.shape}")
    print('')
    print(data.dtypes)

    
    ## assume first id column in the dataset (in terms of left to right order)
    try:
        id_column=data.filter(like='id').columns.to_list()[0]
        print('')
        print(f"Column taken as id: {id_column}")
        print(f"If {len(data)} = {len(data[id_column].unique())}, then dataframe is at this level") 

    except:
        print(colored('No id column','red', attrs=['bold']))

    
    print('')
    return data.head(3)

In [79]:
describe_data(txns_raw)


 shape: (151112, 17)

user_id                         int64
signup_time            datetime64[ns]
purchase_time          datetime64[ns]
purchase_value                  int64
device_id                      object
source                         object
browser                        object
sex                            object
age                             int64
ip_address                    float64
class                           int64
country                        object
time_between                    int64
same_day_purchase               int64
time_between_binned            object
signup_DOW                     object
purchase_DOW                   object
dtype: object

Column taken as id: user_id
If 151112 = 151112, then dataframe is at this level



Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,time_between,same_day_purchase,time_between_binned,signup_DOW,purchase_DOW
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,52,0,40-60 days,Tuesday,Saturday
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,0,1,,Sunday,Monday
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States,0,1,,Thursday,Thursday


In [85]:
describe_data(ips_raw)


 shape: (138846, 3)

lower_bound_ip_address    float64
upper_bound_ip_address      int64
country                    object
dtype: object
[1m[31mno id column[0m



Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China


In [56]:
# describe_data(ips_raw)
ips_raw.size

416538

## Assigning Country by IP Address 

In [6]:
## creates a list of NoneTypes the length of the df  
txns_raw_country = [None] * txns_raw.shape[0]
   
### For each entry in the txn_raws df, if the ip_address is greater than the lower bound ip address and less than the
### higher bound up address then assign the countries for which it meets the conditions.
## Then if there is only one country for which the row meets the conditions, input that country 

for i in range(txns_raw.shape[0]):
    tmp = ips_raw[(txns_raw['ip_address'][i] >= ips_raw['lower_bound_ip_address']) & 
                          (txns_raw['ip_address'][i] <= ips_raw['upper_bound_ip_address'])]['country'].values
    if (len(tmp) == 1):  
        txns_raw_country[i] = tmp
  

### assigns the values to the dataframe and gets rid of the squared brackets 
txns_raw['country'] = txns_raw_country
txns_raw['country'] = txns_raw['country'].str.get(0)

## Results
print(txns_raw.groupby('country').size().nlargest(10))
print('')
print(f" % of observations without a predicted country: {(sum(txns_raw.country.isnull())/len(txns_raw.country))*100}")

txns_raw['country'].fillna(value='Unknown', inplace=True)


#just keep the top 50 country, everything else is "other"; get countries from 51 to last one
bottom_countries = txns_raw.groupby('country').size().sort_values(ascending=False)[50:].index
x = dict.fromkeys(bottom_countries, 'Other')
txns_raw['country'] = txns_raw['country'].replace(x)

country
United States        58049
China                12038
Japan                 7306
United Kingdom        4490
Korea Republic of     4162
Germany               3646
France                3161
Canada                2975
Brazil                2961
Italy                 1944
dtype: int64

 % of observations without a predicted country: 14.53623802212928


## Predicting Fraudulent Behaviour

* Build a model to predict whether an activity is fraudulent or not. Explain how different assumptions about the cost of false positives vs false negatives would impact the model

1. Create features 
2. Random Forest 

In [7]:

## transform to dates should always run first because all date-like objects will be considered an object 
## and will be made into dummies, which will unnecessarily explode the number of dummies 

def transform_to_dates(data):

    to_transform_dates=data.filter(like='date').columns.to_list()
    if len(to_transform_dates) == 0:
        print('no objects named date; trying time')
        to_transform_dates=data.filter(like='time').columns.to_list()
        for i in to_transform_dates:
            data[i] = pd.to_datetime(data[i])
    else: 
        for i in to_transform_dates:
            data[i] = pd.to_datetime(data[i])
            
    print(data.dtypes)    
    return data

def make_dummies(data): 
    dummies_list = []
    variables=data.select_dtypes('object').columns.tolist()
    print(variables)
    
    
    for var in variables:   
        dummies_list = pd.get_dummies(data[var]).rename(columns=lambda x: str(var) + '_' + str(x))
        data=data.join(dummies_list)
#         del dummies_list
    
    return data    
    

In [8]:
## deal with dates // additional variables 

transform_to_dates(data = txns_raw)
txns_raw['time_between']= (txns_raw['purchase_time']-txns_raw['signup_time']).dt.days
txns_raw['same_day_purchase'] = np.where(txns_raw['time_between'] <=1, 1, 0)

bins = [0, 20, 40, 60, 80, 100, 120]
labels = ['0-20 days','20-40 days','40-60 days','60-80 days', '80-100 days', '100+ days', ]
txns_raw['time_between_binned']=(pd.cut(txns_raw['time_between'], bins=bins, labels=labels)).astype(object)

txns_raw['signup_DOW']=txns_raw['signup_time'].dt.day_name()
txns_raw['purchase_DOW']=txns_raw['purchase_time'].dt.day_name()


## make dummies 
txns_final=make_dummies(data = txns_raw[txns_raw.columns[txns_raw.columns!='device_id'] ])   ## want to omit device id as dummy
txns_final=txns_final.join(txns_raw['device_id'])


## additional features 

#check how for each device id, how many different users had it
txns_final['device_id_count'] = txns_final.groupby('device_id')['device_id'].transform('count')
txns_final['ip_address_count'] = txns_final.groupby('ip_address')['ip_address'].transform('count')

no objects named date; trying time
user_id                    int64
signup_time       datetime64[ns]
purchase_time     datetime64[ns]
purchase_value             int64
device_id                 object
source                    object
browser                   object
sex                       object
age                        int64
ip_address               float64
class                      int64
country                   object
dtype: object
['source', 'browser', 'sex', 'country', 'time_between_binned', 'signup_DOW', 'purchase_DOW']


## Model Building - Random Forest 

In [36]:
train, test = train_test_split(txns_final, test_size=0.2, random_state=42)
print(train.shape)
print(test.shape)

y_actual = 'class'


variables_to_use = [
#  'user_id',
#  'signup_time',
#  'purchase_time',
 'purchase_value',
#  'source',
#  'browser',
#  'sex',
#  'age',
#  'ip_address',
#  'class',
#  'country',
 'time_between',
 'same_day_purchase',
#  'time_between_binned',
#  'signup_DOW',
#  'purchase_DOW',
 'source_Ads',
 'source_Direct',
 'source_SEO',
 'browser_Chrome',
 'browser_FireFox',
 'browser_IE',
 'browser_Opera',
 'browser_Safari',
 'sex_F',
 'sex_M',
 'country_Argentina',
 'country_Australia',
 'country_Austria',
 'country_Belgium',
 'country_Brazil',
 'country_Canada',
 'country_Chile',
 'country_China',
 'country_Colombia',
 'country_Czech Republic',
 'country_Denmark',
 'country_Egypt',
 'country_European Union',
 'country_Finland',
 'country_France',
 'country_Germany',
 'country_Greece',
 'country_Hong Kong',
 'country_Hungary',
 'country_India',
 'country_Indonesia',
 'country_Iran (ISLAMIC Republic Of)',
 'country_Ireland',
 'country_Israel',
 'country_Italy',
 'country_Japan',
 'country_Korea Republic of',
 'country_Malaysia',
 'country_Mexico',
 'country_Netherlands',
 'country_New Zealand',
 'country_Norway',
 'country_Other',
 'country_Poland',
 'country_Portugal',
 'country_Romania',
 'country_Russian Federation',
 'country_Saudi Arabia',
 'country_South Africa',
 'country_Spain',
 'country_Sweden',
 'country_Switzerland',
 'country_Taiwan; Republic of China (ROC)',
 'country_Thailand',
 'country_Turkey',
 'country_Ukraine',
 'country_United Kingdom',
 'country_United States',
 'country_Unknown',
 'country_Venezuela',
 'country_Viet Nam',
 'time_between_binned_0-20 days',
 'time_between_binned_100+ days',
 'time_between_binned_20-40 days',
 'time_between_binned_40-60 days',
 'time_between_binned_60-80 days',
 'time_between_binned_80-100 days',
 'signup_DOW_Friday',
 'signup_DOW_Monday',
 'signup_DOW_Saturday',
 'signup_DOW_Sunday',
 'signup_DOW_Thursday',
 'signup_DOW_Tuesday',
 'signup_DOW_Wednesday',
 'purchase_DOW_Friday',
 'purchase_DOW_Monday',
 'purchase_DOW_Saturday',
 'purchase_DOW_Sunday',
 'purchase_DOW_Thursday',
 'purchase_DOW_Tuesday',
 'purchase_DOW_Wednesday',
#  'device_id',
 'device_id_count',
 'ip_address_count'
]

# X = train[variables_to_use]
# y = train['class']
# z = test['class']

X_train = train[variables_to_use]
y_train = train[y_actual]

X_test = test[variables_to_use]
y_test = test[y_actual]

(120889, 100)
(30223, 100)


In [37]:
## Build model 

n = 6
randoModel = RandomForestClassifier(n_estimators=100, max_features=n, oob_score=True)
randoModel.fit(X_train,y_train)

y_pred = randoModel.predict(X_test)
test['y_hat']= randoModel.predict(X_test)

In [52]:
def random_forest_results(model): 

    print(colored('############  Accuracy Metrics  ############','blue', attrs=['bold']))
    print('')
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print ("Out of Bag Score:", model.oob_score_)



## which features are important 
    print('')
    print(colored('############ Feature Importance  ############','blue', attrs=['bold']))
    feature_imp = pd.Series(model.feature_importances_,
                        index=variables_to_use).sort_values(ascending=False)
    print('')
    print('Feature Importance sorted - top n features: ')
    print(feature_imp[0:n])

 
    

## confusion matrix 
    print('')
    print(colored('############ Confusion Matrix  ############','blue', attrs=['bold']))
    confusion_matrix(test[y_actual], test['y_hat']) 

    df_confusion = pd.crosstab(test[y_actual], test['y_hat'],
                           rownames=['Actual'], colnames=['Predicted'], margins=True)
    print('')
    print('confusion matrix - absolute values')
    print(df_confusion)

    print('')
    print('confusion matrix - % values')
    print(df_confusion / df_confusion.sum(axis=1))       ### represented as a percentage 


## Precision v Recall 
    print('')
    print('############ Precision v Recall ############')
    precision, recall, fscore, support = score(test[y_actual], test['y_hat'],labels=[0,1])


## Precision: % of results that are TP of all the records identified positively. TP/(TP + FP) 
## Recall: % of results that are TP from all records that are TP and which were missed as TP (FN). TP/(TP + FN)
## Accuracy: % of all positives and negatives identified correctly (TP + TN)/total records
## F1-Score: 
## Support: 


    classification_report= pd.DataFrame(columns = ('metric', 'not_converted', 'converted'))

    classification_report.loc[0] = ['precision', precision[0], precision[1]]
    classification_report.loc[1] = ['recall', recall[0], recall[1]]
    classification_report.loc[2] = ['fscore', fscore[0], fscore[1]]
    classification_report.loc[3] = ['support', support[0], support[1]]

    print(classification_report)


In [53]:
random_forest_results(model=randoModel)

[1m[34m############  Accuracy Metrics  ############[0m

Accuracy: 0.9547033716044072
Out of Bag Score: 0.9549504090529328

[1m[34m############ Feature Importance  ############[0m

Feature Importance sorted- top n features: 
time_between                      0.202308
device_id_count                   0.151198
ip_address_count                  0.146892
purchase_value                    0.111399
same_day_purchase                 0.096051
time_between_binned_20-40 days    0.011431
dtype: float64

[1m[34m############ Confusion Matrix  ############[0m

confusion matrix - absolute values
Predicted      0     1    All
Actual                       
0          27310    63  27373
1           1306  1544   2850
All        28616  1607  30223

confusion matrix - % values
Predicted         0         1      All
Actual                                
0          0.498849  0.011053  0.45285
1          0.023856  0.270877  0.04715
All        0.522705  0.281930  0.50000

############ Precision v Rec