# FLAML - Team Goal

This notebook is used for the appliation of ML algorithms to the Principal Components from the Ethereum, Credit Card, and Insurance datasets.

In [12]:
import pandas as pd
import numpy as np
import flaml
from flaml import AutoML

In [13]:
# read in the data for modeling
data = pd.read_pickle('Data/principal_components.pkl')
print(data.shape)
data.head()

(71250, 15)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,value
0,-2.006815,-2.169782,0.1464,0.467598,-0.327913,0.751913,-0.521793,-0.761362,0.78803,0.315258,-0.054779,0.023103,-0.001547,1.587008e-17,-0.028777
1,-1.352242,-1.128665,-1.880299,0.636689,-0.067591,0.997769,-0.122053,-0.728717,0.883459,-0.04829,0.590403,0.036177,-0.002929,-1.502062e-16,-0.035048
2,-1.678475,-2.004593,-0.426661,0.696365,-0.14081,0.824183,-0.329805,-0.819126,0.71446,-0.011513,0.009796,0.033345,-0.001582,-2.3956130000000002e-17,-0.035048
3,-1.505144,-1.625462,-1.122661,0.62657,-0.059864,0.908407,-0.226611,-0.79167,0.777155,-0.030135,-0.182153,0.033123,-0.000876,5.0636980000000003e-17,-0.035048
4,-1.487477,-1.586732,-1.193951,0.619052,-0.051163,0.917079,-0.216019,-0.788997,0.783427,-0.032049,-0.205875,0.033085,-0.000792,5.93106e-17,-0.035048


In [14]:
# get a single column indicating if a transaction is fraud as our target
to_scam = np.array(pd.read_pickle('Data/to_scam.pkl'))
from_scam = np.array(pd.read_pickle('Data/from_scam.pkl'))

temp = to_scam + from_scam
scam =[]
for i in temp:
    if i==0:
        scam.append(0)
    else:
        scam.append(1)
target = pd.DataFrame(scam)
target

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
71245,1
71246,1
71247,1
71248,1


## Ethereum

In [4]:
from flaml import AutoML
automl = AutoML()
automl.fit(np.array(data), np.array(target), task="classification", estimator_list = ['lgbm','xgboost','rf'], metric ='log_loss' , max_iter = 10000, time_budget = None, ensemble= True)

[flaml.automl: 04-05 12:31:32] {2055} INFO - task = classification
[flaml.automl: 04-05 12:31:32] {2057} INFO - Data split method: stratified
[flaml.automl: 04-05 12:31:32] {2061} INFO - Evaluation method: holdout
[flaml.automl: 04-05 12:31:32] {2142} INFO - Minimizing error metric: log_loss
[flaml.automl: 04-05 12:31:32] {2200} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost', 'rf']
[flaml.automl: 04-05 12:31:32] {2453} INFO - iteration 0, current learner lgbm
  import pandas.util.testing as tm
[flaml.automl: 04-05 12:31:33] {2569} INFO - Estimated sufficient time budget=39170s. Estimated necessary time budget=39s.
[flaml.automl: 04-05 12:31:33] {2621} INFO -  at 0.7s,	estimator lgbm's best error=0.3624,	best estimator lgbm's best error=0.3624
[flaml.automl: 04-05 12:31:33] {2453} INFO - iteration 1, current learner lgbm
[flaml.automl: 04-05 12:31:33] {2621} INFO -  at 0.7s,	estimator lgbm's best error=0.3624,	best estimator lgbm's best error=0.3624
[flaml.automl: 04-05 12

In [10]:
# save the best configuration of each type of model
import pickle

eth_xgboost_mod = automl.best_model_for_estimator('xgboost')
eth_lgbm_mod = automl.best_model_for_estimator('lgbm')
eth_rf_mod = automl.best_model_for_estimator('rf')
eth_automl = automl

pickle.dump(eth_xgboost_mod, open('Models/eth_xg.pkl','wb'))
pickle.dump(eth_lgbm_mod, open('Models/eth_lgbm.pkl','wb'))
pickle.dump(eth_rf_mod, open('Models/eth_rf.pkl','wb'))
pickle.dump(eth_automl, open('Models/eth_automl.pkl','wb'))

## Credit Card

In [109]:
# read in the data for modeling and remove unneeded columns
data_credit = pd.read_csv('creditcard.csv')
print(data_credit.shape)
data_credit.pop('Time')
target_class = data_credit.pop('Class')
data_credit.head()

(284807, 31)


Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99


In [14]:
# standardize value column
import sklearn
from sklearn.preprocessing import StandardScaler

transformer = StandardScaler().fit(np.array(data_credit['Amount']).reshape(-1,1))
transformed_value = transformer.transform(np.array(data_credit['Amount']).reshape(-1,1)).reshape(1,284807)[0]
data_credit['Amount']=transformed_value
data_credit.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0.244964
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,-0.342475
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,1.160686
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0.140534
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,-0.073403


In [15]:
automl_credit = AutoML()
automl_credit.fit(np.array(data_credit), np.array(target_class), task="classification", estimator_list = ['lgbm','xgboost','rf'], metric ='log_loss' , max_iter = 10000, time_budget = None, ensemble= True)

[flaml.automl: 04-05 12:41:22] {2055} INFO - task = classification
[flaml.automl: 04-05 12:41:22] {2057} INFO - Data split method: stratified
[flaml.automl: 04-05 12:41:22] {2061} INFO - Evaluation method: holdout
[flaml.automl: 04-05 12:41:22] {2142} INFO - Minimizing error metric: log_loss
[flaml.automl: 04-05 12:41:22] {2200} INFO - List of ML learners in AutoML Run: ['lgbm', 'xgboost', 'rf']
[flaml.automl: 04-05 12:41:22] {2453} INFO - iteration 0, current learner lgbm
[flaml.automl: 04-05 12:41:22] {2569} INFO - Estimated sufficient time budget=8753s. Estimated necessary time budget=9s.
[flaml.automl: 04-05 12:41:22] {2621} INFO -  at 0.4s,	estimator lgbm's best error=0.0076,	best estimator lgbm's best error=0.0076
[flaml.automl: 04-05 12:41:22] {2453} INFO - iteration 1, current learner lgbm
[flaml.automl: 04-05 12:41:22] {2621} INFO -  at 0.5s,	estimator lgbm's best error=0.0076,	best estimator lgbm's best error=0.0076
[flaml.automl: 04-05 12:41:22] {2453} INFO - iteration 2, cu

In [16]:
# save the best configuration of each type of model
credit_xgboost_mod = automl_credit.best_model_for_estimator('xgboost')
credit_lgbm_mod = automl_credit.best_model_for_estimator('lgbm')
credit_rf_mod = automl_credit.best_model_for_estimator('rf')

pickle.dump(credit_xgboost_mod, open('Models/credit_xg.pkl','wb'))
pickle.dump(credit_lgbm_mod, open('Models/credit_lgbm.pkl','wb'))
pickle.dump(credit_rf_mod, open('Models/credit_rf.pkl','wb'))
pickle.dump(automl_credit, open('Models/credit_automl.pkl','wb'))

# Insurance

In [3]:
# read in the data for modeling and remove unneeded columns
data_insurance = pd.read_csv('insurance_claims.csv')
print(data_insurance.shape)
target_class = data_credit.pop('fraud_reported')
data_insurance.head()

(1000, 40)


Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,...,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,fraud_reported,_c39
0,328,48,521585,2014-10-17,OH,250/500,1000,1406.91,0,466132,...,YES,71610,6510,13020,52080,Saab,92x,2004,Y,
1,228,42,342868,2006-06-27,IN,250/500,2000,1197.22,5000000,468176,...,?,5070,780,780,3510,Mercedes,E400,2007,Y,
2,134,29,687698,2000-09-06,OH,100/300,2000,1413.14,5000000,430632,...,NO,34650,7700,3850,23100,Dodge,RAM,2007,N,
3,256,41,227811,1990-05-25,IL,250/500,2000,1415.74,6000000,608117,...,NO,63400,6340,6340,50720,Chevrolet,Tahoe,2014,Y,
4,228,44,367455,2014-06-06,IL,500/1000,1000,1583.91,6000000,610706,...,NO,6500,1300,650,4550,Accura,RSX,2009,N,


# Evaluation
Evaluate the best model configuration for each type of dataset using repeated cross validation. Record the four metrics: F1, Recall, Precision, Roc_auc

In [32]:
import pickle
import pandas as pd
# import best model configurations
eth = pd.read_pickle(r'Models/eth_automl.pkl')
credit = pd.read_pickle(r'Models/credit_automl.pkl')
#insurance = pd.read_pickle(r'Models/insurance_automl.pkl)

## Ethereum

In [97]:
from sklearn.model_selection import cross_validate
# 5-fold cross validation with 25 repeats
cv = RepeatedKFold(n_splits=5, n_repeats=25, random_state=1)

# read in best ethereum model configuration and compute metrics on cross-validation splits/repetitions
eth_mod = pd.read_pickle(r'Models/eth_lgbm.pkl')
eth_scores = cross_validate(eth_mod,data,np.ravel(target),scoring=['f1','recall','precision','roc_auc'],cv=cv)

In [131]:
# save average metrics across all splits and repeats
eth_f1_mean = np.mean(eth_scores['test_f1'])
eth_recall_mean = np.mean(eth_scores['test_recall'])
eth_precision_mean = np.mean(eth_scores['test_precision'])
eth_roc_mean = np.mean(eth_scores['test_roc_auc'])

eth_f1_var = np.var(eth_scores['test_f1'])
eth_recall_var = np.var(eth_scores['test_recall'])
eth_precision_var = np.var(eth_scores['test_precision'])
eth_roc_var = np.var(eth_scores['test_roc_auc'])

eth_f1_max = np.max(eth_scores['test_f1'])
eth_recall_max = np.max(eth_scores['test_recall'])
eth_precision_max = np.max(eth_scores['test_precision'])
eth_roc_max = np.max(eth_scores['test_roc_auc'])

eth_f1_min = np.min(eth_scores['test_f1'])
eth_recall_min = np.min(eth_scores['test_recall'])
eth_precision_min = np.min(eth_scores['test_precision'])
eth_roc_min = np.min(eth_scores['test_roc_auc'])

## Credit Card

In [110]:
# 5-fold cross validation with 25 repeats
cv = RepeatedKFold(n_splits=5, n_repeats=25, random_state=1)

# read in best ethereum model configuration and compute metrics on cross-validation splits/repetitions
credit_mod = pd.read_pickle(r'Models/credit_xg.pkl')
credit_scores = cross_validate(credit_mod,data_credit,np.ravel(target_class),scoring=['f1','recall','precision','roc_auc'],cv=cv)

In [135]:
# save average metrics across all splits and repeats
credit_f1_mean = np.mean(credit_scores['test_f1'])
credit_recall_mean = np.mean(credit_scores['test_recall'])
credit_precision_mean = np.mean(credit_scores['test_precision'])
credit_roc_mean = np.mean(credit_scores['test_roc_auc'])

credit_f1_var = np.var(credit_scores['test_f1'])
credit_recall_var = np.var(credit_scores['test_recall'])
credit_precision_var = np.var(credit_scores['test_precision'])
credit_roc_var = np.var(credit_scores['test_roc_auc'])

credit_f1_max = np.max(credit_scores['test_f1'])
credit_recall_max = np.max(credit_scores['test_recall'])
credit_precision_max = np.max(credit_scores['test_precision'])
credit_roc_max = np.max(credit_scores['test_roc_auc'])

credit_f1_min = np.min(credit_scores['test_f1'])
credit_recall_min = np.min(credit_scores['test_recall'])
credit_precision_min = np.min(credit_scores['test_precision'])
credit_roc_min = np.min(credit_scores['test_roc_auc'])

## Insurance

# Analysis

In [140]:
# get results for ethereum data

eth_results = pd.DataFrame()
eth_results['F1'] = [eth_f1_mean,eth_f1_var,eth_f1_max,eth_f1_min]
eth_results['Recall'] = [eth_recall_mean,eth_recall_var,eth_recall_max,eth_recall_min]
eth_results['Precision'] = [eth_precision_mean,eth_precision_var,eth_precision_max,eth_precision_min]
eth_results['Roc'] = [eth_roc_mean,eth_roc_var,eth_roc_max,eth_roc_min]
eth_results.rename(index={0:'Mean',1:'Variance',2:'Max',3:'Min'})

Unnamed: 0,F1,Recall,Precision,Roc
Mean,0.926644,0.872837,0.987541,0.986233
Variance,9e-06,2.5e-05,5e-06,2e-06
Max,0.935786,0.890797,0.99375,0.989159
Min,0.919792,0.860667,0.980392,0.981142


In [141]:
# get results for credit card data 

credit_results = pd.DataFrame()
credit_results['F1'] = [credit_f1_mean,credit_f1_var,credit_f1_max,credit_f1_min]
credit_results['Recall'] = [credit_recall_mean,credit_recall_var,credit_recall_max,credit_recall_min]
credit_results['Precision'] = [credit_precision_mean,credit_precision_var,credit_precision_max,credit_precision_min]
credit_results['Roc'] = [credit_roc_mean,credit_roc_var,credit_roc_max,credit_roc_min]
credit_results.rename(index={0:'Mean',1:'Variance',2:'Max',3:'Min'})

Unnamed: 0,F1,Recall,Precision,Roc
Mean,0.862059,0.793792,0.944961,0.98026
Variance,0.000671,0.00169,0.000579,6.3e-05
Max,0.93617,0.897959,0.988764,0.998594
Min,0.795455,0.686275,0.880952,0.95489
