## Library

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 500)

## Data

In [0]:
data = pd.read_csv('challenge.csv')

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 29 columns):
x1             10000 non-null object
x2             10000 non-null object
x3             10000 non-null object
x4             10000 non-null object
x5             10000 non-null object
x6             10000 non-null int64
y              10000 non-null float64
x7             10000 non-null int64
x8             10000 non-null object
x9             10000 non-null int64
x10            10000 non-null int64
x11            10000 non-null float64
x12            10000 non-null float64
x13            10000 non-null float64
x14            10000 non-null int64
x15            10000 non-null object
x16            10000 non-null object
x17            10000 non-null object
x18            10000 non-null int64
x19            10000 non-null int64
x20            10000 non-null int64
x21            10000 non-null int64
x22            10000 non-null int64
x23            10000 non-null object
x24       

In [19]:
data.select_dtypes('object').columns # 12 of the features are non numeric

Index(['x2', 'x3', 'x4', 'x5', 'x8', 'x15', 'x16', 'x17', 'x23', 'x24', 'x25',
       'x26'],
      dtype='object')

In [9]:
data.describe()

Unnamed: 0,x6,y,x7,x9,x10,x11,x12,x13,x14,x18,x19,x20,x21,x22,Unnamed: 28
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,1.0
mean,1989.5859,0.427953,28.2124,2.4009,0.7221,0.027339,0.165077,0.055035,74.4865,0.0379,0.0205,0.3508,0.5402,0.0413,3.0
std,22.920972,20.726503,11.398424,2.319335,0.844596,0.034747,0.111654,0.369048,49.218869,0.190964,0.155827,0.477244,0.498406,0.198993,
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
25%,1984.0,0.0,20.0,1.0,0.0,0.0,0.08072,0.0,42.0,0.0,0.0,0.0,0.0,0.0,3.0
50%,1993.0,0.0,25.0,2.0,1.0,0.018692,0.144578,0.0,61.0,0.0,0.0,0.0,1.0,0.0,3.0
75%,1998.0,0.0,34.0,3.0,1.0,0.041096,0.226441,0.0,93.0,0.0,0.0,1.0,1.0,0.0,3.0
max,2006.0,2000.0,100.0,26.0,4.0,1.0,1.0,7.0,976.0,1.0,7.0,1.0,1.0,1.0,3.0


In [10]:
# Pairwise correlation heat-map; `method` may be one of
# {'pearson', 'kendall', 'spearman'}.
# The numeric (and boolean) columns are selected explicitly: the frame also
# holds object columns, and pandas >= 2.0 raises instead of silently skipping
# them when `corr` is called on the whole frame.
(
    data.select_dtypes(include=['number', 'bool'])
    .corr(method='pearson')
    .style.format('{:.2}')
    .background_gradient(cmap='coolwarm')
)

  xa[xa < 0] = -1


Unnamed: 0,x6,y,x7,x9,x10,x11,x12,x13,x14,x18,x19,x20,x21,x22,Unnamed: 28
x6,1.0,-0.84,-0.48,-0.034,-0.013,-0.35,0.0076,0.013,-0.0044,-0.022,-0.36,-0.16,0.028,-0.0082,
y,-0.84,1.0,-0.021,-0.0064,0.013,0.27,-0.014,0.0065,-0.012,0.047,0.43,-0.004,-0.01,0.044,
x7,-0.48,-0.021,1.0,0.087,-0.00083,0.2,0.011,-0.041,0.036,-0.045,-0.063,0.34,-0.038,-0.069,
x9,-0.034,-0.0064,0.087,1.0,0.29,0.088,0.47,0.032,0.46,0.023,0.031,0.081,0.19,-0.011,
x10,-0.013,0.013,-0.00083,0.29,1.0,0.15,-0.065,-0.048,0.35,-0.028,0.0091,0.075,0.21,-0.065,
x11,-0.35,0.27,0.2,0.088,0.15,1.0,-0.11,-0.056,0.18,-0.04,0.091,0.25,0.15,0.017,
x12,0.0076,-0.014,0.011,0.47,-0.065,-0.11,1.0,0.081,0.071,0.048,0.053,0.015,-0.02,-0.035,
x13,0.013,0.0065,-0.041,0.032,-0.048,-0.056,0.081,1.0,0.018,0.75,0.55,-0.0063,-0.047,-0.0032,
x14,-0.0044,-0.012,0.036,0.46,0.35,0.18,0.071,0.018,1.0,0.074,0.066,0.049,0.33,-0.031,
x18,-0.022,0.047,-0.045,0.023,-0.028,-0.04,0.048,0.75,0.074,1.0,0.64,-0.061,-0.021,0.0088,


## Target

In [0]:
# Binarise the target: 1 where the raw `y` column is strictly positive, 0 otherwise.
y = np.where(data['y'] > 0, 1, 0)

In [17]:
(y == 0).sum(), (y == 1).sum() # so imbalanced

(9850, 150)

## Predictors

### x1
UID

In [22]:
data['x1'].describe() # x1 is just uid, can be dropped

count       10000
unique      10000
top       id89789
freq            1
Name: x1, dtype: object

### x2
OS version

In [27]:
data['x2'].describe()

count     10000
unique       18
top       8.0.0
freq       3454
Name: x2, dtype: object

In [28]:
data['x2'].value_counts() # can all possibly be condensed to the major version

8.0.0    3454
7        1729
8.1.0    1474
7.1.1     931
6.0.1     905
9         474
7.1.2     309
5.1.1     295
6         207
4.4.2      61
5.1        55
5          37
5.0.1      28
5.0.2      26
4.4.4      12
4.4.3       1
4.4         1
7.1         1
Name: x2, dtype: int64

### x4
language

In [30]:
data['x4'].value_counts() # can be lowercased then [:2]

US                  5765
GB                   870
CA                   687
FR                   411
SG                   407
SE                   331
FI                   328
AU                   297
NZ                   278
DK                   212
DE                   186
NO                   134
CA_US                 18
MX                     8
MY                     7
VN                     6
NL                     5
DK_US                  2
MX_US                  2
ES                     2
GB_US                  2
CH                     2
NZ_US                  2
TR                     2
TH                     2
AT                     2
IN                     1
EU_GB                  1
CA_GB_US               1
PL                     1
JP                     1
PT                     1
GB_NL_US               1
GR                     1
PE                     1
AT_DE                  1
DZ                     1
AU_SG                  1
LTD._NS-P11A8100       1
NO_SE                  1


### x5

In [32]:
data['x5'].value_counts() # 

0     6285
1     3508
2      206
US       1
Name: x5, dtype: int64

### x6, x7
Year of birth and age as of 2018

In [33]:
data[['x6', 'x7']].head() # can drop x6

Unnamed: 0,x6,x7
0,1997,21
1,1999,19
2,1996,22
3,1976,42
4,1989,29


### x8


In [34]:
data['x8'].value_counts()

0        9801
1         182
2          14
3           2
FALSE       1
Name: x8, dtype: int64

### x9, x10, x11, x12, x13, x14

In [37]:
tmp = ['x9', 'x10', 'x11', 'x12', 'x13', 'x14'] 
data[tmp].head()

Unnamed: 0,x9,x10,x11,x12,x13,x14
0,1,0,0.0,0.036145,0.0,83
1,2,0,0.0,0.529412,0.0,102
2,1,0,0.031915,0.06383,0.0,94
3,2,1,0.0,0.2,0.0,50
4,2,2,0.028571,0.071429,0.0,70


In [38]:
data[tmp].describe() # x14 can be reduced by /1000

Unnamed: 0,x9,x10,x11,x12,x13,x14
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,2.4009,0.7221,0.027339,0.165077,0.055035,74.4865
std,2.319335,0.844596,0.034747,0.111654,0.369048,49.218869
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,0.0,0.0,0.08072,0.0,42.0
50%,2.0,1.0,0.018692,0.144578,0.0,61.0
75%,3.0,1.0,0.041096,0.226441,0.0,93.0
max,26.0,4.0,1.0,1.0,7.0,976.0


### x15
phone

In [40]:
data['x15'].value_counts() # can be lowercased

samsung                   4867
LGE                       1648
HUAWEI                     824
motorola                   604
ZTE                        378
OnePlus                    193
OPPO                       160
Sony                       160
Google                     136
TCL                        126
Xiaomi                      98
asus                        72
AlcatelOneTouch             65
HMD Global                  51
ALCATEL                     50
Alcatel                     48
HTC                         45
Alco                        44
LENOVO                      36
google                      29
Coolpad                     27
Yulong                      23
Vodafone                    22
BLU                         22
ANS                         15
WIKO                        15
Razer                       14
Acer                        13
Quanta                      10
Blackview                   10
                          ... 
blackberry                   1
SKY DEVI

### x16
Some company names

In [43]:
data['x16'].value_counts() # should have been a binary

0                            9706
1                             293
SHENZHEN GIEC DIGITAL CO.       1
Name: x16, dtype: int64

### x17
Some companies

In [44]:
data['x17'].value_counts()

8       4928
7       2969
6       1112
9        474
5        441
4         75
LTD.       1
Name: x17, dtype: int64

### x18, x19, x20, x21, x22


In [45]:
tmp = ['x18', 'x19', 'x20', 'x21', 'x22']
data[tmp].describe()

Unnamed: 0,x18,x19,x20,x21,x22
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,0.0379,0.0205,0.3508,0.5402,0.0413
std,0.190964,0.155827,0.477244,0.498406,0.198993
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,1.0,0.0
75%,0.0,0.0,1.0,1.0,0.0
max,1.0,7.0,1.0,1.0,1.0


### x23

In [47]:
data['x23'].value_counts() # almost binary

TRUE     5952
FALSE    4047
0           1
Name: x23, dtype: int64

### x24
Referral

In [48]:
data['x24'].value_counts()

komistplay-i27ca6ba3e32c55c    2669
komistplay-i277fc4c38ac926c    1248
573880939902959083              440
komistplay-i278aa7320f12d13     342
578274137794105583              268
574253038517356133              260
komistplay-i27bfe4daaba1aa4     206
512399732703117729              163
komistplay-i2748777614c52b2     158
576465400783883159              150
492786776143062094              120
578273416764105287              109
komistplay-i272fc84fda669f3     108
282202902410497492               96
586206336715673895               86
568854315415129371               82
492787488847698857               71
576422175842834761               70
459856103233242576               65
381122606079794609               64
586205984360772448               63
497056817508328233               55
576417976967144256               48
komistplay-i27abe6a50fb8529      44
komistplay-i273dcf86faab892      44
489173088290878094               43
578596448757830182               42
568854641757307119          

### x25
Referral

In [49]:
data['x25'].value_counts()

Facebook                 2930
Bidalgo - Facebook       2669
Bidalgo - Instagram      1248
Fluent                    342
YouTube                   192
Google Adwords            158
Fyber - Android           127
UnityAds - Android        108
Snapchat                   17
Youtube                    15
Bidalgo - Snapchat         15
YeahMobi                   12
AppLovin - Android          9
App Samurai - Android       9
Instagram                   6
S1bVnL4qM                   5
TapJoy - Android            5
rJCLgqHf7                   3
HWy431lw4                   3
U6APtP6dD                   3
HJY9lKXvG                   3
CxssTFyT8                   3
ByjV5rZXX                   2
9bCZTf5KH                   2
PN71L5oPo                   2
Bk-o2n2Ff                   2
XXFwmxgUJ                   2
inVKh7F11                   2
gNwQLYK4P                   2
Cm51jQuEs                   2
                         ... 
rkjvAbX_M                   1
IKBORstWN                   1
TVOSFcsXq 

### x26
Campaign?

In [50]:
data['x26'].value_counts()

Bidalgo Facebook                    2669
Default Facebook                    1257
Play Games. Earn Money.              566
Fortnite                             455
LLC                                  342
Amazon                               290
Play Games Earn Money                269
Gamer Giftcards                      259
Copy Tests                           224
Facebook                             207
Google UAC                           158
Fyber                                127
Unity                                108
Pirate Kings                         101
Yahtzee                               97
Mistplay Android/iOS                  97
Clash of Clans                        80
Starbucks                             54
Lookalike                             43
Campaign 2018-10-05 12:10:35:839      43
Gamer GIftcards                       40
Discover New Games                    36
Snapchat                              32
Clash Royale                          24
Spiele Spielen  

## Training

In [0]:
from sklearn.model_selection import StratifiedKFold, cross_validate, GridSearchCV
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


# helper functions 
def print_metrics(y_true, y_predicted, y_predicted_proba):
    '''
    Print a short evaluation summary for a binary classifier.

    Three things are written to stdout, in this order: the scikit-learn
    classification report, the confusion matrix, and the ROC AUC prefixed
    with the word 'AUC'.

    Args:
    y_true: ground-truth labels
    y_predicted: hard class predictions (used for the report and the matrix)
    y_predicted_proba: scores for the positive class (used for the AUC only)
    '''
    report = classification_report(y_true, y_predicted)
    print(report)

    matrix = confusion_matrix(y_true, y_predicted)
    print(matrix)

    auc = roc_auc_score(y_true, y_predicted_proba)
    print('AUC', auc)
    

def train(preditors, target, model):
    '''
    Fit `model` on the given data and score it on that very same data.

    The metrics printed here are therefore in-sample (no hold-out set is
    involved); they are reported through `print_metrics`.

    Args:
    preditors: feature matrix X
    target: labels y
    model: estimator exposing `fit`, `predict` and `predict_proba`
           (it is fitted in place)

    Returns:
    [int] predicted class for every row of `preditors`
    [float] predicted probability of class 1 for every row of `preditors`
    '''
    model.fit(preditors, target)

    labels = model.predict(preditors)
    # column 1 of predict_proba is the probability of the positive class
    positive_proba = model.predict_proba(preditors)[:, 1]

    print_metrics(target, labels, positive_proba)
    return labels, positive_proba

def train_cv(preditors, target, model, cv=None):
    '''
    Cross-validate `model` and print the metrics of every fold.

    For each (train, test) split produced by `cv` the model is re-fitted on
    the training rows and evaluated on the held-out rows with
    `print_metrics`. After the last fold the per-fold AUCs are printed
    together with their mean and standard deviation.

    Note: `model` is fitted in place, so on return it holds the fit of the
    last fold.

    Args:
    preditors: DataFrame of features (rows are selected with `.iloc`)
    target: array-like of labels, positionally aligned with `preditors`
            (a numpy array, list or pandas Series)
    model: estimator exposing `fit`, `predict` and `predict_proba`
    cv: splitter exposing `split(X, y)`; defaults to a shuffled 5-fold
        StratifiedKFold with random_state=0

    Returns:
    [np.ndarray of float] roc_auc_score on the held-out part of each fold
    '''
    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

    # The splitter yields positional indices. Converting to a plain array
    # keeps `target[idx]` positional even when a pandas Series with an
    # arbitrary index is passed in.
    target = np.asarray(target)

    fold_scores = []
    for train_idx, test_idx in cv.split(preditors, target):
        X_train, y_train = preditors.iloc[train_idx], target[train_idx]
        X_test, y_test = preditors.iloc[test_idx], target[test_idx]

        model.fit(X_train, y_train)

        predicted = model.predict(X_test)
        # column 1 of predict_proba is the probability of the positive class
        predicted_prob = model.predict_proba(X_test)[:, 1]

        fold_scores.append(roc_auc_score(y_test, predicted_prob))
        print_metrics(y_test, predicted, predicted_prob)

    scores = np.array(fold_scores, dtype=float)
    print(scores, '--', scores.mean(), scores.std())

    return scores
    

In [0]:
# Numeric-only design matrix: drop every object (string) column, the raw
# target `y`, `x6` (year of birth, redundant with the age column `x7`) and the
# stray, almost empty CSV column 'Unnamed: 28'.
# `columns=` is used because the positional axis form `drop(labels, 1)` was
# deprecated and is rejected by pandas >= 2.0. `drop` already returns a new
# frame, so `data` itself is left untouched.
X = (
    data
    .drop(columns=data.select_dtypes('object').columns)
    .drop(columns=['y', 'x6', 'Unnamed: 28'])
)

# Rescale the two wide-range features (x7 peaks at 100, x14 at 976) to about [0, 1].
X['x7'] = X['x7'] / 100
X['x14'] = X['x14'] / 1000

### LR

Starting with Logistic Regression, using numeric features only. Then add categorical features one by one.

In [81]:
from sklearn.linear_model import LogisticRegression

# Baseline: class-weighted logistic regression on the numeric features only.
model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);     # in-sample metrics
train_cv(X, y, model);  # metrics for every CV fold

             precision    recall  f1-score   support

          0       0.99      0.64      0.78      9850
          1       0.03      0.65      0.05       150

avg / total       0.98      0.64      0.77     10000

[[6326 3524]
 [  53   97]]
AUC 0.6960940778341793
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1316  654]
 [  11   19]]
AUC 0.6761928934010152
             precision    recall  f1-score   support

          0       0.99      0.65      0.78      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.65      0.77      2000

[[1273  697]
 [  11   19]]
AUC 0.7042131979695432
             precision    recall  f1-score   support

          0       0.99      0.61      0.76      1970
          1       0.02      0.63      0.05        30

avg / total       0.98      0.61      0.75  

#### x2

In [82]:
# One-hot encode the major OS version. Splitting on '.' (instead of taking
# the first character) keeps multi-digit majors such as '10' intact; for the
# versions present in this data set both approaches give the same result.
tmp = pd.get_dummies(data['x2'].str.split('.').str[0], prefix='x2')
# To inspect positives per major version:
# data.groupby(data['x2'].str.split('.').str[0])['y'].agg(['sum', 'size'])

# Only the majors 6-9 are added as features; 4 and 5 are left out.
for col in ('x2_6', 'x2_7', 'x2_8', 'x2_9'):
    X[col] = tmp[col]

model = LogisticRegression(random_state=0, solver='lbfgs', max_iter=5000, class_weight='balanced')

train(X, y, model);
train_cv(X, y, model); # a bit better

             precision    recall  f1-score   support

          0       0.99      0.64      0.78      9850
          1       0.03      0.65      0.05       150

avg / total       0.98      0.64      0.77     10000

[[6326 3524]
 [  53   97]]
AUC 0.6960940778341793
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1316  654]
 [  11   19]]
AUC 0.6761928934010152
             precision    recall  f1-score   support

          0       0.99      0.65      0.78      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.65      0.77      2000

[[1273  697]
 [  11   19]]
AUC 0.7042131979695432
             precision    recall  f1-score   support

          0       0.99      0.61      0.76      1970
          1       0.02      0.63      0.05        30

avg / total       0.98      0.61      0.75  

#### x4


In [103]:
# x4: lower-case the code and keep its first two letters, then one-hot encode
tmp = pd.get_dummies(data['x4'].str.lower().str[:2], prefix='x4')

# add the 10 most frequent levels (ascending sort, read backwards from the end)
for col in tmp.sum().sort_values().iloc[:-11:-1].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       0.99      0.65      0.79      9850
          1       0.03      0.69      0.06       150

avg / total       0.98      0.65      0.78     10000

[[6439 3411]
 [  46  104]]
AUC 0.7195526226734349
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.02      0.53      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1323  647]
 [  14   16]]
AUC 0.6674111675126904
             precision    recall  f1-score   support

          0       0.99      0.65      0.79      1970
          1       0.03      0.60      0.05        30

avg / total       0.98      0.65      0.78      2000

[[1288  682]
 [  12   18]]
AUC 0.678984771573604
             precision    recall  f1-score   support

          0       0.99      0.63      0.77      1970
          1       0.03      0.70      0.05        30

avg / total       0.98      0.63      0.76   

#### x5

In [105]:
# x5: one-hot encode the lower-cased values
tmp = pd.get_dummies(data['x5'].str.lower(), prefix='x5')

# ascending sort by frequency, skipping position 0: the single rarest level
# (the stray 'US' row) is not added
for col in tmp.sum().sort_values().iloc[1:].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       0.99      0.65      0.79      9850
          1       0.03      0.69      0.06       150

avg / total       0.98      0.65      0.78     10000

[[6432 3418]
 [  47  103]]
AUC 0.719805752961083
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.02      0.53      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1328  642]
 [  14   16]]
AUC 0.668984771573604
             precision    recall  f1-score   support

          0       0.99      0.66      0.79      1970
          1       0.02      0.57      0.05        30

avg / total       0.98      0.65      0.78      2000

[[1291  679]
 [  13   17]]
AUC 0.6732656514382402
             precision    recall  f1-score   support

          0       0.99      0.63      0.77      1970
          1       0.03      0.70      0.05        30

avg / total       0.98      0.63      0.76    

#### x8

In [106]:
# x8: one-hot encode the lower-cased values
tmp = pd.get_dummies(data['x8'].str.lower(), prefix='x8')

# ascending sort by frequency, skipping position 0: the single rarest level
# (the stray 'FALSE' row) is not added
for col in tmp.sum().sort_values().iloc[1:].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       0.99      0.65      0.79      9850
          1       0.03      0.69      0.06       150

avg / total       0.98      0.65      0.78     10000

[[6433 3417]
 [  47  103]]
AUC 0.7198693739424704
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.02      0.53      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1326  644]
 [  14   16]]
AUC 0.6684433164128595
             precision    recall  f1-score   support

          0       0.99      0.65      0.79      1970
          1       0.02      0.57      0.05        30

avg / total       0.98      0.65      0.78      2000

[[1290  680]
 [  13   17]]
AUC 0.6728764805414552
             precision    recall  f1-score   support

          0       0.99      0.63      0.77      1970
          1       0.03      0.70      0.05        30

avg / total       0.98      0.63      0.76  

#### x15

In [107]:
# x15 (phone brand): lower-casing merges spellings such as 'Google' / 'google'
tmp = pd.get_dummies(data['x15'].str.lower(), prefix='x15')

# add the 10 most frequent brands (ascending sort, read backwards from the end)
for col in tmp.sum().sort_values().iloc[:-11:-1].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       0.99      0.66      0.79      9850
          1       0.03      0.71      0.06       150

avg / total       0.98      0.66      0.78     10000

[[6463 3387]
 [  44  106]]
AUC 0.7413617597292725
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.03      0.57      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1329  641]
 [  13   17]]
AUC 0.6805076142131979
             precision    recall  f1-score   support

          0       0.99      0.65      0.78      1970
          1       0.03      0.60      0.05        30

avg / total       0.98      0.65      0.77      2000

[[1276  694]
 [  12   18]]
AUC 0.681082910321489
             precision    recall  f1-score   support

          0       0.99      0.64      0.78      1970
          1       0.02      0.60      0.05        30

avg / total       0.98      0.64      0.77   

#### x16

In [108]:
# x16: effectively a 0/1 flag stored as text, plus one stray company-name row
tmp = pd.get_dummies(data['x16'].str.lower(), prefix='x16')

# ascending sort by frequency, skipping position 0: the single rarest level
# (the stray row) is not added
for col in tmp.sum().sort_values().iloc[1:].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

x16_1
x16_0
             precision    recall  f1-score   support

          0       0.99      0.66      0.79      9850
          1       0.03      0.74      0.06       150

avg / total       0.98      0.66      0.78     10000

[[6490 3360]
 [  39  111]]
AUC 0.7457516074450085
             precision    recall  f1-score   support

          0       0.99      0.68      0.80      1970
          1       0.03      0.57      0.05        30

avg / total       0.98      0.68      0.79      2000

[[1334  636]
 [  13   17]]
AUC 0.6893570219966159
             precision    recall  f1-score   support

          0       0.99      0.66      0.79      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.66      0.78      2000

[[1294  676]
 [  11   19]]
AUC 0.6884940778341793
             precision    recall  f1-score   support

          0       0.99      0.64      0.78      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.64

#### x17


In [109]:
# x17: one-hot encode the lower-cased values
tmp = pd.get_dummies(data['x17'].str.lower(), prefix='x17')

# ascending sort by frequency, skipping position 0: the single rarest level
# (the stray 'LTD.' row) is not added
for col in tmp.sum().sort_values().iloc[1:].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       0.99      0.66      0.79      9850
          1       0.03      0.75      0.06       150

avg / total       0.98      0.66      0.78     10000

[[6484 3366]
 [  38  112]]
AUC 0.747306937394247
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.03      0.57      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1317  653]
 [  13   17]]
AUC 0.6898477157360406
             precision    recall  f1-score   support

          0       0.99      0.65      0.79      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.65      0.78      2000

[[1290  680]
 [  11   19]]
AUC 0.6899830795262267
             precision    recall  f1-score   support

          0       0.99      0.64      0.78      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.64      0.76   

#### x23



In [110]:
# x23: TRUE/FALSE stored as text; lower-case before one-hot encoding
tmp = pd.get_dummies(data['x23'].str.lower(), prefix='x23')

# ascending sort by frequency, skipping position 0: the single rarest level
# (the stray '0' row) is not added
for col in tmp.sum().sort_values().iloc[1:].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       0.99      0.66      0.79      9850
          1       0.03      0.74      0.06       150

avg / total       0.98      0.66      0.78     10000

[[6475 3375]
 [  39  111]]
AUC 0.7478693739424703
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.03      0.57      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1326  644]
 [  13   17]]
AUC 0.69331641285956
             precision    recall  f1-score   support

          0       0.99      0.66      0.79      1970
          1       0.03      0.67      0.06        30

avg / total       0.98      0.66      0.78      2000

[[1294  676]
 [  10   20]]
AUC 0.6825888324873096
             precision    recall  f1-score   support

          0       0.99      0.64      0.78      1970
          1       0.02      0.60      0.05        30

avg / total       0.98      0.64      0.76    

#### x24

In [112]:
# x24 (referral id): missing values become '', then only the first two
# characters of the lower-cased id are kept before one-hot encoding
x24_prefix = data['x24'].fillna('').str.lower().str[:2]
tmp = pd.get_dummies(x24_prefix, prefix='x24')

# add the 10 most frequent prefixes (ascending sort, read backwards from the end)
for col in tmp.sum().sort_values().iloc[:-11:-1].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       0.99      0.65      0.79      9850
          1       0.03      0.73      0.06       150

avg / total       0.98      0.65      0.77     10000

[[6400 3450]
 [  40  110]]
AUC 0.7540480541455162
             precision    recall  f1-score   support

          0       0.99      0.66      0.79      1970
          1       0.02      0.57      0.05        30

avg / total       0.98      0.66      0.78      2000

[[1297  673]
 [  13   17]]
AUC 0.6868697123519459
             precision    recall  f1-score   support

          0       0.99      0.66      0.80      1970
          1       0.03      0.67      0.06        30

avg / total       0.98      0.67      0.79      2000

[[1310  660]
 [  10   20]]
AUC 0.6789340101522843
             precision    recall  f1-score   support

          0       0.99      0.63      0.77      1970
          1       0.02      0.60      0.05        30

avg / total       0.98      0.63      0.76  

#### x25

In [113]:
# x25 (referral source): missing values become '', names are lower-cased
# (merging e.g. 'YouTube' / 'Youtube') and then one-hot encoded
tmp = pd.get_dummies(data['x25'].fillna('').str.lower(), prefix='x25')

# add the 10 most frequent sources (ascending sort, read backwards from the end)
for col in tmp.sum().sort_values().iloc[:-11:-1].index:
    X[col] = tmp[col]

model = LogisticRegression(
    class_weight='balanced',
    max_iter=5000,
    random_state=0,
    solver='lbfgs',
)

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       0.99      0.65      0.79      9850
          1       0.03      0.77      0.06       150

avg / total       0.98      0.65      0.78     10000

[[6426 3424]
 [  34  116]]
AUC 0.7608013536379018
             precision    recall  f1-score   support

          0       0.99      0.66      0.79      1970
          1       0.03      0.60      0.05        30

avg / total       0.98      0.66      0.78      2000

[[1296  674]
 [  12   18]]
AUC 0.6969543147208122
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.03      0.63      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1325  645]
 [  11   19]]
AUC 0.6890186125211506
             precision    recall  f1-score   support

          0       0.99      0.64      0.78      1970
          1       0.02      0.53      0.04        30

avg / total       0.97      0.64      0.76  

#### x26

In [114]:
# x26 (campaign name): lower-case, then merge spelling variants before encoding.
tmp = data['x26'].str.lower()
# any name mentioning facebook -> 'facebook'
tmp = tmp.replace(to_replace='.*facebook.*', value='facebook', regex=True)
# 'play games. earn money.' and similar -> one canonical label
tmp = tmp.replace(to_replace='.*play games.*', value='play games earn money', regex=True)
tmp = pd.get_dummies(tmp, prefix='x26')

# Add the 10 most frequent campaigns: one ascending sort, read backwards from
# the end, is sufficient.
for col in tmp.sum().sort_values().iloc[:-11:-1].index:
    X[col] = tmp[col]


model = LogisticRegression(random_state=0, solver='lbfgs', max_iter=5000, class_weight='balanced')

train(X, y, model);
train_cv(X, y, model);

             precision    recall  f1-score   support

          0       1.00      0.65      0.79      9850
          1       0.03      0.80      0.06       150

avg / total       0.98      0.65      0.77     10000

[[6386 3464]
 [  30  120]]
AUC 0.7646869712351945
             precision    recall  f1-score   support

          0       0.99      0.65      0.78      1970
          1       0.03      0.67      0.05        30

avg / total       0.98      0.65      0.77      2000

[[1279  691]
 [  10   20]]
AUC 0.7013536379018612
             precision    recall  f1-score   support

          0       0.99      0.67      0.80      1970
          1       0.03      0.60      0.05        30

avg / total       0.98      0.67      0.79      2000

[[1316  654]
 [  12   18]]
AUC 0.682994923857868
             precision    recall  f1-score   support

          0       0.99      0.64      0.77      1970
          1       0.02      0.53      0.04        30

avg / total       0.97      0.63      0.76   

#### Corr
Individually, each var is not much correlated with y

However, a few vars are highly correlated with each other, such as:
- x5_1 and x20
- x4_fr and x22, maybe x22 is "speaking French?"

In [122]:
# Correlation of the binary target with every engineered feature, shown as a heat-map.
# `axis=1` is passed by keyword: the positional form `pd.concat(objs, 1)` was
# deprecated and is rejected by pandas >= 2.0.
(
    pd.concat([pd.DataFrame({'y': y}), X], axis=1)
    .corr(method='pearson')
    .style.format('{:.2}')
    .background_gradient(cmap='coolwarm')
)

Unnamed: 0,y,x7,x9,x10,x11,x12,x13,x14,x18,x19,x20,x21,x22,x2_6,x2_7,x2_8,x2_9,x4_fr,x4_dk,x4_us,x4_gb,x4_ca,x4_sg,x4_se,x4_fi,x4_au,x4_nz,x5_2,x5_1,x5_0,x8_3,x8_2,x8_1,x8_0,x15_samsung,x15_lge,x15_huawei,x15_motorola,x15_zte,x15_oneplus,x15_google,x15_sony,x15_oppo,x15_tcl,x16_1,x16_0,x17_4,x17_5,x17_9,x17_6,x17_7,x17_8,x23_false,x23_true,x24_ko,x24_,x24_57,x24_58,x24_56,x24_49,x24_51,x24_28,x24_45,x24_59,x25_facebook,x25_bidalgo - facebook,x25_,x25_bidalgo - instagram,x25_fluent,x25_youtube,x25_google adwords,x25_fyber - android,x25_unityads - android,x25_snapchat,x26_facebook,x26_play games earn money,x26_fortnite,x26_llc,x26_gamer giftcards,x26_amazon,x26_copy tests,x26_google uac,x26_fyber,x26_unity
y,1.0,0.026,0.058,0.034,0.061,0.0076,-0.011,0.05,-0.0073,0.031,0.039,0.031,-0.017,0.0035,-0.026,0.028,0.0073,-0.021,0.0043,-0.011,-0.012,0.027,0.0037,0.00016,0.023,-0.012,-0.001,-0.0063,0.039,-0.038,-0.0017,-0.0046,-0.0045,-8.8e-05,0.0066,-0.0083,0.0078,0.0066,-0.0073,0.00063,-0.016,0.017,-0.016,-0.014,0.0078,-0.013,-0.0012,-0.022,0.0073,0.0035,-0.028,0.028,-0.023,0.021,0.015,0.001,-0.012,-0.0032,0.0082,-0.0048,-0.016,-0.012,-0.0023,0.0072,-0.013,0.022,0.001,0.0032,-0.0051,-0.018,-0.0024,-0.014,-0.0049,0.015,0.023,-0.0045,-0.019,-0.0051,-0.017,0.0081,-0.0025,-0.0024,-0.014,-0.0049
x7,0.026,1.0,0.087,-0.00083,0.2,0.011,-0.041,0.036,-0.045,-0.063,0.34,-0.038,-0.069,-0.0085,0.045,-0.036,-0.0037,-0.068,-0.041,0.17,-0.032,-0.0072,-0.069,-0.048,-0.095,-0.03,-0.017,-0.023,0.34,-0.32,-0.0058,-0.024,-0.053,0.06,-0.04,0.079,-0.072,0.041,0.047,-0.055,-0.0009,-0.0085,-0.028,0.029,0.016,-0.015,0.024,-0.0058,-0.0037,-0.0085,0.046,-0.036,0.008,-0.0075,0.16,-0.04,-0.13,-0.078,0.089,0.0097,-0.077,-0.056,-0.02,-0.0038,-0.099,0.31,-0.04,-0.19,0.037,-0.089,-0.062,-0.0038,0.041,-0.026,0.15,-0.084,-0.12,0.037,-0.095,0.091,-0.09,-0.062,-0.0038,0.041
x9,0.058,0.087,1.0,0.29,0.088,0.47,0.032,0.46,0.023,0.031,0.081,0.19,-0.011,-0.1,-0.098,0.17,0.056,-0.011,0.00042,0.012,-0.0062,0.025,-0.04,-0.00017,0.01,0.0083,-0.041,-0.014,0.081,-0.076,-0.0055,-0.0076,0.045,-0.04,0.098,-0.045,-0.0023,-0.032,-0.061,0.046,0.019,-0.008,-0.023,-0.034,-0.022,0.022,-0.021,-0.08,0.056,-0.1,-0.098,0.17,-0.16,0.16,-0.005,-0.017,-0.034,-0.021,0.043,0.033,0.014,-0.021,-0.0041,0.055,0.026,0.024,-0.017,-0.052,-0.0071,0.0097,0.018,-0.052,-0.0097,-0.013,0.0067,0.008,-0.081,-0.0071,-0.034,0.041,-0.045,0.018,-0.052,-0.0097
x10,0.034,-0.00083,0.29,1.0,0.15,-0.065,-0.048,0.35,-0.028,0.0091,0.075,0.21,-0.065,-0.11,-0.096,0.16,0.11,-0.066,0.0025,0.15,-0.048,-0.033,-0.081,-0.021,-0.017,-0.0095,-0.036,-0.013,0.075,-0.07,-0.0037,-0.0035,0.0041,-0.0036,0.12,-0.0071,-0.037,-0.029,-0.046,0.058,0.03,-0.042,-0.024,-0.037,-0.069,0.068,-0.051,-0.11,0.11,-0.11,-0.096,0.16,-0.2,0.2,0.083,-0.065,-0.042,-0.013,0.042,-0.0031,0.015,-0.02,-0.006,-0.026,-0.019,0.05,-0.065,-0.0022,0.13,-0.027,0.0037,-0.005,-0.024,-0.015,0.041,-0.042,-0.029,0.13,-0.011,0.037,0.0031,0.0037,-0.005,-0.024
x11,0.061,0.2,0.088,0.15,1.0,-0.11,-0.056,0.18,-0.04,0.091,0.25,0.15,0.017,-0.088,-0.05,0.11,0.089,0.0033,0.014,0.01,0.035,0.0017,-0.015,-0.019,-0.037,-0.033,-0.033,-0.047,0.25,-0.24,-0.0039,-0.0098,-0.038,0.019,-0.014,0.041,0.069,-0.00052,-0.042,-0.034,0.062,0.0089,-0.054,-0.045,-0.031,0.014,-0.04,-0.092,0.089,-0.088,-0.057,0.11,-0.11,0.1,0.16,-0.075,-0.067,-0.072,0.053,-0.043,-0.048,-0.054,0.011,-0.029,-0.07,0.17,-0.075,-0.013,0.074,-0.076,-0.059,-0.024,0.005,0.0022,0.15,-0.044,-0.082,0.074,-0.053,0.052,-0.076,-0.059,-0.024,0.005
x12,0.0076,0.011,0.47,-0.065,-0.11,1.0,0.081,0.071,0.048,0.053,0.015,-0.02,-0.035,-0.0095,0.042,-0.011,-0.022,-0.035,0.022,-0.0089,0.031,0.032,-0.071,0.0062,0.017,-0.0083,0.0019,0.016,0.015,-0.02,-0.011,0.0079,0.068,-0.065,-0.058,0.079,0.11,0.034,-0.069,-0.031,0.052,-0.0095,-0.1,-0.049,0.012,-0.011,0.022,-0.039,-0.022,-0.0095,0.042,-0.011,0.0016,-0.0013,-0.13,0.049,0.024,0.0042,0.031,0.051,0.034,0.041,0.014,0.033,0.079,-0.043,0.049,-0.1,-0.081,0.062,0.053,-0.052,0.00041,-0.011,-0.096,0.066,-0.018,-0.081,-0.027,0.028,-0.023,0.053,-0.052,0.00041
x13,-0.011,-0.041,0.032,-0.048,-0.056,0.081,1.0,0.018,0.75,0.55,-0.0063,-0.047,-0.0032,0.039,-0.012,-0.046,-0.0014,-0.0036,0.019,0.0031,-0.024,-0.0041,0.0035,0.012,0.0091,-0.01,-0.011,0.011,-0.0063,0.0029,0.055,0.19,0.57,-0.61,0.0082,-0.028,-0.0054,-0.019,-0.0084,0.036,0.015,-0.0082,-0.0017,-0.0071,0.079,-0.079,0.037,0.064,-0.0014,0.039,-0.012,-0.046,0.023,-0.023,-0.063,0.087,-0.018,0.015,-0.022,-0.0041,-0.00029,-0.0036,-0.011,0.1,-0.039,-0.039,0.087,-0.02,-0.025,-0.0065,0.00066,2.6e-05,-0.0051,-0.0062,-0.052,-0.0058,-0.021,-0.025,-0.0071,-0.023,-0.014,0.00066,2.6e-05,-0.0051
x14,0.05,0.036,0.46,0.35,0.18,0.071,0.018,1.0,0.074,0.066,0.049,0.33,-0.031,-0.14,-0.2,0.26,0.14,-0.03,0.051,-0.11,0.017,-0.017,0.15,0.041,0.0064,0.08,-0.033,-0.0073,0.049,-0.046,0.021,0.031,0.074,-0.08,0.18,-0.19,-0.079,-0.1,-0.066,0.2,-0.012,-0.018,0.26,-0.024,-0.045,0.046,-0.051,-0.11,0.14,-0.14,-0.2,0.26,-0.26,0.26,0.076,-0.07,0.02,-0.061,0.017,0.0016,-0.019,-0.036,0.00049,-0.0089,0.013,0.041,-0.07,0.032,0.043,-0.05,-0.013,-0.031,-0.0053,-0.013,0.071,0.062,-0.087,0.043,-0.0095,0.019,-0.063,-0.013,-0.031,-0.0053
x18,-0.0073,-0.045,0.023,-0.028,-0.04,0.048,0.75,0.074,1.0,0.64,-0.061,-0.021,0.0088,-0.01,-0.0018,-0.027,0.015,0.0063,0.039,-0.032,-0.021,-0.014,0.012,0.031,0.031,-0.0042,-0.0051,0.023,-0.061,0.053,0.071,0.19,0.65,-0.69,-0.0047,-0.039,0.013,-0.024,-0.0038,0.067,0.011,-0.0086,-0.0044,-0.0083,0.071,-0.074,0.074,0.039,0.015,-0.01,-0.0029,-0.027,0.0081,-0.0092,-0.045,0.062,-0.0044,-0.0084,-0.028,0.0035,0.007,-0.0034,-0.013,0.046,-0.024,-0.03,0.062,-0.01,-0.032,-0.0031,0.013,0.0055,-0.00047,-0.0082,-0.036,0.012,-0.026,-0.032,0.0021,-0.028,-0.013,0.013,0.0055,-0.00047
x19,0.031,-0.063,0.031,0.0091,0.091,0.053,0.55,0.066,0.64,1.0,-0.055,0.013,0.024,-0.012,-0.0041,0.0064,0.016,0.0018,0.047,-0.031,-0.023,-0.011,0.0086,0.026,0.044,-0.016,-0.0029,0.031,-0.055,0.036,0.089,0.24,0.86,-0.92,-0.0074,-0.014,0.012,-0.0065,-0.019,0.08,0.0081,-0.0065,-0.017,-0.0091,0.015,-0.042,-0.011,-0.00013,0.016,-0.012,-0.014,0.0064,-0.014,0.0052,-0.027,0.018,0.0019,0.0023,-0.015,-0.0015,0.018,0.00021,-0.012,0.031,-0.0086,-0.024,0.018,0.00081,-0.025,0.0079,0.0091,0.0023,0.011,-0.0054,-0.02,0.023,-0.019,-0.025,-0.00049,-0.015,-0.0073,0.0091,0.0023,0.011


In [123]:
# Experiment: fit the logistic regression on the one-hot encoded (categorical)
# columns only, i.e. the columns whose name contains '_'.
# Result: not as good as using every feature.

# tmp = X[X.columns[~X.columns.str.contains('_')]]  # alternative: only columns without '_'
tmp = X[X.columns[X.columns.str.contains('_')]]
# tmp = X  # alternative: all features

# class_weight='balanced' because the positive class is rare (150 of 10000 rows in the report below)
model = LogisticRegression(random_state=0, solver='lbfgs', max_iter=5000, class_weight='balanced')

print(tmp.shape)
train(tmp, y, model)  # report on the full data set (support sums to 10000)
print('===')
train_cv(tmp, y, model)  # one report per CV fold; last expression, so the returned AUC array is displayed

(10000, 71)
             precision    recall  f1-score   support

          0       0.99      0.57      0.73      9850
          1       0.03      0.76      0.05       150

avg / total       0.98      0.57      0.72     10000

[[5624 4226]
 [  36  114]]
AUC 0.7274744500846024
===
             precision    recall  f1-score   support

          0       0.99      0.56      0.71      1970
          1       0.02      0.67      0.04        30

avg / total       0.98      0.56      0.70      2000

[[1100  870]
 [  10   20]]
AUC 0.6654483925549914
             precision    recall  f1-score   support

          0       0.99      0.65      0.79      1970
          1       0.02      0.50      0.04        30

avg / total       0.97      0.65      0.78      2000

[[1286  684]
 [  15   15]]
AUC 0.6313451776649746
             precision    recall  f1-score   support

          0       0.99      0.58      0.73      1970
          1       0.02      0.53      0.04        30

avg / total       0.97      

array([0.66544839, 0.63134518, 0.63622673, 0.50845178, 0.64807953])

## GaussianNB

In [125]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes on the full feature matrix, default settings.
model = GaussianNB()

train(X, y, model)  # report on the full data set
print('===')
train_cv(X, y, model);  # one report per CV fold; ';' hides the returned score array

# better recall for class 1, but lower recall for class 0
# (the reports below show class-0 recall of roughly 0.11-0.17, i.e. almost everything is flagged positive)

             precision    recall  f1-score   support

          0       1.00      0.16      0.27      9850
          1       0.02      0.99      0.03       150

avg / total       0.98      0.17      0.27     10000

[[1551 8299]
 [   1  149]]
AUC 0.5996148900169205
===
             precision    recall  f1-score   support

          0       1.00      0.15      0.26      1970
          1       0.02      0.97      0.03        30

avg / total       0.98      0.16      0.26      2000

[[ 295 1675]
 [   1   29]]
AUC 0.5925042301184433
             precision    recall  f1-score   support

          0       1.00      0.15      0.26      1970
          1       0.02      1.00      0.03        30

avg / total       0.99      0.16      0.26      2000

[[ 293 1677]
 [   0   30]]
AUC 0.5964636209813874
             precision    recall  f1-score   support

          0       1.00      0.11      0.20      1970
          1       0.02      0.97      0.03        30

avg / total       0.98      0.12      0.

## Random Forest

In [126]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with balanced class weights.
# The full-data report below is perfect (AUC 1.0) while every CV fold predicts
# class 0 only, so the model memorises the training rows.
model = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')

train(X, y, model)  # report on the full data set
print('===')
train_cv(X, y, model); # very bad CV


             precision    recall  f1-score   support

          0       1.00      1.00      1.00      9850
          1       1.00      1.00      1.00       150

avg / total       1.00      1.00      1.00     10000

[[9850    0]
 [   0  150]]
AUC 1.0
===


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1970    0]
 [  30    0]]
AUC 0.5770473773265652


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1970    0]
 [  30    0]]
AUC 0.5553722504230119


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1970    0]
 [  30    0]]
AUC 0.5590609137055838


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1970    0]
 [  30    0]]
AUC 0.5390016920473774
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1970    0]
 [  30    0]]
AUC 0.5473604060913706
[0.57704738 0.55537225 0.55906091 0.53900169 0.54736041] -- 0.5555685279187818 0.012770352931263225


  'precision', 'predicted', average, warn_for)


## KNN

Results were poor, so this cell is not run.



In [0]:
# from sklearn.neighbors import KNeighborsClassifier

# model = KNeighborsClassifier(20, weights='distance')

# train(X, y, model)
# print('===')
# train_cv(X, y, model);

## MLPClassifier
Took too long and the results were poor, so this cell is not run.

In [0]:
# from sklearn.neural_network import MLPClassifier

# model = MLPClassifier(random_state=0)

# train(X, y, model)
# print('===')
# train_cv(X, y, model);

## GaussianProcessClassifier
Took too long, not run

In [0]:
# from sklearn.gaussian_process import GaussianProcessClassifier
# from sklearn.gaussian_process.kernels import RBF

# model = GaussianProcessClassifier(kernel=1.0 * RBF(1.0), random_state=0)

# train(X, y, model)
# print('===')
# train_cv(X, y, model)

## AdaBoost

In [127]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost with default settings (no class weighting is passed here).
# It finds almost no positives: 1 of 150 on the full data, none in any CV fold.
model = AdaBoostClassifier(random_state=0)

train(X, y, model)  # report on the full data set
print('===')
train_cv(X, y, model); # bad cv, overfit

             precision    recall  f1-score   support

          0       0.99      1.00      0.99      9850
          1       0.33      0.01      0.01       150

avg / total       0.98      0.98      0.98     10000

[[9848    2]
 [ 149    1]]
AUC 0.8219424703891709
===


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1970    0]
 [  30    0]]
AUC 0.6685109983079526
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1968    2]
 [  30    0]]
AUC 0.5818104906937395


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1970    0]
 [  30    0]]
AUC 0.6852115059221658


  'precision', 'predicted', average, warn_for)


             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1970    0]
 [  30    0]]
AUC 0.5714551607445009
             precision    recall  f1-score   support

          0       0.98      1.00      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.98      0.98      2000

[[1969    1]
 [  30    0]]
AUC 0.7134517766497461
[0.668511   0.58181049 0.68521151 0.57145516 0.71345178] -- 0.6440879864636211 0.057013992318556604


array([0.668511  , 0.58181049, 0.68521151, 0.57145516, 0.71345178])

## QuadraticDiscriminantAnalysis

In [128]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis with default settings.
model = QuadraticDiscriminantAnalysis()

train(X, y, model)  # report on the full data set
print('===')
train_cv(X, y, model); # variables detected collinear, bad CV



             precision    recall  f1-score   support

          0       1.00      0.17      0.29      9850
          1       0.02      0.99      0.04       150

avg / total       0.98      0.18      0.29     10000

[[1680 8170]
 [   1  149]]
AUC 0.5883620981387478
===
             precision    recall  f1-score   support

          0       0.99      0.17      0.29      1970
          1       0.02      0.93      0.03        30

avg / total       0.98      0.18      0.28      2000

[[ 332 1638]
 [   2   28]]
AUC 0.5279526226734349
             precision    recall  f1-score   support

          0       1.00      0.14      0.24      1970
          1       0.02      1.00      0.03        30

avg / total       0.99      0.15      0.24      2000

[[ 274 1696]
 [   0   30]]
AUC 0.5730964467005076
             precision    recall  f1-score   support

          0       0.99      0.25      0.39      1970
          1       0.02      0.87      0.03        30

avg / total       0.98      0.26      0.

array([0.52795262, 0.57309645, 0.56284264, 0.46150592, 0.55888325])


## SVC

Very slow.

LinearSVC also doesn't support `predict_proba`.

In [0]:
# from sklearn.svm import SVC

# model = SVC(gamma='scale', class_weight='balanced', probability=True, random_state=0)

# train(X, y, model)
# print('===')
# train_cv(X, y, model)

In [0]:
# from sklearn.svm import LinearSVC

# model = LinearSVC(max_iter=2000, class_weight='balanced', random_state=0)

# model.fit(X, y)
# train(X, y, model)
# print('===')
# train_cv(X, y, model)

## SGDClassifier

Also bad

In [0]:
# from sklearn.linear_model import SGDClassifier

# model = SGDClassifier(
#     loss='log',
#     max_iter=1000,
#     tol=1e-3,
#     class_weight='balanced',
#     random_state=0,
# )

# train(X, y, model)
# print('===')
# train_cv(X, y, model)

## LGBM

In [129]:
from lightgbm import LGBMClassifier

# LightGBM with balanced class weights, all other settings left at their defaults.
# Near-perfect on the full data (AUC ~1.0) but mean CV AUC is only ~0.59 below.
model = LGBMClassifier(class_weight='balanced')

train(X, y, model)  # report on the full data set
print('===')
train_cv(X, y, model);  # one report per CV fold; ';' hides the cell's return value

  if diff:


             precision    recall  f1-score   support

          0       1.00      0.99      0.99      9850
          1       0.51      1.00      0.68       150

avg / total       0.99      0.99      0.99     10000

[[9706  144]
 [   0  150]]
AUC 0.9999181049069374
===


  if diff:


             precision    recall  f1-score   support

          0       0.98      0.99      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.97      0.97      2000

[[1948   22]
 [  30    0]]
AUC 0.5473434856175974


  if diff:


             precision    recall  f1-score   support

          0       0.98      0.98      0.98      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.97      0.97      2000

[[1940   30]
 [  30    0]]
AUC 0.6092216582064298


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.98      0.98      1970
          1       0.05      0.07      0.06        30

avg / total       0.97      0.97      0.97      2000

[[1935   35]
 [  28    2]]
AUC 0.603824027072758


  if diff:


             precision    recall  f1-score   support

          0       0.98      0.98      0.98      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.97      0.97      2000

[[1933   37]
 [  30    0]]
AUC 0.5810659898477157
             precision    recall  f1-score   support

          0       0.98      0.99      0.99      1970
          1       0.00      0.00      0.00        30

avg / total       0.97      0.97      0.97      2000

[[1942   28]
 [  30    0]]
AUC 0.5961590524534687
[0.54734349 0.60922166 0.60382403 0.58106599 0.59615905] -- 0.587522842639594 0.022213612986427503


  if diff:


array([0.54734349, 0.60922166, 0.60382403, 0.58106599, 0.59615905])

## XGBoost

In [130]:
from xgboost import XGBClassifier

# Ratio of negative to positive rows; passed to XGBoost as scale_pos_weight
# to compensate for the class imbalance. Reused by the hyperopt cells below.
scale_pos_weight = np.sum(y == 0) / np.sum(y == 1)

model = XGBClassifier(n_jobs=-1, random_state=0, scale_pos_weight=scale_pos_weight)

train(X, y, model)  # report on the full data set
print('===')
train_cv(X, y, model);  # one report per CV fold; ';' hides the cell's return value

  if diff:


             precision    recall  f1-score   support

          0       1.00      0.77      0.87      9850
          1       0.06      0.91      0.11       150

avg / total       0.98      0.77      0.86     10000

[[7560 2290]
 [  13  137]]
AUC 0.9240060913705584
===


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.81      0.89      1970
          1       0.02      0.30      0.04        30

avg / total       0.97      0.81      0.88      2000

[[1601  369]
 [  21    9]]
AUC 0.6452707275803723


  if diff:


             precision    recall  f1-score   support

          0       0.98      0.80      0.88      1970
          1       0.01      0.17      0.02        30

avg / total       0.97      0.79      0.87      2000

[[1569  401]
 [  25    5]]
AUC 0.5925549915397631


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.77      0.86      1970
          1       0.02      0.33      0.04        30

avg / total       0.97      0.76      0.85      2000

[[1510  460]
 [  20   10]]
AUC 0.6537140439932319


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.75      0.85      1970
          1       0.02      0.27      0.03        30

avg / total       0.97      0.74      0.84      2000

[[1480  490]
 [  22    8]]
AUC 0.5695262267343485
             precision    recall  f1-score   support

          0       0.99      0.81      0.89      1970
          1       0.03      0.37      0.05        30

avg / total       0.97      0.80      0.88      2000

[[1588  382]
 [  19   11]]
AUC 0.6518443316412859
[0.64527073 0.59255499 0.65371404 0.56952623 0.65184433] -- 0.6225820642978004 0.034804605699143204


  if diff:


array([0.64527073, 0.59255499, 0.65371404, 0.56952623, 0.65184433])

### Hyperopt

Hyperparameter tuning with Bayesian optimisation

In [0]:
from hyperopt import STATUS_OK
from hyperopt import fmin
from hyperopt import hp
from hyperopt.pyll.stochastic import sample
from hyperopt import tpe
from hyperopt import Trials

from timeit import default_timer as timer

import xgboost

xgtrain = xgboost.DMatrix(X.values, label=y)

def objective(params, folds = cv):
    """Hyperopt objective: cross-validated AUC of an XGBoost booster.

    Args:
        params: parameter dict sampled from the search space. It is modified
            in place: 'max_depth' and 'min_child_weight' are cast to int, and
            'scale_pos_weight' and 'verbosity' are added.
        folds: fold definition handed to ``xgboost.cv`` as ``folds``
            (defaults to the notebook-level ``cv``).

    Returns:
        dict with the loss to minimise (1 - best mean test AUC), the final
        params, the evaluation counter, the best number of boosting rounds,
        the wall-clock run time and the hyperopt status flag.

    Relies on the notebook globals ``ITERATION``, ``xgtrain`` and
    ``scale_pos_weight``.
    """
    # Count evaluations across calls
    global ITERATION
    ITERATION += 1

    started = timer()

    # XGBoost needs these two as integers; the space samples them with hp.quniform
    params.update({name: int(params[name]) for name in ('max_depth', 'min_child_weight')})
    # Fixed settings that are not part of the search
    params.update(scale_pos_weight=scale_pos_weight, verbosity=0)

    # Cross-validate with early stopping on the mean test AUC
    cv_results = xgboost.cv(
        params,
        xgtrain,
        num_boost_round=10000,
        folds=folds,
        metrics='auc',
        early_stopping_rounds=50,
        seed=0,
    )

    run_time = timer() - started

    # Mean test AUC per boosting round
    auc_by_round = cv_results['test-auc-mean']
    best_score = np.max(auc_by_round)

    # Number of boosting rounds that reached the best score
    n_estimators = int(np.argmax(auc_by_round) + 1)

    # hyperopt minimises, so report 1 - AUC as the loss
    return {
        'loss': 1 - best_score,
        'params': params,
        'iteration': ITERATION,
        'estimators': n_estimators,
        'train_time': run_time,
        'status': STATUS_OK,
    }


In [0]:
# Trials object handed to fmin (trials=bayes_trials) to keep track of the
# result of every evaluation; the best one can be read back from it afterwards.
bayes_trials = Trials()

In [0]:
# Evaluation counter incremented by objective(); reset before each search.
ITERATION = 0

# Define the search space.
# Every hp label is identical to its dict key: fmin returns `best` keyed by
# label, so the labels must be valid XGBoost parameter names.
space = {
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.3)),
    'gamma': hp.uniform('gamma', 0, 1),
    # quniform samples floats; objective() casts these two to int
    'max_depth': hp.quniform('max_depth', 3, 8, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 12, 2),
    'subsample': hp.uniform('subsample', 0.6, 1.0),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.6, 1.0),
    'reg_alpha': hp.uniform('reg_alpha', 0.1, 10),
    'reg_lambda': hp.uniform('reg_lambda', 0.1, 10),
}

# Run optimization (long-running: up to 10000 evaluations, each one a full
# cross-validated XGBoost fit with early stopping)
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=10000,
    trials=bayes_trials,
    rstate=np.random.RandomState(0)
);

In [0]:
# take a long time to train, so save the best param for next use
# import pickle

# with open('trials.pkl', 'wb') as handle:
#     pickle.dump(bayes_trials, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
# # with open('trials.pickle', 'rb') as handle: bayes_trials = pickle.load(handle)

# bayes_trials.best_trial['result']


In [134]:
# Best parameters found by the hyperopt search, written out here so the long
# search does not have to be re-run.
params = {
    'colsample_bytree': 0.9492405652889128,
    'gamma': 0.2174707094781907,
    'learning_rate': 0.2087465061696673,
    'max_depth': 4,
    'min_child_weight': 4,
    'reg_alpha': 0.4475446011595685,
    'reg_lambda': 0.16373566452669497,
    'subsample': 0.781163089515965,
    'verbosity': 0,
    # Single entry, computed from the labels. The dict previously also carried
    # a hard-coded 65.67 under the same key, which was silently overwritten.
    'scale_pos_weight': scale_pos_weight,
}
model = XGBClassifier(n_jobs=-1, random_state=0, **params)


train(X, y, model)  # report on the full data set
print('===')
train_cv(X, y, model); # mmm, not that "best", still low recall

  if diff:


             precision    recall  f1-score   support

          0       1.00      0.94      0.97      9850
          1       0.21      1.00      0.34       150

avg / total       0.99      0.94      0.96     10000

[[9277  573]
 [   0  150]]
AUC 0.9958971235194585
===


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.96      0.97      1970
          1       0.02      0.07      0.03        30

avg / total       0.97      0.94      0.96      2000

[[1883   87]
 [  28    2]]
AUC 0.5573096446700508


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.95      0.97      1970
          1       0.03      0.10      0.04        30

avg / total       0.97      0.93      0.95      2000

[[1866  104]
 [  27    3]]
AUC 0.5272081218274112


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.94      0.96      1970
          1       0.02      0.07      0.03        30

avg / total       0.97      0.93      0.95      2000

[[1861  109]
 [  28    2]]
AUC 0.6412013536379019


  if diff:


             precision    recall  f1-score   support

          0       0.98      0.95      0.97      1970
          1       0.01      0.03      0.02        30

avg / total       0.97      0.93      0.95      2000

[[1868  102]
 [  29    1]]
AUC 0.5733671742808798
             precision    recall  f1-score   support

          0       0.98      0.95      0.97      1970
          1       0.01      0.03      0.02        30

avg / total       0.97      0.94      0.95      2000

[[1872   98]
 [  29    1]]
AUC 0.5651776649746193
[0.55730964 0.52720812 0.64120135 0.57336717 0.56517766] -- 0.5728527918781725 0.03756520567238148


  if diff:


array([0.55730964, 0.52720812, 0.64120135, 0.57336717, 0.56517766])

## Ensemble


In [137]:
# LinearSVC is otherwise only imported in a commented-out cell of the SVC
# section, so import it here to keep this cell runnable on a fresh kernel.
from sklearn.svm import LinearSVC

# Candidate members for a hand-rolled ensemble
model1 = LogisticRegression(random_state=0, solver='lbfgs', max_iter=5000, class_weight='balanced')
model2 = XGBClassifier(n_jobs=-1, random_state=0, **params)
model3 = GaussianNB()
model4 = LinearSVC(max_iter=10000, class_weight='balanced', random_state=0, tol=0.0005)


# Fit every member on the full data set
model1.fit(X, y)
model2.fit(X, y)
model3.fit(X, y)
model4.fit(X, y)

# preN: hard class predictions, prebN: probability of class 1 (both in-sample)
pre1 = model1.predict(X)
preb1 = model1.predict_proba(X)[:,1]
pre2 = model2.predict(X)
preb2 = model2.predict_proba(X)[:,1]
pre3 = model3.predict(X)
preb3 = model3.predict_proba(X)[:,1]
# LinearSVC has no predict_proba, so only hard predictions are available
pre4 = model4.predict(X)


  if diff:


In [138]:
# pre = np.where(pre1 + pre2 + pre3 + pre4 > 1, 1, 0)
# pre1.sum(), pre2.sum(), pre3.sum(), pre4.sum(), pre.sum()

# print(confusion_matrix(y, pre))

# pre = np.where(pre1 + pre2 + pre3 > 2, 1, 0)
# pre1.sum(), pre2.sum(), pre3.sum(), pre.sum()

# print(confusion_matrix(y, pre))


# pre = np.where(preb1 + preb2 + preb3 > 2, 1, 0)
# pre1.sum(), pre2.sum(), pre3.sum(), pre.sum()

# print(confusion_matrix(y, pre));

[[9384  466]
 [  30  120]]
[[8869  981]
 [   4  146]]


In [140]:
# Pairwise Pearson correlation between the hard predictions of the three
# probabilistic models: low correlation means the members make different
# mistakes, which is what an ensemble needs.
pre = pd.DataFrame(dict(m1=pre1, m2=pre2, m3=pre3))

pre.corr(method='pearson') # good that they are not correlated

Unnamed: 0,m1,m2,m3
m1,1.0,0.263206,0.318043
m2,0.263206,1.0,0.11539
m3,0.318043,0.11539,1.0


In [186]:
from sklearn.ensemble import VotingClassifier


# Soft-voting ensemble of LR, tuned XGBoost and Gaussian NB.
# The weights give the logistic regression ten times the say of the other two.
model = VotingClassifier(
    estimators=[
        ('model1', LogisticRegression(random_state=0, solver='lbfgs', max_iter=5000, class_weight='balanced')),
        ('model2', XGBClassifier(n_jobs=-1, random_state=0, **params)),
        ('model3', GaussianNB()),
        # members that were tried and left out:
#         ('model4', QuadraticDiscriminantAnalysis()),
#         ('model5', LGBMClassifier(class_weight='balanced'))
    ],
    # voting schemes that were tried: hard, unweighted soft, weighted soft (kept)
#     voting='hard',
#     voting='soft',
    voting='soft', weights=[10, 1, 1]
)


# Feature subsets that were tried; the full matrix is used.
# tmp = X[X.columns[~X.columns.str.contains('_')]]
# tmp = X[X.columns[X.columns.str.contains('_')]]
# tmp = X[X.columns[~X.columns.str.contains('x17_|x25_|x22', regex=True)]]
tmp = X

print(tmp.shape)

# Earlier manual evaluation loop, superseded by train()/train_cv() below.
# model.fit(tmp, y)
# print_all(y, model.predict(tmp), model.predict(tmp))
# scores = np.array([])
# for i, j in cv.split(X, y):
#     Xi, yi = X.iloc[i], y[i]
#     Xj, yj = X.iloc[j], y[j]
#     model.fit(Xi, yi)
#     predicted = model.predict(Xj)
#     scores = np.append(scores, roc_auc_score(yj, predicted))
#     print_all(yj, predicted, predicted)
# print(scores, '--', scores.mean(), scores.std())

    
train(tmp, y, model);  # report on the full data set
print('===')
train_cv(tmp, y, model);  # one report per CV fold

(10000, 83)


  if diff:


             precision    recall  f1-score   support

          0       1.00      0.62      0.77      9850
          1       0.03      0.89      0.07       150

avg / total       0.98      0.63      0.76     10000

[[6151 3699]
 [  16  134]]
AUC 0.8276805414551609
===


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.63      0.77      1970
          1       0.03      0.67      0.05        30

avg / total       0.98      0.63      0.76      2000

[[1245  725]
 [  10   20]]
AUC 0.7001353637901861


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.65      0.78      1970
          1       0.03      0.60      0.05        30

avg / total       0.98      0.65      0.77      2000

[[1277  693]
 [  12   18]]
AUC 0.679746192893401


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.62      0.76      1970
          1       0.02      0.63      0.05        30

avg / total       0.98      0.62      0.75      2000

[[1221  749]
 [  11   19]]
AUC 0.6765482233502538


  if diff:


             precision    recall  f1-score   support

          0       0.99      0.63      0.77      1970
          1       0.02      0.50      0.04        30

avg / total       0.97      0.63      0.76      2000

[[1236  734]
 [  15   15]]
AUC 0.5398984771573604
             precision    recall  f1-score   support

          0       0.99      0.65      0.78      1970
          1       0.02      0.57      0.05        30

avg / total       0.98      0.64      0.77      2000

[[1272  698]
 [  13   17]]
AUC 0.6582741116751268
[0.70013536 0.67974619 0.67654822 0.53989848 0.65827411] -- 0.6509204737732656 0.05707848763505407


  if diff:


In [0]:
# Re-run the cross-validation for each ensemble member and for the ensemble
# itself, keeping the per-fold AUC arrays for the comparison table below.
# model1..model3 come from the manual ensemble cell, model is the VotingClassifier.
score1 = train_cv(X, y, model1) # LR
score2 = train_cv(X, y, model2) # XGB
score3 = train_cv(X, y, model3) # NB
score = train_cv(X, y, model) # Voting

In [187]:
# One row per CV fold, one column per model.
tmp = pd.DataFrame({
    'LR': score1,
    'XGB': score2,
    'NB': score3,
    'Ensemble': score,
})

# Compute both statistics from the fold rows only, BEFORE appending either
# summary row, so neither summary row can leak into the other.
# ddof=0 (population std) matches the std printed next to the CV scores.
fold_mean = tmp.mean()
fold_std = tmp.std(ddof=0)

tmp.loc['mean'] = fold_mean
tmp.loc['std'] = fold_std
tmp

Unnamed: 0,Ensemble,LR,NB,XGB
0,0.700135,0.701354,0.592504,0.55731
1,0.679746,0.682995,0.596464,0.527208
2,0.676548,0.674196,0.57165,0.641201
3,0.539898,0.536904,0.499594,0.573367
4,0.658274,0.658156,0.579442,0.565178
mean,0.65092,0.650721,0.567931,0.572853
std,0.057078,0.058594,0.035311,0.037565
