In [13]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from numpy import mean

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold


from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler


%matplotlib inline

In [6]:
# Read the train and test splits from the local `data/` directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,user_state,click,screen_resolution,video,score,reviews,editors_choice,price,iap,installs
0,TX,0,921600,1,4.3,155129,False,0.0,True,10000000
1,IL,0,921600,1,4.3,155129,False,0.0,True,10000000
2,FL,0,4096000,1,4.3,155129,False,0.0,True,10000000
3,FL,0,921600,1,4.3,155129,False,0.0,True,10000000
4,FL,0,4096000,1,4.3,155129,False,0.0,True,10000000


In [7]:
# Remove the raw `user_state` column from both splits.
# One-hot encoding is currently disabled; to enable it, concatenate
# `pd.get_dummies(<frame>.user_state, prefix='state')` onto each frame
# (axis=1) before this drop.
#
# The result is rebound instead of using `inplace=True`, and
# `errors='ignore'` is passed, so re-running this cell is a no-op rather
# than a KeyError on the already-removed column.
train = train.drop(columns=['user_state'], errors='ignore')
test = test.drop(columns=['user_state'], errors='ignore')

# Sum of `click` divided by the number of rows, expressed as a percentage.
print(f'Click percent in train data: {train.click.sum()/train.shape[0]*100}')
print(f'Click percent in test data: {test.click.sum()/test.shape[0]*100}')

train.head()

Click percent in train data: 7.085805742490704
Click percent in test data: 7.876716084038773


Unnamed: 0,click,screen_resolution,video,score,reviews,editors_choice,price,iap,installs
0,0,921600,1,4.3,155129,False,0.0,True,10000000
1,0,921600,1,4.3,155129,False,0.0,True,10000000
2,0,4096000,1,4.3,155129,False,0.0,True,10000000
3,0,921600,1,4.3,155129,False,0.0,True,10000000
4,0,4096000,1,4.3,155129,False,0.0,True,10000000


In [15]:
# Split each frame into the feature matrix and the `click` target.
X_train = train.drop('click', axis=1)
y_train = train['click']

X_test = test.drop('click', axis=1)
y_test = test['click']

# One seed shared by the forests and the CV splitter, so that a
# Restart & Run All reproduces the same scores. Without a fixed
# `random_state` on the forests, the small differences between the three
# variants cannot be told apart from run-to-run noise.
SEED = 1

# Three forests that differ only in their `class_weight` setting:
#   standard  - no class weighting
#   class     - 'balanced'
#   bootstrap - 'balanced_subsample'
model_standart = RandomForestClassifier(n_estimators=10, random_state=SEED)
model_class = RandomForestClassifier(
    n_estimators=10, class_weight='balanced', random_state=SEED
)
model_bootstrap = RandomForestClassifier(
    n_estimators=10, class_weight='balanced_subsample', random_state=SEED
)

# Evaluation procedure: 10-fold stratified CV repeated 3 times (30 scores per model).
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=SEED)


def _cv_roc_auc(model):
    """Return the per-fold ROC AUC scores of `model` on the training split.

    Uses the notebook-level `X_train`, `y_train` and `cv`, so every model is
    scored on exactly the same folds.
    """
    return cross_val_score(model, X_train, y_train, scoring='roc_auc', cv=cv, n_jobs=-1)


# Evaluate each model with the identical procedure.
scores_standart = _cv_roc_auc(model_standart)
scores_class = _cv_roc_auc(model_class)
scores_bootstrap = _cv_roc_auc(model_bootstrap)

# Summarize performance: mean ROC AUC over all folds and repeats.
print('standart:::  Mean ROC AUC: %.3f' % mean(scores_standart))
print('class:::     Mean ROC AUC: %.3f' % mean(scores_class))
print('bootstrap::: Mean ROC AUC: %.3f' % mean(scores_bootstrap))


standart:::  Mean ROC AUC: 0.715
class:::     Mean ROC AUC: 0.712
bootstrap::: Mean ROC AUC: 0.712


In [19]:
# Fit the class-weighted forest on the whole training split and label the
# test split in one chained call (`fit` returns the estimator itself).
y_pred = model_class.fit(X_train, y_train).predict(X_test)

# Hard-label evaluation: confusion matrix first, then the per-class report.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep='\n',
)

# Probability estimates for every test row, one column per class.
probas = model_class.predict_proba(X_test)

print(probas)





[[233471 102580]
 [ 11832  16901]]
              precision    recall  f1-score   support

           0       0.95      0.69      0.80    336051
           1       0.14      0.59      0.23     28733

    accuracy                           0.69    364784
   macro avg       0.55      0.64      0.52    364784
weighted avg       0.89      0.69      0.76    364784

[[0.91073162 0.08926838]
 [0.79519309 0.20480691]
 [0.79519309 0.20480691]
 ...
 [0.14010743 0.85989257]
 [0.84283046 0.15716954]
 [0.92272937 0.07727063]]
