# 使用傳統的ML 嘗試

In [1]:
import numpy as np
import tensorflow as tf
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import os
import pickle
import numpy as np
from sklearn import preprocessing
from tqdm import tqdm_notebook
from scipy.stats import skew
plt.style.use(["dark_background"])
%matplotlib inline


  from ._conv import register_converters as _register_converters


In [2]:
os.chdir("../")
os.getcwd()

'/project/at082-group17'

## 讀入1~4月的data 並合併

In [3]:
## load 'data'
dataNameList = ['201812', '201901', '201902' ,'201903']
dataCount = []
df_all = pd.DataFrame()
for name in dataNameList:
    with open('data/pickle/' + name + '.p', "rb") as file:
        loadData = pickle.load(file)
        dataCount.append(len(loadData.index))
        df_all = pd.concat((df_all, loadData), axis = 0, ignore_index = False)
print('data count ' + str(len(df_all.index)))

data count 38512321


In [4]:
df_all.describe()

Unnamed: 0,domain_count,dayofweek,time
count,38512320.0,38512320.0,38512320.0
mean,2676249.0,2.285623,-0.4457066
std,1759893.0,1.537794,0.5657999
min,1.0,0.0,-1.0
25%,1051705.0,1.0,-0.9238795
50%,2760272.0,2.0,-0.6593458
75%,4825825.0,4.0,-0.03686173
max,4833533.0,6.0,1.0


## 合併信件的type

In [5]:
mail = pd.read_csv('data/mail_dataV2.csv')

In [6]:
mail = mail.drop(['fromName', 'fromMailAddress', 'subject', 'scheduledDate'], axis = 1)

In [8]:
mail = mail.drop(['Unnamed: 0'], axis = 1)

In [9]:
mail

Unnamed: 0,campaignSn,mailType
0,000000005f59c23b015f73f2f5107a39,0
1,000000005f59c23b015f73f416917a5e,0
2,000000005f59c23b015f73f6d5207ac6,0
3,000000005f59c23b015f73f755077adc,0
4,000000005f59c23b015f710327035577,1
5,000000005f59c23b015f5b3da4dd1bd4,2
6,000000005f59c23b015f71d2723d78ff,0
7,000000005f59c23b015f74e9266313c6,4
8,000000005f59c23b015f74edf4f014b3,0
9,000000005f59c23b015f702e7e7c5651,0


In [10]:
df_all = df_all.merge(mail, how="left", on="campaignSn")

## 先將train資料切出來(要取開信與收信率)

In [None]:
trainCount = df_all[df_all['scheduledDate'].apply(lambda x :x.month) <= 3].count()
trainCount = trainCount[0]

In [None]:
df_train = df_all[:trainCount]

## 移除收信資料少於5封的收件者

In [None]:
df_temp = df_all[["recipientSn", "mailAddress"]].groupby("mailAddress").count().reset_index()

df_temp.columns = ["mailAddress", "mailAddress_count"]
df_temp.sort_values(by=['mailAddress_count'])

df_all = df_all.merge(df_temp, how="left", on="mailAddress")
df_all = df_all[df_all['mailAddress_count'] >= 5]
#del df_all['mailAddress_count']

## 移除沒有開過信的使用者

In [None]:
df_temp = df_train.groupby("mailAddress")['openedFlag'].agg(['sum'])
df_temp.columns = ["openSum"]
df_all = df_all.merge(df_temp, how="left", on="mailAddress")
df_all = df_all[df_all['openSum'] > 0]
#del df_all['sum']

In [None]:
trainCount = df_all[df_all['scheduledDate'].apply(lambda x :x.month) <= 3].count()
trainCount = trainCount[0]

## Standardization
正規化 將數值轉為[0,1] or [-1,1] <br>
只對數值類型的做

In [None]:
df_all[ 'time' ] = preprocessing.scale( df_all[ 'time'] ,with_mean=True,with_std=True)
df_all[ 'domain_count' ] = preprocessing.scale( df_all[ 'domain_count'],with_mean=True,with_std=True )
df_all[ 'mailAddress_count' ] = preprocessing.scale( df_all[ 'mailAddress_count'],with_mean=True,with_std=True )
df_all[ 'openSum' ] = preprocessing.scale( df_all[ 'openSum'],with_mean=True,with_std=True )


In [None]:
data = df_all['time']
plt.hist(data, bins= 30)

In [None]:
df_all['dayofweek'] = df_all['dayofweek'].astype( str )
df_all['mailType'] = df_all['mailType'].astype( str )


In [None]:
df_all = df_all.drop(['domain','recipientSn', 'campaignSn', 'mailAddress', 'fromMailAddress', 'scheduledDate'], axis = 1)

In [None]:
df_all

## One-Hot Encoding
要訓練非數值的feature 需要做One-Hot EncodingOne hot

In [None]:
df_all_dum = pd.get_dummies(df_all)


## 讀存檔

In [None]:
## save data as pickle format
with open('data/pickle/' + 'df_all_dum' + '.p', "wb") as file:
    pickle.dump(df_all_dum, file)

In [9]:
## load 'df_all_dum'
with open('data/pickle/' + 'df_all_dum' + '.p', "rb") as file:
    df_all_dum = pickle.load(file)

## Training 開始
資料分割


In [10]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

print("df_all_dum = ", df_all_dum.shape)
print("df_train = ",trainCount)

#將train的部分取出(因為前面有先將train和test的資料和並)
X = df_all_dum[:trainCount]
X = X.drop(['openedFlag'], axis = 1)
y = df_all_dum['openedFlag']
y = y[:trainCount]
print("X = ",X.shape)

#切分為trainData和ValData
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state = 77, stratify=y)

#將後半剩下的test資料取出
y_test = df_all_dum[ trainCount : ]
y_test = y_test[ 'openedFlag' ]
X_test = df_all_dum[ trainCount : ]
X_test = X_test.drop(['openedFlag'], axis = 1)

print("X_test = ",X_test.shape)


df_all_dum =  (14335246, 12)
df_train =  10836556
X =  (10836556, 11)
X_test =  (3498690, 11)


### Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score


  from numpy.core.umath_tests import inner1d


In [20]:
clf = RandomForestClassifier(verbose = 1)
clf.fit(X_train, y_train)

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  6.4min finished


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=1,
            warm_start=False)

In [20]:
with open('data/pickle/RandomForestModel', "wb") as file:
    pickle.dump(clf, file)

In [None]:

threshold_RF = 0.8

y_pred_proba_RF = clf.predict_proba(X_val)
y_pred_RF = (y_pred_proba_RF [:,1] >= threshold_RF).astype('int')

print('acc ' + str(accuracy_score(y_val, y_pred_RF)))
print("recall = " + str(recall_score(y_val, y_pred_RF)))
print('f1_score ' + str(f1_score(y_val, y_pred_RF)))

In [22]:
def errorCount(y_pred, y_val):
    errorCount = 0
    missCount = 0
    for i in range(y_val.shape[0]):
        if y_val.data[i] != y_pred.data[i] and y_pred.data[i] == 1:
            errorCount = errorCount + 1
        if y_val.data[i] != y_pred.data[i] and y_pred.data[i] == 0:
            missCount = missCount + 1
    print('error count : ' + str(errorCount))
    print('miss count : ' + str(missCount))

In [23]:
errorCount(y_pred_RF, y_val)

  """
  import sys


error count : 24605
miss count : 228629


In [24]:
y_test_proba = clf.predict_proba(X_test)
y_ans_pred = (y_test_proba [:,1] >= threshold_RF).astype('int')

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    8.1s finished


In [25]:
print('acc ' + str(accuracy_score(y_test, y_ans_pred)))
print("recall = " + str(recall_score(y_test, y_ans_pred)))
print('f1_score ' + str(f1_score(y_test, y_ans_pred)))

acc 0.749054360346301
recall = 0.25923827983976844
f1_score 0.36736361455037525


### lightGBM

In [16]:
import lightgbm as lgb
pos_weight = 5
lgbm = lgb.LGBMClassifier(objective = 'binary', scale_pos_weight=pos_weight)
lgbm.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=-1,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=100, n_jobs=-1, num_leaves=31, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.0,
        scale_pos_weight=5, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=0)

In [22]:
with open('data/pickle/LightGBMModel', "wb") as file:
    pickle.dump(lgbm, file)

In [19]:
threshold_GBM = 0.5

y_pred_proba_GBM = lgbm.predict_proba(X_val)
y_pred_GBM = (y_pred_proba_GBM [:,1] >= threshold_GBM).astype('int')
print("acc = " + str(accuracy_score(y_val, y_pred_GBM)))
print("recall = " + str(recall_score(y_val, y_pred_GBM)))
print("f1 = " + str(f1_score(y_val, y_pred_GBM)))

acc = 0.6472173826380327
recall = 0.9042156351530224
f1 = 0.6227968651177752


In [29]:
errorCount(y_pred_GBM, y_val)

  """
  import sys


error count : 59078
miss count : 95929


In [18]:
sendRate = y_pred_GBM.sum() / X_val.shape[0]
print(sendRate)
print( y_pred_GBM.sum())
print(X_val.shape[0])

0.9137538574167853
753878
825034


In [100]:
 y_pred_GBM.sum()

825034

In [26]:
threshold_GBM = 0.8

y_pred_proba_GBM = lgbm.predict_proba(X_test)
y_pred_GBM = (y_pred_proba_GBM [:,1] >= threshold_GBM).astype('int')
print("acc = " + str(accuracy_score(y_test, y_pred_GBM)))
print("recall = " + str(recall_score(y_test, y_pred_GBM)))
print("f1 = " + str(f1_score(y_test, y_pred_GBM)))

acc = 0.7913750295723617
recall = 0.5536058676322264
f1 = 0.5701224922622933


In [27]:
sendRate = y_pred_GBM.sum() / y_test.shape[0]
print(sendRate)
print( y_pred_GBM.sum())
print(y_test.shape[0])

0.23541673916948505
669694
2844717


## Naive Bayes Classification


In [12]:
from sklearn.naive_bayes import GaussianNB
modelNB = GaussianNB()
modelNB.fit(X_train, y_train)

GaussianNB(priors=None)

In [17]:
threshold_NB = 0.1

y_pred_proba_NB = modelNB.predict_proba(X_val)
y_pred_NB = (y_pred_proba_NB [:,1] >= threshold_NB).astype('int')
print("acc = " + str(accuracy_score(y_val, y_pred_NB)))
print("recall = " + str(recall_score(y_val, y_pred_NB)))
print("f1 = " + str(f1_score(y_val, y_pred_NB)))

acc = 0.41662298736868525
recall = 0.9806580447750076
f1 = 0.519892219152365
