In [3]:
#Stacking

import pandas as pd
import numpy as np
from sklearn import preprocessing
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM,GRU
from sklearn.linear_model import LogisticRegression
import sklearn
from keras import layers
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
import time
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt
import xgboost as xgb
from xgboost.sklearn import XGBClassifier as xgbc

#ROC AUC
def paintROC(y_test1,y_pred1):
    """Plot the ROC curve for one set of predictions and display it.

    Args:
        y_test1: True binary labels (passed to ``roc_curve`` as the first
            argument).
        y_pred1: Predicted scores for the positive class (passed to
            ``roc_curve`` as the second argument).

    The area under the curve is computed with ``auc`` and shown in the
    legend. The function returns nothing; it ends with ``plt.show()``.
    """
    # roc_curve also returns the thresholds, which are not needed for the plot.
    fpr, tpr, _ = roc_curve(y_test1, y_pred1)
    roc_auc = auc(fpr, tpr)

    lw = 2
    # Open exactly one figure. A bare plt.figure() before this call would
    # leave an extra, empty figure behind on every invocation.
    plt.figure(figsize=(10, 10))
    plt.plot(fpr, tpr, color='blue',
             lw=lw, label='ROC curve (area = %0.4f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()
    
#LSTM
def LSTM_():
    """Build and compile the LSTM binary classifier.

    Architecture: Embedding(max_features, 128) -> LSTM(128) -> Dense(1)
    -> sigmoid, compiled with binary cross-entropy loss and the RMSprop
    optimizer.

    Reads the module-level names ``max_features`` and ``maxlen`` at call
    time, so both must be assigned before this function is called.

    Returns:
        The compiled ``Sequential`` model (not yet fitted).
    """
    stack = (
        Embedding(max_features, 128, input_length=maxlen),
        LSTM(128),
        Dense(1),
        Activation('sigmoid'),
    )
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(optimizer='rmsprop', loss='binary_crossentropy')
    return net

#XGBOOST
def xgboost_():
    """Create an unfitted XGBoost classifier with fixed hyper-parameters.

    Returns:
        An ``XGBClassifier`` (imported in this file as ``xgbc``) configured
        with the settings below; the caller is responsible for fitting it.
    """
    # Keyword arguments are passed in the same order and with the same
    # values as a direct constructor call would use.
    hyper_params = {
        'silent': 0,
        'learning_rate': 0.3,
        'min_child_weight': 1,
        'max_depth': 9,
        'gamma': 0.3,
        'subsample': 0.9,
        'max_delta_step': 0,
        'colsample_bytree': 0.8,
        'colsample_bylevel': 1,
        'reg_alpha': 0.01,
        'reg_lambda': 1,
        'scale_pos_weight': 1,
        'n_estimators': 150,
        'objective': 'binary:logistic',
        'seed': 0,
    }
    return xgbc(**hyper_params)

#CNN
def CNN_1v():
    """Build and compile the 1-D convolutional binary classifier.

    Architecture: Embedding(max_features, 128) -> Conv1D(32, 5, relu)
    -> MaxPooling1D(3) -> Conv1D(32, 5, relu) -> GlobalMaxPooling1D
    -> Dense(1) -> sigmoid, compiled with binary cross-entropy loss and
    the RMSprop optimizer.

    Reads the module-level names ``max_features`` and ``maxlen`` at call
    time, so both must be assigned before this function is called.

    Returns:
        The compiled ``Sequential`` model (not yet fitted).
    """
    stack = (
        Embedding(max_features, 128, input_length=maxlen),
        layers.Conv1D(32, 5, activation='relu'),
        layers.MaxPooling1D(3),
        layers.Conv1D(32, 5, activation='relu'),
        layers.GlobalMaxPooling1D(),
        Dense(1),
        Activation('sigmoid'),
    )
    net = Sequential()
    for layer in stack:
        net.add(layer)
    net.compile(optimizer='rmsprop', loss='binary_crossentropy')
    return net

#Input data
#indata=pd.read_csv("../input/DGA_test.csv")
# Load the dataset: the 'url' column supplies the input strings and the
# 'label' column the targets.
indata = pd.read_csv(r'E:\Data_test\DGA_test.csv')

X = indata['url']
labels = indata['label']

# Generate a dictionary of valid characters (character -> integer index).
# Indices start at 1, so 0 is never assigned to a real character.
# The characters are sorted before numbering: iterating a bare set of
# strings depends on per-process hash randomisation, which would give a
# different encoding on every interpreter run and make results (and any
# saved model) impossible to reproduce.
valid_chars = {ch: idx + 1 for idx, ch in enumerate(sorted(set(''.join(X))))}

# Vocabulary size for the Embedding layers: every character plus index 0.
max_features = len(valid_chars) + 1
# Longest input string; every sequence is padded to this length.
maxlen = max(len(x) for x in X)
# Convert characters to int and pad
X = [[valid_chars[ch] for ch in x] for x in X]
X = sequence.pad_sequences(X, maxlen=maxlen)

#Standardization
#X=preprocessing.scale(X)
y = np.array(labels)

from sklearn.model_selection import StratifiedKFold
cv_number=5# parameter
skf = StratifiedKFold(n_splits=cv_number)  # x-fold cv
print("CV method:StratifiedKFlod  Split number: ",cv_number)

# Per-fold scores of the stacked model.
ac_test=[]
ac_train=[]
ac_test_auc=[]
ac_train_auc=[]

# For each stratified fold: fit the three base models (LSTM, XGBoost, CNN),
# fit a logistic-regression meta-model on their predictions, then record
# accuracy and ROC AUC on both the training and the held-out part.
for term, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print("Start:   current: ",term,"  ","Total: ",cv_number)
    time_start=time.time()

    X_train,X_test = X[train_index], X[test_index]
    y_train,y_test = y[train_index], y[test_index]

    # NOTE(review): `nb_epoch` is the older Keras spelling of `epochs`;
    # kept as-is to match the Keras version this file was written for --
    # confirm against the installed version.
    model1=LSTM_()
    model1.fit(X_train,y_train,batch_size=20,nb_epoch=1)
    print("LSTM Done")
    model2=xgboost_()
    model2.fit(X_train,y_train)
    print("XGBoost Done")
    model3=CNN_1v()
    model3.fit(X_train,y_train,batch_size=20,nb_epoch=1)
    print("CNN Done")

    # Meta-features: the base models' predictions stacked column-wise.
    # NOTE(review): the meta-model is trained on predictions for the same
    # rows the base models were fitted on, so the train-set scores below
    # are optimistic.
    meta_data_train=np.c_[model1.predict(X_train),
                          model2.predict_proba(X_train),
                          model3.predict(X_train)]
    meta_data_test=np.c_[model1.predict(X_test),
                         model2.predict_proba(X_test),
                         model3.predict(X_test)]

    #model stacking
    model_stack=LogisticRegression(class_weight='balanced')
    model_stack.fit(meta_data_train,y_train)

    # Probability of the positive class (column 1) for the AUC and ROC plot.
    y_pred_test=model_stack.predict_proba(meta_data_test)[:,1]
    y_pred_train=model_stack.predict_proba(meta_data_train)[:,1]

    # `sklearn.metrics` is used explicitly here: the bare name `metrics`
    # is never imported in this file and would raise NameError.
    ac_test.append(sklearn.metrics.accuracy_score(
        y_test.astype('int'),
        model_stack.predict(meta_data_test).astype('int')))
    ac_train.append(sklearn.metrics.accuracy_score(
        y_train.astype('int'),
        model_stack.predict(meta_data_train).astype('int')))
    ac_test_auc.append(sklearn.metrics.roc_auc_score(y_test, y_pred_test))
    ac_train_auc.append(sklearn.metrics.roc_auc_score(y_train,y_pred_train))

    time_end=time.time()
    print("Time:",round(time_end-time_start,4))

    #plot roc_curve
    paintROC(y_test,y_pred_test)

# Convert the per-fold score lists to arrays and report their means.
ac_test_auc=np.array(ac_test_auc)
ac_train_auc=np.array(ac_train_auc)
ac_test=np.array(ac_test)
ac_train=np.array(ac_train)
print("Mean Accuracy of Train Set: ",round(ac_train.mean(),4),
      '\n Mean Accuracy of the Test Set:',round(ac_test.mean(),4),
      '\n Mean AUC of the Train Set:',round(ac_train_auc.mean(),4),
      '\n Mean AUC of the Test Set:',round(ac_test_auc.mean(),4)
     )
    
    
    

CV method:StratifiedKFlod  Split number:  3
Start:   current:  1    Total:  3




Epoch 1/1
 9200/46666 [====>.........................] - ETA: 1:37 - loss: 0.2836- ETA: 1:37 - loss: 

KeyboardInterrupt: 