In [0]:
!pip install mlxtend



In [0]:
import pandas as pd
import numpy as np
import tensorflow as tf

# Essemble
from mlxtend.classifier import StackingCVClassifier
from sklearn.naive_bayes import GaussianNB #Naive Bayes
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

#ML steps structure
from sklearn.pipeline import FeatureUnion, Pipeline

#Preprocessing
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler
from sklearn.feature_extraction.text import HashingVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD

from sklearn.model_selection import GridSearchCV

#Metrics
from sklearn.model_selection import cross_val_score

import io
from googleapiclient.http import MediaIoBaseDownload



### Pre processing

In [0]:
from google.colab import auth
auth.authenticate_user()

from googleapiclient.discovery import build
drive_service = build('drive', 'v3')

In [0]:
folder = ''
response = drive_service.files().list(q=" '"+folder +"' in parents",
 spaces='drive',
 fields='nextPageToken, files(id, name)').execute()

In [0]:
datasets=[]

page_token = None
while True:
    response = drive_service.files().list(q="'"+ folder +"' in parents",
                                          spaces='drive',
                                          fields='nextPageToken, files(id, name)',
                                          pageToken=page_token).execute()
    for file in response.get('files', []):
        # Process change
        file_id = file.get('id')
        request = drive_service.files().get_media(fileId=file_id)
        downloaded = io.BytesIO()
        downloader = MediaIoBaseDownload(downloaded, request)
        done = False
        while done is False:
          # _ is a placeholder for a progress object that we ignore.
          # (Our file is small, so we skip reporting progress.)
          _, done = downloader.next_chunk()

        downloaded.seek(0)

        if('.csv' in file.get('name')):
          print(file.get('name'))
          datasets.append(io.BytesIO(downloaded.read()))

    page_token1 = response.get('nextPageToken', None)
    if page_token1 is None:
        break

X_.csv
y_.csv


In [0]:
X_train = pd.read_csv(datasets[0])
y_ = pd.read_csv(datasets[1])
X_values = X_train.loc[:, '0':].values

In [0]:
labels_data = pd.get_dummies(y_['0'])
col_names = list(range(0,11))
labels_data.columns = col_names
labels_data = labels_data.idxmax(axis=1)

### Stacking

In [0]:
# Decision tree 
clfA1 = DecisionTreeClassifier(random_state=0, max_depth = 25)
clfA2 = DecisionTreeClassifier(random_state=10, max_depth = 10, min_samples_split = 4)
clfA3 = DecisionTreeClassifier(random_state=5, max_depth = 15)
clfA4 = DecisionTreeClassifier(random_state=15, max_depth = 25, min_weight_fraction_leaf = 0.2)
clfA5 = DecisionTreeClassifier(random_state=0, max_depth = 25, criterion = "entropy", max_features = 100)
clfA6 = DecisionTreeClassifier(random_state=8, max_depth = 25, max_features = 100)
clfA7 = DecisionTreeClassifier(random_state=0, max_depth = 20, max_features = "sqrt")
clfA8 = DecisionTreeClassifier(random_state=0, max_depth = 20, max_features = "log2")
clfA9 = DecisionTreeClassifier(random_state=0, max_depth = 25, max_features = 0.6, min_samples_split = 3)
clfA10 = DecisionTreeClassifier(random_state=0, max_depth = 25, splitter = "random")

In [0]:
# KNN configurations
clfB1 = KNeighborsClassifier(n_neighbors=7)
clfB2 = KNeighborsClassifier(n_neighbors=5, weights = "distance")
clfB3 = KNeighborsClassifier(n_neighbors=4, weights = "distance")
clfB4 = KNeighborsClassifier(n_neighbors=8)
clfB5 = KNeighborsClassifier(n_neighbors=7, metric = "minkowski", p = 1)
clfB6 = KNeighborsClassifier(n_neighbors=7, algorithm = "ball_tree")
clfB7 = KNeighborsClassifier(n_neighbors=6)
clfB8 = KNeighborsClassifier(n_neighbors=7, algorithm = "kd_tree", leaf_size = 50)
clfB9 = KNeighborsClassifier(n_neighbors=5, algorithm = "kd_tree")
clfB10 = KNeighborsClassifier(n_neighbors=7, algorithm = "brute")

In [0]:
# MLP configurations
clfC1 = MLPClassifier(hidden_layer_sizes=(20,), max_iter=100, alpha=1e-4,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.1, )
clfC2 = MLPClassifier(hidden_layer_sizes=(20,), max_iter=100, alpha=1e-4,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.2, )
clfC3 = MLPClassifier(hidden_layer_sizes=(10,), max_iter=100, alpha=1e-4,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.1, )
clfC4 = MLPClassifier(hidden_layer_sizes=(20,), max_iter=50, alpha=1e-4,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.1, )
clfC5 = MLPClassifier(hidden_layer_sizes=(20,), max_iter=100, alpha=1e-4,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.01, )
clfC6 = MLPClassifier(hidden_layer_sizes=(10,10), max_iter=100, alpha=1e-4,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.1, )
clfC7 = MLPClassifier(hidden_layer_sizes=(20,), max_iter=150, alpha=1e-4,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.1, )
clfC8 = MLPClassifier(hidden_layer_sizes=(20,), max_iter=100, alpha=1e-5,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 5,
                    learning_rate_init=.1, )
clfC9 = MLPClassifier(hidden_layer_sizes=(20,), max_iter=40, alpha=1e-5,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.1, )
clfC10 = MLPClassifier(hidden_layer_sizes=(20,), max_iter=100, alpha=1e-4,
                    solver='sgd', tol=1e-4, random_state=42, momentum = 0.8, 
                    validation_fraction = 0.3, early_stopping = True, n_iter_no_change = 10,
                    learning_rate_init=.5, )

In [0]:


meta = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                    n_estimators=100, max_depth=3)


### 10 estimators

In [0]:
# First stack: 50% DT and 50% KNN, 10 estimators
stack1 = [clfA1, clfA2, clfA3, clfA4, clfA5, 
          clfB1, clfB1, clfB3, clfB4, clfB5]
sclf = StackingCVClassifier(classifiers=stack1,
                            meta_classifier=meta,
                           )
scores = cross_val_score(sclf, X_values, labels_data.values, cv=10, scoring='accuracy')
print(scores)

[0.93732355 0.94155844 0.94466403 0.94918125 0.94127612 0.93929983
 0.93675889 0.9378882  0.94099379 0.93873518]


In [0]:
# Second stack: 50% DT and 50% X, 10 estimators
stack2 = [clfA1, clfA2, clfA3, clfA4, clfA5, 
          clfC1, clfC1, clfC3, clfC4, clfC5]
sclf = StackingCVClassifier(classifiers=stack2,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=10, scoring='accuracy')
print(scores)

[0.89497459 0.92094862 0.92941841 0.94240542 0.9079616  0.92941841
 0.92461886 0.91389046 0.93224167 0.89920949]


In [0]:
# third stack: 50% KNN and 50% MLP, 10 estimators
stack3 = [clfB1, clfB2, clfB3, clfB4, clfB5, 
          clfC1, clfC1, clfC3, clfC4, clfC5]
sclf = StackingCVClassifier(classifiers=stack3,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.49788255 0.62111801 0.61405985 0.601214   0.61857708]


In [0]:
# forth stack: 33% KNN and 33% X, 10 estimators
stack4 = [clfA1, clfA2, clfA3, clfA4, clfB1, 
          clfB2, clfB3, clfC1, clfC2, clfC3]
sclf = StackingCVClassifier(classifiers=stack4,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.90993789 0.91600791 0.91727837 0.91883117 0.91600791]


## 15 estimators

In [0]:
# first stack: 50% KNN and 50% DT, 15 estimators
stack1 = [clfA1, clfA2, clfA3, clfA4, clfA5, clfA6, clfA7, clfA8 ,
          clfB1, clfB2, clfB3, clfB4, clfB5, clfB6, clfB7]
sclf = StackingCVClassifier(classifiers=stack1,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.95059289 0.95511011 0.95087521 0.94452287 0.94988707]


In [0]:
# second stack: 50% KNN and 50% MLP, 15 estimators
stack2 = [clfB1, clfB2, clfB3, clfB4, clfB5, clfB6, clfB7, clfB8 ,
          clfC1, clfC2, clfC3, clfC4, clfC5, clfC6, clfC7]
sclf = StackingCVClassifier(classifiers=stack2,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.61561265 0.61603614 0.60093168 0.63057595 0.54291361]


In [0]:
# third stack: 50% DT and 50% MLP, 15 estimators
stack3 = [clfA1, clfA2, clfA3, clfA4, clfA5, clfA6, clfA7, clfA8, 
          clfC1, clfC2, clfC3, clfC4, clfC5, clfC6, clfC7]
sclf = StackingCVClassifier(classifiers=stack3,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.93266516 0.95426313 0.94480519 0.92108978 0.94085263]


In [0]:
# forth stack: 33% DT, 33% KNN and 33% MLP, 10 estimators
stack4 = [clfA1, clfA2, clfA3, clfA4, clfA5, clfB1, clfB2, clfB3, clfB4, clfB5, 
          clfC1, clfC2, clfC3, clfC4, clfC5]
sclf = StackingCVClassifier(classifiers=stack4,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.92772445 0.932524   0.93732355 0.93111237 0.93704122]


### 20 estimators

In [0]:
# first stack: 50% KNN and 50% DT, 20 estimators
stack1 = [clfA1, clfA2, clfA3, clfA4, clfA5, clfA6, clfA7, clfA8, clfA9, clfA10,
          clfB1, clfB2, clfB3, clfB4, clfB5, clfB6, clfB7, clfB8, clfB9, clfB10]
sclf = StackingCVClassifier(classifiers=stack1,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.95412196 0.95962733 0.95539243 0.94734613 0.95327499]


In [0]:
# second stack: 50% KNN and 50% MLP, 15 estimators
stack2 = [clfB1, clfB2, clfB3, clfB4, clfB5, clfB6, clfB7, clfB8, clfB9, clfB10,
          clfC1, clfC2, clfC3, clfC4, clfC5, clfC6, clfC7, clfC8, clfC9, clfC10]
sclf = StackingCVClassifier(classifiers=stack2,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.49830604 0.62436477 0.62619989 0.63946923 0.5107284 ]


In [0]:
# third stack: 50% DT and 50% MLP, 15 estimators
stack3 = [clfA1, clfA2, clfA3, clfA4, clfA5, clfA6, clfA7, clfA8, clfA9, clfA10, 
          clfC1, clfC2, clfC3, clfC4, clfC5, clfC6, clfC7, clfC8, clfC9, clfC10]
sclf = StackingCVClassifier(classifiers=stack3,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.94932242 0.95750988 0.9512987  0.94438171 0.94085263]


In [0]:
# forth stack: 33% DT, 33% KNN and 33% MLP, 20 estimators
stack4 = [clfA1, clfA2, clfA3, clfA4, clfA5, clfA6, clfA7, 
          clfB1, clfB2, clfB3, clfB4, clfB5, clfB6, clfB7, 
          clfC1, clfC2, clfC3, clfC4, clfC5, clfC6]
sclf = StackingCVClassifier(classifiers=stack4,
                            meta_classifier=meta)
scores = cross_val_score(sclf, X_values, labels_data.values, cv=5, scoring='accuracy')
print(scores)

[0.93817053 0.95073405 0.94099379 0.93958216 0.94551101]
