In [1]:
# Mount Google Drive at /content/drive so the Drive paths used below resolve.
# The `google.colab` import is specific to the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import numpy as np
import pandas as pd 
import os

from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.pipeline import Pipeline
from sklearn.neural_network import  MLPClassifier
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import  Ridge
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import cloudpickle
import pickle 


import warnings
from matplotlib import pyplot as plt 
# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library warnings raised by later
# cells are hidden as well - consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Turn off pandas' chained-assignment warning.
pd.options.mode.chained_assignment = None  # default='warn'

In [3]:
# Raw competition tables live on the mounted Drive. The project folder name
# ends with a space ("Code Review "), so the paths are kept verbatim.
train_csv_path = '/content/drive/MyDrive/Code Review /Train.csv'
test_csv_path = '/content/drive/MyDrive/Code Review /Test.csv'

train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [4]:
# Copy every base model's prediction CSVs (train + test) from the Drive
# project folder into the current working directory, so they can be read
# by bare filename afterwards.

# DNN predictions
!cp "/content/drive/MyDrive/Code Review /DNN_train.csv" .
!cp "/content/drive/MyDrive/Code Review /DNN_test.csv" .

# DNN (truncated variant) predictions
!cp '/content/drive/MyDrive/Code Review /DNN_truncated_train.csv' .
!cp '/content/drive/MyDrive/Code Review /DNN_truncated_test.csv' .

# MLP predictions
!cp "/content/drive/MyDrive/Code Review /mlp_train.csv" .
!cp "/content/drive/MyDrive/Code Review /mlp_test.csv" .



# MLP (truncated variant) predictions
!cp "/content/drive/MyDrive/Code Review /mlp_truncated_train.csv" .
!cp "/content/drive/MyDrive/Code Review /mlp_truncated_test.csv" .

In [5]:
# Collect the soft pseudo-labels produced by every base model and stack them
# column-wise into one table per split.
#
# `dataset` maps a model tag to its [train_csv, test_csv] pair; the tag is
# prefixed to each column name so columns from different models never clash.

dataset = {
    'MLP': ['mlp_train.csv', 'mlp_test.csv'],
    'MLP_truncated': ['mlp_truncated_train.csv', 'mlp_truncated_test.csv'],
    'DNN': ['DNN_train.csv', 'DNN_test.csv'],
    'DNN_truncated': ['DNN_truncated_train.csv', 'DNN_truncated_test.csv'],
}

# Gather the pieces first and concatenate once at the end: this avoids
# re-copying the growing frame on every iteration and avoids seeding the
# test table from an empty DataFrame.
train_parts = [train[['Label']]]  # ground-truth label stays the first column
test_parts = []

for model_tag, (train_csv, test_csv) in dataset.items():
    preds_train = pd.read_csv(train_csv)
    preds_train.columns = [f'{model_tag}_{c}' for c in preds_train.columns]
    train_parts.append(preds_train)

    preds_test = pd.read_csv(test_csv)
    preds_test.columns = [f'{model_tag}_{c}' for c in preds_test.columns]
    test_parts.append(preds_test)

# `axis` must be passed by keyword: pandas 2.0 rejects it as a positional
# argument to `pd.concat`.
df_train = pd.concat(train_parts, axis=1)
df_test = pd.concat(test_parts, axis=1)

In [6]:
# Name of the target column; every other column of `df_train` is a feature.
LABEL = 'Label'
FEATURES = df_train.columns.drop(labels=[LABEL])

## Generate Pseudo-Labels

In [7]:
# Stratified K-fold training of an MLP on the stacked base-model outputs.
# Produces:
#   y_oof  - out-of-fold class probabilities for every training row
#   y_cv   - out-of-fold hard predictions (encoded class ids)
#   y_test - test-set class probabilities averaged over the folds
n_splits = 5
kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=47)

X_train = df_train[FEATURES].copy()
X_test = df_test[FEATURES].copy()

# Encode the labels as integer class ids.
lb = LabelEncoder()
y_train = lb.fit_transform(df_train[LABEL].copy())

n_labels = df_train[LABEL].unique().shape[0]
y_oof = np.zeros([X_train.shape[0], n_labels])
y_cv = np.zeros([X_train.shape[0]])
y_test = np.zeros([X_test.shape[0], n_labels])

metrics = list()
for fold_no, (fit_rows, holdout_rows) in enumerate(kf.split(X_train, y_train)):
    X_fit, y_fit = X_train.iloc[fit_rows, :], y_train[fit_rows]
    X_holdout, y_holdout = X_train.iloc[holdout_rows, :], y_train[holdout_rows]

    # A fresh model per fold, with a fixed seed.
    model = MLPClassifier(hidden_layer_sizes=200, activation='logistic', random_state=47)
    model.fit(X_fit, y_fit)

    holdout_pred = model.predict(X_holdout)
    y_cv[holdout_rows] = holdout_pred
    y_oof[holdout_rows, :] = model.predict_proba(X_holdout)

    # Each fold contributes 1/n_splits of the final test probabilities.
    y_test += model.predict_proba(X_test) / n_splits

    metric = accuracy_score(y_holdout, holdout_pred)
    metrics.append(metric)
    # NOTE(review): the printed tag says "val_loss", but the value is the
    # fold's accuracy (higher is better).
    print("fold #{} val_loss: {}".format(fold_no, metric))


# Collapse the per-fold scores into their mean (rebinding `metrics`).
metrics = np.mean(metrics)
print(f'Full accuracy {metrics}')

fold #0 val_loss: 0.6631944444444444
fold #1 val_loss: 0.6933797909407665
fold #2 val_loss: 0.6759581881533101
fold #3 val_loss: 0.6689895470383276
fold #4 val_loss: 0.6794425087108014
Full accuracy 0.6761928958575301


In [8]:
# Wrap the teacher's probability arrays (out-of-fold for train, fold-averaged
# for test) in DataFrames so they can be joined onto the raw tables.
teacher_train, teacher_test = map(pd.DataFrame, (y_oof, y_test))

### Distillation

In [9]:
# Attach the teacher outputs as extra columns beside the raw rows.
# `axis` is passed by keyword because pandas 2.0 no longer accepts it
# positionally in `pd.concat`.
# NOTE: this cell rebinds `train` / `test`; re-running it without reloading
# the raw CSVs first would append the teacher columns a second time.
train = pd.concat((train, teacher_train), axis=1)

test = pd.concat((test, teacher_test), axis=1)

In [10]:
# Stack the train and test rows vertically into one table; the student is
# later fitted on this combined set.
df_teacher = pd.concat([train, test], axis=0)

In [11]:
# Teacher latent representation: everything in `df_teacher` except the
# identifier, raw text and ground-truth label columns.
# `columns=` replaces the positional `axis=1` argument, which pandas 2.0
# rejects in `DataFrame.drop`.
teacher_label = df_teacher.drop(columns=['ID', 'Text', 'Label'])

In [12]:
# Student model: TF-IDF over the raw text, followed by one Ridge regressor
# per teacher output column (via MultiOutputRegressor).
student = Pipeline(
    steps=[
        ('vectorizer', TfidfVectorizer()),
        ('ridge', MultiOutputRegressor(estimator=Ridge())),
    ]
)

# Fit the student to reproduce the teacher's outputs from text alone.
student.fit(X=df_teacher['Text'], y=teacher_label)

Pipeline(memory=None,
         steps=[('vectorizer',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('ridge',
                 MultiOutputRegressor(estimator=Ridge(alpha=1.0, copy_X=True,
                                                      fit_inte

In [13]:
# Sanity check: list the current working directory.
!ls

DNN_test.csv		 drive			 mlp_truncated_train.csv
DNN_train.csv		 mlp_test.csv		 sample_data
DNN_truncated_test.csv	 mlp_train.csv
DNN_truncated_train.csv  mlp_truncated_test.csv


In [15]:
# Change the working directory to the Drive project folder, so relative paths
# used after this point resolve there. The folder name ends with a space.
%cd /content/drive/MyDrive/Code Review /

/content/drive/MyDrive/Code Review 


In [21]:
# Persist the fitted `student` object to the current working directory.
# NOTE: despite the file name, this stores the whole `student` object, not
# only a vectorizer. The context manager guarantees the file is flushed and
# closed even if pickling raises.
with open('vectorizer_v2.pkl', 'wb') as model_file:
    pickle.dump(student, model_file)

In [None]:
# Share link for the saved artifact. Kept as a comment because a bare URL is
# not valid Python and would stop "Run All".
# NOTE(review): the Drive file ID between "d/" and "/view" is missing.
# https://drive.google.com/file/d//view?usp=sharing