# Loading Packages and Data

In [0]:
# Mount Google Drive so the notebook can read the data files stored there.
# This triggers the interactive authorization prompt; force_remount=True
# re-mounts even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/gdrive/', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [0]:
# Upgrade tqdm before it is imported (the import cell uses `tqdm.notebook`).
!pip3 install tqdm --upgrade

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.43.0)


In [0]:
import numpy as np 
from tqdm.notebook import tqdm 
import pandas as pd 
from sklearn.preprocessing import OrdinalEncoder 

In [0]:
cd '/content/gdrive/My Drive/Altegrad'

/content/gdrive/My Drive/Altegrad


### Load Train Data

In [0]:
# Training index: one row per document, with its file id and category name.
train_data = pd.read_csv('./embeds/' + 'train_noduplicates.csv', header=None)
train_data.columns = ['File', 'Type']

# Test index: file ids only (no category).
test_data = pd.read_csv('./embeds/' + 'test.csv', header=None)
test_data.columns = ['File']

# Encode the category names as ordinal codes. The encoder expects a 2-D
# input, hence the reshape to a single column.
X = train_data['Type']
enc = OrdinalEncoder()
labels = enc.fit_transform(X.values.reshape(-1, 1))
train_data['Labels'] = labels

### Load Embeddings

In [0]:
path_to_data = './text/'

import pickle

# Set to True to rebuild the word-vector table from the raw fastText text
# file instead of loading the pre-computed pickle.
Load_embegginds = False

if Load_embegginds:
  # Each line of the .vec file is "<token> <v1> <v2> ...". Tokens are
  # lower-cased and only purely alphabetic ones are kept, which also drops a
  # numeric header line if the file has one. If several tokens share the same
  # lower-cased form, the last one read wins.
  my_vectors = {}
  with open('./cc.fr.300.vec', encoding='utf-8') as vec_file:
    for line in vec_file:
      fields = line.strip().split(" ")
      nom = fields[0].lower()
      if nom.isalpha():
        my_vectors[nom] = [float(v) for v in fields[1:]]

  # The following cells read `embeddings`, so expose the freshly built table
  # under that name (previously this branch left `embeddings` undefined).
  embeddings = my_vectors

else:
  # NOTE(security): pickle.load can execute arbitrary code; only load pickle
  # files that come from a trusted source.
  with open('./embeds/pickle_embed.pickle', 'rb') as handle:
    embeddings = pickle.load(handle)

### Load Vocab

In [0]:
# Deserialize the cleaned vocabulary stored alongside the embeddings.
with open('./embeds/vocab_clean.pickle', 'rb') as vocab_file:
  vocab = pickle.load(vocab_file)

In [0]:
# Display the number of vocabulary entries and the number of word vectors.
len(vocab), len(embeddings)

(6960, 1142478)

In [0]:
def extract_file_embedding(file, dim=300):
  """Build one document vector as a weighted sum of word embeddings.

  Reads './embeds/Vocab_occurences/pickles/<file>.pickle' and combines it
  with the module-level `vocab` and `embeddings` mappings. Both the pickled
  object and `vocab` are used as mappings from a word to a number
  (presumably occurrence counts -- confirm against the code that wrote them).

  The weight of a word k present in both the document and `vocab` is
      (max(vocab.values()) / max(doc.values())) * doc[k] / vocab[k]
  and the result is the sum of `embeddings[k] * weight` over the weighted
  words that also have an embedding.

  Args:
    file: Base name (without '.pickle') of the document pickle.
    dim: Length of the zero vector returned when nothing can be summed.
      It should match the embedding dimension.

  Returns:
    A numpy array: the weighted sum, or `np.zeros(dim)` when the document is
    empty or shares no word with both `vocab` and `embeddings`. (Previously
    these cases raised or returned the integer 0, which has no length.)
  """
  # NOTE(security): pickle.load can execute arbitrary code; only load pickle
  # files that come from a trusted source.
  pickle_path = './embeds/Vocab_occurences/pickles/' + file + '.pickle'
  with open(pickle_path, 'rb') as handle:
    my_data = pickle.load(handle)

  # max() of an empty mapping would raise; an empty document has no content
  # to embed.
  if not my_data:
    return np.zeros(dim)

  w_c = max(vocab.values()) / max(my_data.values())
  common_vocab = {k: float(w_c * my_data[k]/vocab[k]) for k in my_data.keys() & vocab}

  embedding_words = {k: np.array(embeddings[k]) * common_vocab[k] for k in common_vocab.keys() & embeddings}

  # sum() over no vectors yields the int 0, so return an explicit zero vector.
  if not embedding_words:
    return np.zeros(dim)

  vect_file = sum(embedding_words.values())

  return vect_file

In [0]:
# Quick manual check of the helper on the document stored as '0.pickle'.
test = extract_file_embedding('0')

In [0]:
# Set to True to recompute every document embedding from the per-document
# pickles instead of loading the cached result.
train_embedding_doc = False

if train_embedding_doc:

  from os import listdir
  test = []                  # documents whose embedding could not be computed
  vocab_embedding_docs = {}

  for file in tqdm(listdir('./embeds/Vocab_occurences/pickles')):
    file = file.split('.')[0]
    try:
      # Compute the embedding once and reuse it for both the length check
      # and the stored value.
      doc_vector = extract_file_embedding(file)
      if len(doc_vector) == 300:
        vocab_embedding_docs[file] = doc_vector
    except Exception:
      # Best-effort: fall back to an all-zero vector and remember the file.
      # `Exception` (rather than a bare except) lets Ctrl-C interrupt the loop.
      vocab_embedding_docs[file] = np.zeros(300, dtype=int)
      test.append(file)

else:
  # NOTE(security): pickle.load can execute arbitrary code; only load pickle
  # files that come from a trusted source.
  with open('./embeds/doc_vocab_embed.pickle', 'rb') as handle:
    vocab_embedding_docs = pickle.load(handle)

In [0]:
# Display how many documents have an entry in the embedding table.
len(vocab_embedding_docs)

2555

In [0]:
# Assemble the feature matrix X and label column y from the documents that
# have both a 300-dimensional embedding and a row in train_data.
my_list = []   # document keys skipped because no label could be attached
X = []
y = []
for element, doc_vector in vocab_embedding_docs.items():
  try:
    if len(doc_vector) == 300:
      y_t = train_data[train_data['File'] == int(element)]['Labels'].iloc[0]
      y.append(y_t)
      X.append(doc_vector)
  except (IndexError, ValueError, TypeError):
    # IndexError: the id has no row in train_data (e.g. a test document).
    # ValueError: the key is not an integer id.
    # TypeError: the stored value has no length (not a vector).
    my_list.append(element)

X = np.vstack(X)

y = np.array(y).reshape(-1, 1)
X.shape, y.shape


((1994, 300), (1994, 1))

# Predictions

### Using MLP

In [0]:
# A `from __future__` import must be the first statement of the cell;
# placing it after another import is a SyntaxError.
from __future__ import print_function

from sklearn.model_selection import train_test_split

import keras
from keras.datasets import mnist
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.optimizers import RMSprop

from keras.constraints import unit_norm

In [0]:
# MLP training settings: mini-batch size, number of target classes (width of
# the one-hot labels and of the softmax output layer), and number of epochs.
batch_size, num_classes, epochs = 128, 8, 100

In [0]:
# Hold out 30% of the labelled documents. No random_state is passed, so the
# split differs on every run.
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)


# Five 512-unit hidden layers (alternating relu / tanh), each followed by
# 20% dropout, then a softmax layer with one unit per class.
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(300,)))
model.add(Dropout(0.2))
model.add(Dense(512, activation='tanh'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='tanh', kernel_constraint=unit_norm()))
model.add(Dropout(0.2))
# Keras 2 names this argument `use_bias`; `bias` is the Keras 1 spelling.
model.add(Dense(512, activation='relu', use_bias=True))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

model.summary()

model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(),
              metrics=['accuracy'])

# NOTE: the held-out split is used both as validation data during training
# and for the final evaluation below.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_data=(x_test, y_test))
score = model.evaluate(x_test, y_test, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [0]:
# Collect the embeddings of the test documents, in test_data order.
# NOTE: documents that are missing or whose vector is not 300 long are left
# out, so the rows of X_export line up with test_data['File'] only when
# nothing was skipped -- check my_list and X_export.shape before exporting.
X_export = []
my_list = []   # test ids with no usable entry in vocab_embedding_docs
for element in test_data['File']:
  try:
    doc_vector = vocab_embedding_docs[str(element)]
    if len(doc_vector) == 300:
      X_export.append(doc_vector)
  except (KeyError, TypeError):
    # KeyError: no embedding stored for this id.
    # TypeError: the stored value has no length (not a vector).
    my_list.append(element)

X_export = np.vstack(X_export)

### Using LR

In [0]:
import logging
# Only let messages of level ERROR and above through for the
# "pytorch_transformers.tokenization_utils" logger.
logging.getLogger("pytorch_transformers.tokenization_utils").setLevel(logging.ERROR)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

In [0]:
def loglikelihood_score(y_true, predictions, classes_order, eps=1e-15):
    """Mean negative log-likelihood of the true classes (multi-class log-loss).

    Args:
        y_true: True labels, one per sample. Any array-like is accepted,
            including an (n, 1) column vector; it is flattened first.
        predictions: Array of shape (n_samples, n_classes) holding predicted
            class probabilities.
        classes_order: Class labels in the order of the columns of
            `predictions`.
        eps: Probabilities are clipped to [eps, 1] before taking the log so
            that a predicted probability of exactly 0 gives a large finite
            penalty instead of infinity.

    Returns:
        The average of -log(p_true_class) over all samples, as a float.

    Raises:
        ValueError: if `y_true` is empty.
        KeyError: if a label in `y_true` is not in `classes_order`.
    """
    # Flatten so that column-vector labels yield hashable scalars rather than
    # 1-element arrays when used as dictionary keys.
    labels = np.ravel(y_true)
    if labels.size == 0:
        raise ValueError("y_true must contain at least one label")

    column_of = {cls: idx for idx, cls in enumerate(classes_order)}
    columns = np.array([column_of[cls] for cls in labels], dtype=int)

    # Pick, for each row, the probability assigned to its true class.
    probs = np.asarray(predictions, dtype=float)[np.arange(labels.size), columns]
    return float(-np.mean(np.log(np.clip(probs, eps, 1.0))))

In [0]:
# Fresh 70/30 split for the logistic-regression experiment. `y` is stored as
# an (n, 1) column; the classifier and the custom scorer expect 1-D labels,
# so flatten it here.
x_train, x_test, y_train, y_test = train_test_split(X, y.ravel(), test_size = 0.3)

In [0]:
# Candidate inverse-regularisation strengths: 30 values log-spaced in [0.1, 1000].
grid={"C":np.logspace(-1,3, num = 30)}

logreg = LogisticRegression(solver='lbfgs',  multi_class='auto', max_iter=25000, n_jobs=-1)

# Column order of predict_proba is the sorted unique labels (what a fitted
# classifier exposes as `classes_`), so no throwaway model fit is needed.
# NOTE: this assumes every CV training fold contains every class; otherwise
# the fold's probability columns would not match this order.
classes_order = np.unique(y_train)
score_function = make_scorer(loglikelihood_score, greater_is_better=False, classes_order=classes_order, needs_proba=True)

logreg_cv = GridSearchCV(logreg,grid,cv=3, verbose=3, n_jobs=-1, scoring=score_function)

logreg_cv.fit(x_train, y_train)

print(logreg_cv.best_params_)
# best_score_ is the mean cross-validated score on the training split (the
# negated log-loss, because greater_is_better=False), not a held-out test score.
print('Best mean cross-validated score (negative log-loss) : ', logreg_cv.best_score_)