# Document classification

**imports**

In [2]:
import joblib
import nltk
import pandas as pd
import re
import numpy as np
import time
import os

from datasets import load_dataset
from tqdm import tqdm
import json
import csv

from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin

import multiprocessing
from joblib import Parallel, delayed
import joblib
from collections import Counter
from scipy.sparse import csr_matrix

import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import re
import codecs

from fruitFlyVectorizer import FruitFlyVectorizer
from model import KCnetwork

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**utils**

In [3]:
# Stop-word set and detokenizer used by clean(); built once at module level.
english_stopwords=set(stopwords.words("english"))
detokenizer=TreebankWordDetokenizer()

def clean(x):
    """Normalise a raw text and count the tokens that remain.

    The text is lower-cased, every character that is neither a space nor a
    word character is replaced by a space, digit runs are removed or replaced
    by a ``<NUM>`` marker, and English stop words are dropped after
    tokenisation.

    Args:
        x: Raw input string.

    Returns:
        A ``(clean_x, length)`` tuple: the detokenised cleaned text and the
        number of tokens kept after stop-word removal.
    """
    # Raw strings keep the regex escapes (\w, \s, \d) from being parsed as
    # invalid Python string escapes.
    x = x.lower()
    x = re.sub(r"[^ \w]", " ", x)
    # Drop digit runs that open the text or sit between whitespace ...
    x = re.sub(r"(\s\d+\s|^\d+\s)", " ", x)
    # ... and mark any digit run that is still preceded by a space.
    x = re.sub(r" \d+", " <NUM> ", x)
    x = re.sub("  ", " ", x)
    words = [w for w in word_tokenize(x) if w not in english_stopwords]
    return detokenizer.detokenize(words), len(words)

### TextCNN by Yoon Kim: https://arxiv.org/abs/1408.5882

In [5]:
class CNN1d(torch.nn.Module):
    """1-D convolutional text classifier in the style of TextCNN.

    One ``Conv1d`` branch is built per entry of ``filter_sizes``. Each branch
    is ReLU-activated and max-pooled over the sequence axis; the pooled
    features are concatenated, passed through dropout and projected by a
    linear layer.

    Args:
        embedding_dim: Size of each input embedding vector.
        n_filters: Number of output channels of every convolution.
        filter_sizes: Kernel sizes, one convolution branch per entry.
        output_dim: Output size of the final linear layer.
        dropout: Dropout probability applied to the concatenated features.
    """

    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim,
                 dropout):
        super().__init__()

        filter_sizes = list(filter_sizes)
        self._n_convs = len(filter_sizes)
        # Shortest sequence every convolution can process.
        self._min_len = max(filter_sizes)

        # Branches are registered as conv_0, conv_1, ... so the attribute and
        # state-dict names of the three-branch layout are preserved.
        for i, size in enumerate(filter_sizes):
            setattr(self, "conv_{}".format(i),
                    torch.nn.Conv1d(in_channels=embedding_dim,
                                    out_channels=n_filters,
                                    kernel_size=size))

        self.fc = torch.nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, embedded):
        """Classify a batch of embedded sequences.

        Args:
            embedded: Tensor of shape ``(batch_size, len, embedding_dim)``.

        Returns:
            Tensor of shape ``(batch_size, output_dim)``.
        """
        # Follow the device the model lives on instead of a module-level global.
        embedded = embedded.to(self.fc.weight.device)
        # (batch_size, len, emb_dim) -> (batch_size, emb_dim, len)
        embedded = embedded.permute(0, 2, 1)

        # Zero-pad sequences shorter than the largest kernel; Conv1d would
        # otherwise raise on them.
        missing = self._min_len - embedded.shape[2]
        if missing > 0:
            embedded = F.pad(embedded, (0, missing))

        pooled = []
        for i in range(self._n_convs):
            conv = getattr(self, "conv_{}".format(i))
            # (batch_size, n_filters, len - kernel_size + 1)
            conved = F.relu(conv(embedded))
            # (batch_size, n_filters)
            pooled.append(F.max_pool1d(conved, conved.shape[2]).squeeze(2))

        # (batch_size, n_filters * len(filter_sizes))
        cat = self.dropout(torch.cat(pooled, dim=1))

        return self.fc(cat)

In [39]:
def l_norm(named_parameters, lambda_norm, dim):
    """Compute a scaled norm penalty over the weight parameters.

    Args:
        named_parameters: Iterable of ``(name, parameter)`` pairs, e.g.
            ``model.named_parameters()``.
        lambda_norm: Multiplicative coefficient applied to the summed norms.
        dim: Order of the norm, forwarded as second argument to
            ``torch.norm`` (e.g. ``2``).

    Returns:
        A 0-dim tensor equal to ``lambda_norm`` times the sum of the norms of
        every parameter whose name contains ``'weight'``; zero if there is
        none.
    """
    # Accumulate from the parameters themselves so the result follows their
    # device instead of depending on a module-level device global.
    total = None
    for name, param in named_parameters:
        if 'weight' in name:
            norm = torch.norm(param, dim)
            total = norm if total is None else total + norm

    if total is None:
        return lambda_norm * torch.tensor(0.)
    return lambda_norm * total

In [40]:
def train(model, train_loader, optimizer, criterion, lambda_norm=1e-3):
    """Train ``model`` for one epoch on a multi-class classification task.

    Each batch is embedded with the module-level ``kcmodel.Hash(x_i, k)``
    before being fed to ``model``; ``kcmodel`` and ``k`` must therefore be
    defined before this function is called.

    Args:
        model: Network to train.
        train_loader: Iterable yielding ``(x_train, y_train)`` batches.
        optimizer: Optimizer stepping the parameters of ``model``.
        criterion: Loss applied to ``(predictions, y_train)``.
        lambda_norm: Coefficient of the norm penalty (order 2) added to the
            loss through ``l_norm``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for x_train, y_train in tqdm(train_loader):

        optimizer.zero_grad()
        X_embedding = torch.cat([kcmodel.Hash(x_i, k).unsqueeze(0) for x_i in x_train], dim=0)
        predictions = model(X_embedding).squeeze(1)
        loss = criterion(predictions, y_train)
        loss += l_norm(model.named_parameters(), lambda_norm, dim=2)

        loss.backward()

        optimizer.step()

        # Predicted class = index of the largest output.
        predictions = torch.argmax(predictions, dim=1)
        acc = accuracy_score(y_train.cpu(), predictions.cpu())

        epoch_loss += loss.item()
        epoch_acc += acc

    return epoch_loss / len(train_loader), epoch_acc / len(train_loader)

def train_binary(model, train_loader, optimizer, criterion):
    """Train ``model`` for one epoch on a binary classification task.

    Each batch is embedded with the module-level ``kcmodel.Hash(x_i, k)``
    before being fed to ``model``; ``kcmodel`` and ``k`` must therefore be
    defined before this function is called.

    Args:
        model: Network to train; its output is squeezed on dimension 1.
        train_loader: Iterable yielding ``(x_train, y_train)`` batches.
        optimizer: Optimizer stepping the parameters of ``model``.
        criterion: Loss applied to ``(predictions, y_train)`` with the labels
            cast to ``float32``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches, both as
        Python floats.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for x_train, y_train in tqdm(train_loader):

        y_train = y_train.to(torch.float32)

        optimizer.zero_grad()
        X_embedding = torch.cat([kcmodel.Hash(x_i, k).unsqueeze(0) for x_i in x_train], dim=0)
        predictions = model(X_embedding).squeeze(1)

        loss = criterion(predictions, y_train)

        loss.backward()

        optimizer.step()

        # Threshold the sigmoid of the outputs at 0.5 to get hard labels.
        predictions = torch.round(torch.sigmoid(predictions))
        correct = (predictions == y_train).float()
        acc = correct.sum() / len(correct)

        epoch_loss += loss.item()
        # .item() keeps the running accuracy a plain float, like the loss,
        # instead of a tensor living on the compute device.
        epoch_acc += acc.item()

    return epoch_loss / len(train_loader), epoch_acc / len(train_loader)

In [41]:
def evaluate(model, test_loader, criterion):
    """Evaluate ``model`` on a multi-class test set.

    Batches are embedded with the module-level ``kcmodel.Hash(x_i, k)``;
    gradients are disabled for the whole pass.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches.
    """
    model.eval()

    total_loss, total_acc = 0, 0

    with torch.no_grad():
        for inputs, targets in tqdm(test_loader):
            hashed = [kcmodel.Hash(sample, k).unsqueeze(0) for sample in inputs]
            logits = model(torch.cat(hashed, dim=0)).squeeze(1)
            batch_loss = criterion(logits, targets)

            # Predicted class = index of the largest output.
            predicted = torch.argmax(logits, dim=1)

            total_loss += batch_loss.item()
            total_acc += accuracy_score(targets.cpu(), predicted.cpu())

    n_batches = len(test_loader)
    return total_loss / n_batches, total_acc / n_batches

def evaluate_binary(model, test_loader, criterion):
    """Evaluate ``model`` on a binary classification test set.

    Batches are embedded with the module-level ``kcmodel.Hash(x_i, k)``;
    gradients are disabled for the whole pass.

    Args:
        model: Network to evaluate; its output is squeezed on dimension 1.
        test_loader: Iterable yielding ``(x_test, y_test)`` batches.
        criterion: Loss applied to ``(predictions, y_test)`` with the labels
            cast to ``float32``.

    Returns:
        ``(mean_loss, mean_accuracy)`` averaged over the batches, both as
        Python floats.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():

        for x_test, y_test in tqdm(test_loader):

            y_test = y_test.to(torch.float32)
            X_embedding = torch.cat([kcmodel.Hash(x_i, k).unsqueeze(0) for x_i in x_test], dim=0)
            predictions = model(X_embedding).squeeze(1)
            loss = criterion(predictions, y_test)

            # Threshold the sigmoid of the outputs at 0.5 to get hard labels.
            predictions = torch.round(torch.sigmoid(predictions))
            correct = (predictions == y_test).float()
            acc = correct.sum() / len(correct)

            epoch_loss += loss.item()
            # Accumulate a plain float so the returned accuracy has the same
            # type as the loss.
            epoch_acc += acc.item()

    return epoch_loss / len(test_loader), epoch_acc / len(test_loader)

**Load the FruitFlyVectorizer**

In [44]:
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load vectorizer files that come from a trusted source.
vectorizer= joblib.load("./fruitFlyVectorizer_window=10.pkl")

In [45]:
# Settings derived from the loaded weights and vectorizer.
# NOTE(review): W is only assigned further down in this export; the cell
# numbers (In [42] before In [45]) suggest it was loaded first when the
# notebook was run. Confirm before executing the file top to bottom.
dim_hidden = W.shape[1]
vocab_size = vectorizer.max_words
freq_words = torch.Tensor(list(vectorizer.freq_dictionnary.values()))
vect_dictionnary= vectorizer.vect_dictionnary
# Second argument passed to kcmodel.Hash in the train/evaluate functions.
k = 30

**Load the weights of the word-embedding model**

In [42]:
# Load the embedding weights straight onto the active device, so a checkpoint
# saved from a GPU session can still be opened on a CPU-only host.
W = torch.load("./weights_10_5000.pt", map_location=device)

In [43]:
# Display the shape of the loaded weight tensor.
W.shape

torch.Size([40000, 400])

In [46]:
# Build the KCnetwork whose Hash method embeds the inputs in train/evaluate.
kcmodel = KCnetwork(dim_hidden, vocab_size, freq_words)

In [47]:
# Replace the network's weights with the loaded tensor, then move the
# network to the selected device.
kcmodel.W = W
kcmodel = kcmodel.to(device)

**utils**

In [49]:
def encode_word_in_sentence(sentence,pos_target, vect_dictionnary):
    """Encode one word of a sentence together with its context.

    The sentence is tokenised; the token at ``pos_target`` is the target and
    all remaining tokens form the context. Both are turned into multi-hot
    vectors over ``vect_dictionnary`` (tokens absent from it are ignored).

    Returns:
        A float tensor of length ``2 * len(vect_dictionnary)``: the context
        vector followed by the target vector.
    """
    tokens = word_tokenize(sentence)
    vocab_len = len(vect_dictionnary)

    def multi_hot(selected):
        # Mark the dictionary index of every known token.
        mask = np.zeros(vocab_len, dtype=bool)
        for token in selected:
            if token in vect_dictionnary:
                mask[vect_dictionnary[token]] = True
        return mask

    target_mask = multi_hot(tokens[pos_target:pos_target + 1])
    context_mask = multi_hot(tokens[:pos_target] + tokens[pos_target + 1:])

    return torch.Tensor(np.hstack([context_mask, target_mask]))

In [50]:
def collate(batch):
    """Turn a batch of ``(text, label)`` pairs into padded model inputs.

    Every text is cleaned with ``clean``; each of its token positions is
    encoded with ``encode_word_in_sentence``, and shorter texts are padded
    with all-zero rows up to the longest text of the batch.

    Args:
        batch: Sequence of ``(text, label)`` pairs.

    Returns:
        ``(x, y)`` on ``device``: ``x`` of shape
        ``(batch_size, max_len, 2 * vectorizer.max_words)`` and ``y`` the
        labels as a long tensor.
    """
    cleaned = [clean(sample[0]) for sample in batch]

    # torch.cat cannot concatenate an empty list, so keep at least one
    # (all-zero) position even when every text cleans down to zero tokens.
    max_len = max(1, max(length for _, length in cleaned))
    width = 2 * vectorizer.max_words

    x_list = []
    for clean_sent, len_sent in cleaned:
        rows = [encode_word_in_sentence(clean_sent, i, vectorizer.vect_dictionnary).unsqueeze(0)
                for i in range(len_sent)]
        if len_sent < max_len:
            # One zero block for all the padding positions.
            rows.append(torch.zeros(max_len - len_sent, width))
        x_list.append(torch.cat(rows, dim=0).unsqueeze(0))

    y_list = [sample[1] for sample in batch]
    # tensor of shape batch_size * max_len * (2 x voc_size)
    return torch.cat(x_list, dim=0).to(device), torch.Tensor(y_list).long().to(device)

### 20newsgroup dataset

In [16]:
class newsgroupDATASET(Dataset):
    """Dataset pairing newsgroup posts with their labels."""

    def __init__(self, news, labels):
        self.news = news
        self.labels = labels

    def __getitem__(self, index):
        """Return the ``(example, label)`` pair at ``index``."""
        return self.news[index], self.labels[index]

    def __len__(self):
        """Return the size of the dataset."""
        return len(self.labels)

In [17]:
from sklearn.datasets import fetch_20newsgroups

# Load the train and test subsets from the local cache only
# (download_if_missing=False), with headers, footers and quotes removed.
newsgroup_train_X, newsgroup_train_y = fetch_20newsgroups(subset="train", download_if_missing=False, 
                                                          return_X_y=True, shuffle=True,
                                                          random_state=1, remove=('headers', 'footers', 'quotes'))
newsgroup_test_X, newsgroup_test_y = fetch_20newsgroups(subset="test", download_if_missing=False, 
                                                        return_X_y=True, shuffle=True,
                                                        random_state=1, remove=('headers', 'footers', 'quotes'))

In [18]:
# Batches are built by collate(); only the training loader is shuffled.
train_loader = DataLoader(newsgroupDATASET(newsgroup_train_X, newsgroup_train_y), collate_fn=collate, batch_size=8, shuffle=True)
test_loader = DataLoader(newsgroupDATASET(newsgroup_test_X, newsgroup_test_y), collate_fn=collate, batch_size=8, shuffle=False)

In [20]:
# CNN hyper-parameters for the 20newsgroup experiment.
embedding_dim = W.shape[1]
n_filters = 100
filter_sizes = [3,4,5]
output_dim = 20
dropout = 0.5

model = CNN1d(embedding_dim, n_filters, filter_sizes, output_dim, dropout)

In [21]:
# Adam with its default settings.
optimizer = torch.optim.Adam(model.parameters())

# Multi-class loss applied to the raw model outputs.
criterion = torch.nn.CrossEntropyLoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Training loop: one train pass and one test pass per epoch, with the
# averaged loss and the accuracy (as a percentage) printed after each epoch.
iterations = 10

for epoch in range(iterations):

    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    
    print("Epoch: ", (epoch+1)) 
    print("Train Loss", train_loss, "Train accuracy", train_acc*100)
    print("Test Loss", test_loss, "Test accuracy", test_acc*100)

### WOS-11967

In [88]:
class WOSDATASET(Dataset):
    """Dataset pairing WOS documents with their labels."""

    def __init__(self, documents, labels):
        self.documents = documents
        self.labels = labels

    def __getitem__(self, index):
        """Return the ``(example, label)`` pair at ``index``."""
        return self.documents[index], self.labels[index]

    def __len__(self):
        """Return the size of the dataset."""
        return len(self.labels)

In [89]:
def _generate_examples(input_file, label_file, label_level_1_file, label_level_2_file):
        """Yields examples."""
        with open(input_file, encoding="utf-8") as f:
            input_data = f.readlines()
        with open(label_file, encoding="utf-8") as f:
            label_data = f.readlines()
        with open(label_level_1_file, encoding="utf-8") as f:
            label_level_1_data = f.readlines()
        with open(label_level_2_file, encoding="utf-8") as f:
            label_level_2_data = f.readlines()
        for i in range(len(input_data)):
            yield i, {
                "input_data": input_data[i],
                "label": label_data[i],
                "label_level_1": label_level_1_data[i],
                "label_level_2": label_level_2_data[i],
            }
            
def _read_data(input_file, label_file, label_level_1_file, label_level_2_file):
    with open(input_file, encoding="utf-8") as f:
        input_data = f.readlines()
    with open(label_file, encoding="utf-8") as f:
        label_data = f.readlines()
        label_data = list(map(lambda s: int(s.strip()), label_data))
    with open(label_level_1_file, encoding="utf-8") as f:
        label_level_1_data = f.readlines()
    with open(label_level_2_file, encoding="utf-8") as f:
        label_level_2_data = f.readlines()
    return input_data, label_data, label_level_1_data, label_level_2_data

In [90]:
# Locations of the WOS11967 files: documents (X), labels (Y) and the two
# label-level files (YL1, YL2).
dir_path = "./datasets/document classification/datasets/WOS11967/"
input_file = dir_path + "X.txt"
label_file = dir_path + "Y.txt"
label_level_1_file = dir_path + "YL1.txt"
label_level_2_file = dir_path + "YL2.txt"

In [91]:
# Creates the example generator only; nothing is read until it is iterated.
# NOTE(review): `examples` is not consumed anywhere in the visible code.
examples = _generate_examples(input_file, label_file, label_level_1_file, label_level_2_file)

In [92]:
# Read documents, integer labels and the two raw label-level columns.
input_data, label_data, label_level_1_data, label_level_2_data = _read_data(input_file, label_file, label_level_1_file, label_level_2_file)

In [93]:
# Stratified 80/20 split; the fixed random_state makes the split, and so the
# reported test metrics, reproducible from one run to the next.
input_data_train, input_data_test, label_data_train, label_data_test = train_test_split(input_data,
                                                                                        label_data,
                                                                                        test_size=0.2,
                                                                                        stratify=label_data,
                                                                                        random_state=1)

In [94]:
# Batches are built by collate(). Only the training loader is shuffled, so
# the test set is always evaluated in the same order.
train_loader = DataLoader(WOSDATASET(input_data_train, label_data_train), collate_fn=collate, batch_size=32, shuffle=True)
test_loader = DataLoader(WOSDATASET(input_data_test, label_data_test), collate_fn=collate, batch_size=32, shuffle=False)

In [95]:
# CNN hyper-parameters for the WOS-11967 experiment.
embedding_dim = W.shape[1]
n_filters = 100
filter_sizes = [3,4,5]
output_dim = 35
dropout = 0.5

model = CNN1d(embedding_dim, n_filters, filter_sizes, output_dim, dropout)

In [96]:
# Adam with its default settings.
optimizer = torch.optim.Adam(model.parameters())

# Multi-class loss applied to the raw model outputs.
criterion = torch.nn.CrossEntropyLoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Training loop: one train pass and one test pass per epoch, with the
# averaged loss and the accuracy (as a percentage) printed after each epoch.
iterations = 10

for epoch in range(iterations):

    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    
    print("Epoch: ", (epoch+1)) 
    print("Train Loss", train_loss, "Train accuracy", train_acc*100)
    print("Test Loss", test_loss, "Test accuracy", test_acc*100)

### TREC-6 dataset

In [80]:
class TREC6DATASET(Dataset):
    """Dataset pairing TREC-6 questions with their labels."""

    def __init__(self, questions, labels):
        self.questions = questions
        self.labels = labels

    def __getitem__(self, index):
        """Return the ``(example, label)`` pair at ``index``."""
        return self.questions[index], self.labels[index]

    def __len__(self):
        """Return the size of the dataset."""
        return len(self.labels)

In [81]:
# Locations of the TREC-6 train and test files.
train_file = "./datasets/document classification/datasets/TREC-6/train.txt"
test_file = "./datasets/document classification/datasets/TREC-6/test.txt"

In [82]:
def convert_data(data_name):
    """Read a labelled text file into parallel text and label lists.

    Every non-blank line must start with a single-digit integer label; the
    text is what remains of the cleaned line after its first two characters
    (the label and the character following it).

    Args:
        data_name: Path of the file, decoded as latin-1.

    Returns:
        ``(features, lbl)``: the cleaned texts and their integer labels.

    Raises:
        ValueError: If a non-blank line does not start with a digit.
    """
    features = []
    lbl = []
    with codecs.open(data_name, 'r', encoding="latin-1") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no example.
                continue
            # Read the label from the stripped line so that leading
            # whitespace cannot hide it.
            lbl.append(int(line[0]))
            features.append(clean_str(line)[2:])
    return features, lbl



def clean_str(string):
    """Tokenise-by-spacing cleanup of a sentence.

    Characters outside letters, digits and ``(),!?'` `` become spaces;
    English contractions and the punctuation marks ``, ! ( ) ?`` are
    separated by spaces; runs of whitespace are collapsed and the result is
    stripped.
    """
    substitutions = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " \'s"),
        (r"\'ve", " \'ve"),
        (r"n\'t", " n\'t"),
        (r"\'re", " \'re"),
        (r"\'d", " \'d"),
        (r"\'ll", " \'ll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    # The rules are applied strictly in the order listed above.
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip()


In [83]:
# Parse the TREC-6 train and test files into (texts, labels) lists.
train_input, train_output = convert_data(train_file)
test_input, test_output = convert_data(test_file)

In [84]:
# Batches are built by collate(). Only the training loader is shuffled, so
# the test set is always evaluated in the same order.
train_loader = DataLoader(TREC6DATASET(train_input, train_output), collate_fn=collate, batch_size=32, shuffle=True)
test_loader = DataLoader(TREC6DATASET(test_input, test_output), collate_fn=collate, batch_size=32, shuffle=False)

In [85]:
# CNN hyper-parameters for the TREC-6 experiment.
embedding_dim = W.shape[1]
n_filters = 100
filter_sizes = [3,4,5]
output_dim = 6
dropout = 0.5

model = CNN1d(embedding_dim, n_filters, filter_sizes, output_dim, dropout)

In [86]:
# Adam with its default settings.
optimizer = torch.optim.Adam(model.parameters())

# Multi-class loss applied to the raw model outputs.
criterion = torch.nn.CrossEntropyLoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Training loop: one train pass and one test pass per epoch, with the
# averaged loss and the accuracy (as a percentage) printed after each epoch.
iterations = 10

for epoch in range(iterations):

    train_loss, train_acc = train(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate(model, test_loader, criterion)
    
    print("Epoch: ", (epoch+1)) 
    print("Train Loss", train_loss, "Train accuracy", train_acc*100)
    print("Test Loss", test_loss, "Test accuracy", test_acc*100)

### SST dataset

In [70]:
class SSTDATASET(Dataset):
    """Dataset pairing SST reviews with their labels."""

    def __init__(self, reviews, labels):
        self.reviews = reviews
        self.labels = labels

    def __getitem__(self, index):
        """Return the ``(example, label)`` pair at ``index``."""
        return self.reviews[index], self.labels[index]

    def __len__(self):
        """Return the size of the dataset."""
        return len(self.labels)

In [71]:
# Locations of the SST train and test files.
train_file = "./datasets/document classification/datasets/SST/train.txt"
test_file = "./datasets/document classification/datasets/SST/test.txt"

In [72]:
# Parse the SST train and test files into (texts, labels) lists.
train_input, train_output = convert_data(train_file)
test_input, test_output = convert_data(test_file)

In [73]:
# Batches are built by collate(). Only the training loader is shuffled, so
# the test set is always evaluated in the same order.
train_loader = DataLoader(SSTDATASET(train_input, train_output), collate_fn=collate, batch_size=32, shuffle=True)
test_loader = DataLoader(SSTDATASET(test_input, test_output), collate_fn=collate, batch_size=32, shuffle=False)

In [75]:
# CNN hyper-parameters for the SST experiment; a single output is used
# together with the binary loss defined below.
embedding_dim = W.shape[1]
n_filters = 100
filter_sizes = [3,4,5]
output_dim = 1
dropout = 0.5

model = CNN1d(embedding_dim, n_filters, filter_sizes, output_dim, dropout)

In [76]:
# Adam with its default settings.
optimizer = torch.optim.Adam(model.parameters())

# Binary loss that takes raw (pre-sigmoid) outputs.
criterion = torch.nn.BCEWithLogitsLoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Training loop for the binary SST task: one train pass and one test pass
# per epoch, printing the averaged loss and the accuracy after each epoch.
iterations = 10

for epoch in range(iterations):

    train_loss, train_acc = train_binary(model, train_loader, optimizer, criterion)
    test_loss, test_acc = evaluate_binary(model, test_loader, criterion)

    print("Epoch: ", (epoch+1))
    # Report the measured accuracy as a plain percentage, with no constant
    # offset added to it.
    print("Train Loss", train_loss, "Train accuracy", train_acc*100)
    print("Test Loss", test_loss, "Test accuracy", test_acc*100)