# Authors
- Eliot LECLAIR
- Alex POIRON
- Tom THIL
- Aurélien VISENTIN
# NPL (No Deep) - Lab2

In [1]:
# Installing datasets and nltk libraries
!pip install datasets nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
     ---------------------------------------- 1.5/1.5 MB 19.0 MB/s eta 0:00:00
Collecting regex>=2021.8.3
  Downloading regex-2022.9.13-cp37-cp37m-win_amd64.whl (267 kB)
     ------------------------------------- 268.0/268.0 kB 17.2 MB/s eta 0:00:00
Installing collected packages: regex, nltk
Successfully installed nltk-3.7 regex-2022.9.13


## Imports

In [38]:
import re
import string
import numpy as np
import random

#Import for typing
from typing import Callable, List, Tuple, Dict

# Imports from datasets
from datasets import load_dataset
from datasets import load_dataset_builder

# Imports from nltk
import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Imports from Sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# We need to download a package for word tokenization
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\Alex
[nltk_data]     POIRON\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Alex
[nltk_data]     POIRON\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## The Dataset
### Description

In [3]:
#Loading the data and it description
ds_builder = load_dataset_builder("imdb")

description = ds_builder.info.description
print(description)

Large Movie Review Dataset.
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well.


In [4]:
# Get the features with the number and names of classes
ds_builder.info.features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None)}

### Splits

In [5]:
ds_builder.info.splits

{'train': SplitInfo(name='train', num_bytes=33432823, num_examples=25000, dataset_name='imdb'),
 'test': SplitInfo(name='test', num_bytes=32650685, num_examples=25000, dataset_name='imdb'),
 'unsupervised': SplitInfo(name='unsupervised', num_bytes=67106794, num_examples=50000, dataset_name='imdb')}

There are 3 splits in this dataset :
- **train** : 25.000 entries
- **test** : 25.000 entries
- **unsupervised** : 50.000 entries 

### Classes representation

In [6]:
import datasets
def get_ratio(splited: datasets.dataset_dict.DatasetDict) -> None:
    """
    Print the number of negative and positive values that correspond
    and their ratio.
    Args:
        splited: Train or Test split of IMDB Dataset
    """
    
    classes = splited['label']
    lg = len(classes)
    neg = classes.count(0)
    pos = classes.count(1)
    ratio_neg = neg / lg * 100
    ratio_pos = pos / lg * 100

    print("Negative values :", neg)
    print("Positive values :", pos)
    print("We have", ratio_neg, "% of neg and", ratio_pos, "% of pos")
    print("----------")

In [7]:
train = load_dataset('imdb', split="train")
test = load_dataset('imdb', split="test")
get_ratio(train)
get_ratio(test)

Found cached dataset imdb (C:/Users/Alex POIRON/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)
Found cached dataset imdb (C:/Users/Alex POIRON/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


Negative values : 12500
Positive values : 12500
We have 50.0 % of neg and 50.0 % of pos
----------
Negative values : 12500
Positive values : 12500
We have 50.0 % of neg and 50.0 % of pos
----------


## Naive Bayes classifier
### Pre-processing

In [10]:
import datasets
def pre_processing(example: datasets.arrow_dataset.Example) -> datasets.arrow_dataset.Example:
    """
    Returns a dataset's element with the text changed (punctuation deleted and applied lowercase). This function is
    used in a map() to apply it on the whole Dataset we want.
    Args: 
        example: represents an element in the train or test dataset.
    Returns: 
        example: the element that we modified.
    """
    
    example['text'] = example['text'].translate(str.maketrans('', '', string.punctuation))        
    example['text'] = example['text'].lower()    
    return example

In [11]:
# Pretreatment on the train and test dataset
train_clean = train.map(pre_processing)
test_clean = test.map(pre_processing)

  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

### Multinomial Naive Bayes
We will use from Sklearn a Pipeline with a CountVectorizer and MultinomialNB classifier.

In [13]:
# We create the pipeline combining the countVectorizer and the
# Multinomial Naive Bayes
pipe = Pipeline(steps=[
    ('select', CountVectorizer()),
    ('clf', MultinomialNB())])

# Fitting with the train dataset that has been pre-processed.
pipe.fit(train_clean['text'], train_clean['label'])

Pipeline(steps=[('select', CountVectorizer()), ('clf', MultinomialNB())])

### Accuracy

In [14]:
random.seed(667)
import datasets
def get_accuracy(train : datasets.dataset_dict.DatasetDict,
                 test : datasets.dataset_dict.DatasetDict) -> list:
    """
    Get the accuracy score on both train and test dataset and print it and retrieve a list
    of wrong classified.
    Args:
        train: the train dataset
        test: the test dataset
    Returns: 
        wrong_classified : the array of element misclassified.
    """
    
    # Create predictions for both train and test datasets
    y_pred_test = pipe.predict(test['text'])
    y_pred_train = pipe.predict(train['text'])

    # Store true values
    y_true_test = test['label']
    y_true_train = train['label']

    # Retrive misclassification index exemple
    wrong_classified_index = [index for index, (index_true, index_pred) in enumerate(zip(y_true_test, y_pred_test)) if index_true != index_pred]

    wrong_classified = [(test['text'][index],test['label'][index]) for index in random.sample(wrong_classified_index,2)]

    # Compute the accuracy score using sklearn metrics module
    test_acc = accuracy_score(y_true_test, y_pred_test)
    train_acc = accuracy_score(y_true_train, y_pred_train)

    print("Train Accuracy : " + str(train_acc))
    print("Test Accuracy : " + str(test_acc))

    return wrong_classified

wrong_classified = get_accuracy(train_clean, test_clean)

Train Accuracy : 0.91284
Test Accuracy : 0.8172


### Wrongly classified

In [16]:
wrong_classified

[('yep edward g gives us a retro view of the criminal defense world first hes an overzealous prosecutor who sends the wrong man to the chair played passionately albeit briefly by deforrest kelly then hes so filled with remorse his only solace is the bottle throw in a jaded romance a genuinely rapid descent into penury and no qualms about who he defends and next thing you know  shazam black leg lawyer god i love that phrase he sees the light just in time to save his jaded beloved from the chair yawnbr br but really the courtroom action is pure melodrama see him punch out a witness see him drink poison see him argue passionately as he clutches a bullet hole in his breast be prepared for melodramabr br the hoot of the film though is jayne russell with curves defying the laws of gravity and an iq approached absolute zero she is something to see even sings a bit',
  0),
 ('king vladislav angus scrimm of romania is a vampire but a vampire of light who wants nothing more than to live in peace

In the first example, the user uses sarcasm to review the film, which is difficult to interpret for the model. Moreover, he also gives some positive reviews which can distort the result a bit.

The second one is more complicated to interpret, its construction is more complex and does not only consist in giving its opinion but also its feeling and its expectations before seeing the film, which the model has difficulty in perceiving.

### Bonus Question : Top 10 most important words

In [13]:
# We retrieve log probability of each word for negative and positive class and sort them

neg_class_prob_sorted = pipe.named_steps['clf'].feature_log_prob_[0, :].argsort()[::-1]
pos_class_prob_sorted = pipe.named_steps['clf'].feature_log_prob_[1, :].argsort()[::-1]

# We associated word to it's feature

top_neg_word = np.take(pipe.named_steps['select'].get_feature_names_out(), neg_class_prob_sorted)
top_pos_word = np.take(pipe.named_steps['select'].get_feature_names_out(), pos_class_prob_sorted)

print("Top most important word for negative review (no filter):")
print(top_neg_word[:10])
print("Top most important word for positive review (no filter):")
print(top_pos_word[:10])

# We use nltk stopwords to remove them from our ranking
stopWords = set(stopwords.words('english'))
# We add 'br' which is a tag for html 
stopWords.add('br')

top_neg_word_filtered = [word for word in top_neg_word if word not in stopWords]
top_pos_word_filtered = [word for word in top_pos_word if word not in stopWords]

print("Top most important word for negative review (filtered):")
print(top_neg_word_filtered[:10])
print("Top most important word for positive review (filtered):")
print(top_pos_word_filtered[:10])

Top most important word for negative review (no filter):
['the' 'and' 'of' 'to' 'is' 'in' 'this' 'it' 'that' 'br']
Top most important word for positive review (no filter):
['the' 'and' 'of' 'to' 'is' 'in' 'it' 'this' 'that' 'br']
Top most important word for negative review (filtered):
['movie', 'film', 'one', 'like', 'even', 'good', 'bad', 'would', 'really', 'time']
Top most important word for positive review (filtered):
['film', 'movie', 'one', 'like', 'good', 'story', 'great', 'time', 'see', 'well']


## Stemming
### Add stemming

In [17]:
import datasets

def add_stemming(example: datasets.arrow_dataset.Example) -> datasets.arrow_dataset.Example:
    """
    Returns a dataset's element with the text changed (using stemming this time). This function is
    used in a map() to apply it on the whole Dataset we want.
    Args: 
        example: represents an element in the train or test dataset.
    Returns:
        example: the element that we modified.
    """
    
    re_word = re.compile(r"^\w+$")
    stemmer = SnowballStemmer("english")

    stemmed = [stemmer.stem(word) for word in word_tokenize(example['text'].lower()) if re_word.match(word)]
    example['text'] = " ".join(stemmed)

    return example

In [18]:
train_stemming = train.map(add_stemming)
test_stemming = test.map(add_stemming)

  0%|          | 0/25000 [00:00<?, ?ex/s]

  0%|          | 0/25000 [00:00<?, ?ex/s]

### Multinomial Bayes with Stemming

In [19]:
# Fitting the Pipeline with the new pretreatment on the train dataset
pipe.fit(train_stemming['text'], train_stemming['label'])

# Getting the new accuracy
get_accuracy(train_stemming, test_stemming)
print("\nOld accuracy : \n\tTrain Accuracy : 0.91284 \n\tTest Accuracy : 0.8172")

Train Accuracy : 0.8828
Test Accuracy : 0.80516

Old accuracy : 
	Train Accuracy : 0.91284 
	Test Accuracy : 0.8172


# NLP (No Deep) - Lab 3

In [20]:
%%bash

git clone https://github.com/facebookresearch/fastText.git
cd fastText
pip install .

Processing /mnt/c/Users/Alex POIRON/Desktop/fastText
Collecting pybind11>=2.2
  Using cached pybind11-2.10.0-py3-none-any.whl (213 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py): started
  Building wheel for fasttext (setup.py): finished with status 'done'
  Created wheel for fasttext: filename=fasttext-0.9.2-cp39-cp39-linux_x86_64.whl size=4309447 sha256=fcfadd01b9a15061376e09148d0ff6c8f4fe5d64c4e07c388b08efc810e14dde
  Stored in directory: /tmp/pip-ephem-wheel-cache-c66srbi2/wheels/f4/cc/e0/f9c427538c6dda1b634adf4bac23297916cbe58d192280f03d
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.10.0


Cloning into 'fastText'...
Updating files: 100% (526/526), done.


## FastText
### 1) Datasets conversion and pretreatment

In [23]:
import datasets

def datasets_conversion(dataset: datasets.arrow_dataset.Dataset)-> list:
    """
    Returns a string's list where each element is in the good format for FastText.
    Args: 
        dataset: the dataset we want to convert.
    Returns: 
        dataset_to_string : A list containing the whole dataset in string elements.
    """
    dataset_to_string = []
    
    for element in dataset :
        if element['label']:
            label = "positive"
        else:
            label = "negative"
        
        new_line = "__label__" + label + " " + element['text']
        dataset_to_string.append(new_line)
    
    return dataset_to_string

def create_file(train: list, test: list)-> None:
    """
    Create two files that correspond to the allowed format for FastText for both train and test datasets.
    Args: 
        train: represents the string's list of the train dataset.
        test: represents the string's list of the test dataset.
    """
    train_text_file = open("train_to_file", "w",  encoding="utf-8")
    test_text_file = open("test_to_file", "w",  encoding="utf-8")
    for line in train:
        train_text_file.write(line + "\n")

    for line in test:
        test_text_file.write(line + "\n")

    train_text_file.close()
    test_text_file.close()
    
#Convert the two datasets that has been pretreated (lower case and punctuation deleted)
train_to_string = datasets_conversion(train_clean)
test_to_string = datasets_conversion(test_clean)

#Create files
create_file(train_to_string, test_to_string)

In [25]:
#Little check to see if the convertion is correct
check = open("train_to_file", "r",  encoding="utf-8")
lines = check.readlines()
for i in range(5):
    print(lines[i])
check.close()

__label__negative i rented i am curiousyellow from my video store because of all the controversy that surrounded it when it was first released in 1967 i also heard that at first it was seized by us customs if it ever tried to enter this country therefore being a fan of films considered controversial i really had to see this for myselfbr br the plot is centered around a young swedish drama student named lena who wants to learn everything she can about life in particular she wants to focus her attentions to making some sort of documentary on what the average swede thought about certain political issues such as the vietnam war and race issues in the united states in between asking politicians and ordinary denizens of stockholm about their opinions on politics she has sex with her drama teacher classmates and married menbr br what kills me about i am curiousyellow is that 40 years ago this was considered pornographic really the sex and nudity scenes are few and far between even then its no

### 2) Default model Classifier

In [26]:
import fasttext

default_model = fasttext.train_supervised(input="train_to_file")
default_model.test("test_to_file")

(25000, 0.87804, 0.87804)

### 3) Hyperparameters search

In [27]:
from sklearn.model_selection import train_test_split

#Split the train dataset in 2 dictionnaries : train and validation
train_splited, validation_splited = train_test_split(train_clean, shuffle=True)

In [28]:
def dict_convertion(dict: Dict)-> list:
    """
    Returns a string's list where each element is in the good format for FastText. Same function as dataset_convertion
    but to convert instead a dictionnary.
    Args: 
        dict: the dictionnary we want to convert.
    Returns:
        dict_to_string: A list containing the whole dataset in string elements.
    """
    dict_to_string = []
    
    for i in range(len(dict['text'])):
        if dict['label'][i]:
            label = "positive"
        else:
            label = "negative"
        
        new_line = "__label__" + label + " " + dict['text'][i]
        dict_to_string.append(new_line)
    
    return dict_to_string

In [29]:
#Create new files for the splited train and validation dictionnaries both from the original train dataset.

#Here, test_to_file contains in fact the validation dataset convert in the string format.
create_file(dict_convertion(train_splited), dict_convertion(validation_splited))

In [30]:
#Model with validation dataset and hyperparameters search
hyperparameter_model = fasttext.train_supervised(input='train_to_file', autotuneValidationFile='test_to_file')

In [31]:
#We have to again erase our test_to_file to put inside the original test dataset
create_file(datasets_conversion(train_clean),datasets_conversion(test_clean))

#Get precision on this second model with the hyperparameter search
hyperparameter_model.test("test_to_file")

(25000, 0.89468, 0.89468)

### 4) Differences
Major differences between the **default** model and the **hyperparameters** model
- **EPOCHS** : 5 vs 100
- **Learning Rate** : 0.1 vs 0.04

### 5) Wrongly classified

In [35]:
random.seed(667)

def get_wrongly(test_clean: datasets.arrow_dataset.Dataset) -> list:
    """
    Get an array containing 2 elements misclassified choose randomly.
    Args:
        train: the train dataset
        test: the test dataset
    Returns: 
        wrong_classified : an array containing 2 elements misclassified choose randomly.
    """
    
    # Create predictions for both train and test datasets
    y_pred_test = hyperparameter_model.predict('test_to_file')

    # Store true values
    y_true_test = test_clean['label']

    # Retrive misclassification index exemple
    wrong_classified_index = [index for index, (index_true, index_pred) in enumerate(zip(y_true_test, y_pred_test)) if index_true != index_pred]

    wrong_classified = [(test_clean['text'][index],test_clean['label'][index]) for index in random.sample(wrong_classified_index,2)]
    return wrong_classified

wrong_classified = get_wrongly(test_clean)

In [36]:
wrong_classified

[('i love scifi and am willing to put up with a lot scifi moviestv are usually underfunded underappreciated and misunderstood i tried to like this i really did but it is to good tv scifi as babylon 5 is to star trek the original silly prosthetics cheap cardboard sets stilted dialogues cg that doesnt match the background and painfully onedimensional characters cannot be overcome with a scifi setting im sure there are those of you out there who think babylon 5 is good scifi tv its not its clichéd and uninspiring while us viewers might like emotion and character development scifi is a genre that does not take itself seriously cf star trek it may treat important issues yet not as a serious philosophy its really difficult to care about the characters here as they are not simply foolish just missing a spark of life their actions and reactions are wooden and predictable often painful to watch the makers of earth know its rubbish as they have to always say gene roddenberrys earth otherwise peo

> TODO: Bonus question 