In [6]:
# Pour installer jamspell
!sudo apt-get install swig3.0 -y
!pip install jamspell fasttext

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
swig3.0 is already the newest version (3.0.12-2.2ubuntu1).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [24]:
import pandas as pd
import numpy as np
import re
import spacy
from textblob import TextBlob

import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

import jamspell

from transformers import AutoTokenizer, AutoModel
import torch

import csv, fasttext

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [8]:
df=pd.read_csv('full_train.csv', index_col=0)

# Résultat Naïf

In [9]:
def process_csv(df, output_name, output_folder):
    df.loc[:, 'text'] = '__label__' + df['category'].astype(str) + ' ' + df['review']
    output_file = f'{output_folder}/{output_name}.txt'
    df[['text']].to_csv(output_file, index=False, header=False,
                quoting=csv.QUOTE_NONE,  quotechar="",  escapechar="\\")
    return output_file


def train_model(df, df_test, output_folder):
    output_file = process_csv(df, 'train', output_folder)
    output_file_test = process_csv(df_test, 'test', output_folder)
    model = fasttext.train_supervised(
        input=output_file, lr=0.1, epoch=20, wordNgrams=2)
    results = model.test(output_file_test)
    accuracy = results[1]
    return accuracy

In [14]:
train, test = train_test_split(df, test_size=0.2)

Prérequis : créer un dossier temporary pour y mettre les fichiers

In [16]:
train_model(train, test, './temporary')

0.619

Attention les labels ne sont pas forcément bons, et le modèle benchmark a un score à 0,6948

In [17]:
df.head(15)

Unnamed: 0,review,category
0,I like the shepherd! Sure the acting wasn ' t ...,0
1,"Girst off, I ' m an American - - I haven ' t s...",1
2,I'm a sucker for a decent superhero movie. (I'...,0
3,D4ad Gentlemen Productionx has put togethdr a ...,1
4,I have decided to flusJ this show from my m#mo...,1
5,Now we were chosen to be tortured with this di...,0
6,I reZlly likes thLs movi#. <br /> <br /> Every...,1
7,ThJs is not a movi# for fans of the uaual eerL...,1
8,"I ' m no h)rror movie buff, but my wife ' s ni...",0
9,"Okay, you Yippies are probably wondering what ...",0


# Text preprocessing

Le but ici est de nettoyer nos données textuelles pour consturire un modèle de NLP. Plusieurs types de défaut dans les données :
- fautes d'orthographe et de frappe
- caractères spéciaux et ponctuation
- balises HTML
- normalisation (minuscules/majuscules) : permet aussi de réduire la dimensionnalité des données

In [18]:
def clean_text(text):
    text = re.sub(r"<br\s*/?>", " ", text)  # Supprime les balises HTML
    text = re.sub(r"[^a-zA-Z0-9\s']", "", text).lower().strip()  # Supprime caractères spéciaux, convertit en minuscules, supprime espaces excédentaires
    return re.sub(r'\s+', ' ', text)  # Réduit les espaces multiples à un seul espace

df['cleaned_review'] = df['review'].apply(clean_text)
df.head()

Unnamed: 0,review,category,cleaned_review
0,I like the shepherd! Sure the acting wasn ' t ...,0,i like the shepherd sure the acting wasn ' t g...
1,"Girst off, I ' m an American - - I haven ' t s...",1,girst off i ' m an american i haven ' t sen an...
2,I'm a sucker for a decent superhero movie. (I'...,0,i'm a sucker for a decent superhero movie i'm ...
3,D4ad Gentlemen Productionx has put togethdr a ...,1,d4ad gentlemen productionx has put togethdr a ...
4,I have decided to flusJ this show from my m#mo...,1,i have decided to flusj this show from my mmor...


Utilisation de Jamspell pour corriger les textes : rapide et efficace

In [19]:
!wget https://github.com/bakwc/JamSpell-models/raw/master/en.tar.gz
!tar -xvf en.tar.gz

# Corrige le texte (faute orthographe etc, rapide par rapport à textblob)
corrector = jamspell.TSpellCorrector()
corrector.LoadLangModel('en.bin') # Peut changer ici

def correct_spelling_with_jamspell(text):
    return corrector.FixFragment(text)

df['corrected_review'] = df['cleaned_review'].apply(correct_spelling_with_jamspell)

--2024-04-02 18:04:26--  https://github.com/bakwc/JamSpell-models/raw/master/en.tar.gz
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/bakwc/JamSpell-models/master/en.tar.gz [following]
--2024-04-02 18:04:26--  https://raw.githubusercontent.com/bakwc/JamSpell-models/master/en.tar.gz
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 36611828 (35M) [application/octet-stream]
Saving to: ‘en.tar.gz’


2024-04-02 18:04:27 (109 MB/s) - ‘en.tar.gz’ saved [36611828/36611828]

en.bin


Lemmatisation : ramène les mots à leur forme radicale de base. Technique plus précise que le stemming.
Enlever les stops words qui sont non pertinents.

Il faut que les données lemmatisées soient en format texte brut pour l'embedding

In [20]:
nlp = spacy.load("en_core_web_sm")
stop_words = set(stopwords.words('english'))

def lemmatize_and_remove_stopwords(text):
    lemmatized_text = ""
    doc = nlp(text)
    for token in doc:
        if token.lemma_.lower() not in stop_words and token.lemma_ not in ("'", ",", ".", "[", "]"):
            lemmatized_text += token.lemma_ + " "
    return lemmatized_text.strip()

df['lemmatized_review'] = df['corrected_review'].apply(lemmatize_and_remove_stopwords)


In [21]:
df.head()

Unnamed: 0,review,category,cleaned_review,corrected_review,lemmatized_review
0,I like the shepherd! Sure the acting wasn ' t ...,0,i like the shepherd sure the acting wasn ' t g...,i like the shepherd sure the acting wasn ' t g...,like shepherd sure act good fight scene nic3 v...
1,"Girst off, I ' m an American - - I haven ' t s...",1,girst off i ' m an american i haven ' t sen an...,first off i ' m an american i haven ' t seen a...,first american see comment imdb series yet u v...
2,I'm a sucker for a decent superhero movie. (I'...,0,i'm a sucker for a decent superhero movie i'm ...,i'm a sucker for a decent superhero movie i'm ...,sucker decent superhero movie count super bug ...
3,D4ad Gentlemen Productionx has put togethdr a ...,1,d4ad gentlemen productionx has put togethdr a ...,d4ad gentlemen production has put together a f...,d4ad gentleman production put together film am...
4,I have decided to flusJ this show from my m#mo...,1,i have decided to flusj this show from my mmor...,i have decided to flush this show from my memo...,decide flush show memory toilet bad tv show in...


In [22]:
train, test = train_test_split(df, test_size=0.2)

In [23]:
accuracy_default_train, accuracy_default_test = train_test_split(df[['review', 'category']], test_size=0.2, random_state=0)
accuracy_cleaned_train, accuracy_cleaned_test = train_test_split(df[['cleaned_review', 'category']].copy(deep=True), test_size=0.2, random_state=0)
accuracy_corrected_train, accuracy_corrected_test = train_test_split(df[['corrected_review', 'category']].copy(deep=True), test_size=0.2, random_state=0)
accuracy_lemmatized_train, accuracy_lemmatized_test = train_test_split(df[['lemmatized_review', 'category']].copy(deep=True), test_size=0.2, random_state=0)

# Modifier nom des colonnes
accuracy_cleaned_train['review'] = accuracy_cleaned_train['cleaned_review']
accuracy_cleaned_test['review'] = accuracy_cleaned_test['cleaned_review']
accuracy_corrected_train['review'] = accuracy_corrected_train['corrected_review']
accuracy_corrected_test['review'] = accuracy_corrected_test['corrected_review']
accuracy_lemmatized_train['review'] = accuracy_lemmatized_train['lemmatized_review']
accuracy_lemmatized_test['review'] = accuracy_lemmatized_test['lemmatized_review']

accuracy_default = train_model(accuracy_default_train, accuracy_default_test, './temporary')
accuracy_cleaned = train_model(accuracy_cleaned_train, accuracy_cleaned_test, './temporary')
accuracy_corrected = train_model(accuracy_corrected_train, accuracy_corrected_test, './temporary')
accuracy_lemmatized = train_model(accuracy_lemmatized_train, accuracy_lemmatized_test, './temporary')

print('Default accuracy:', accuracy_default)
print('Cleaned text accuracy:', accuracy_cleaned)
print('Corrected text accuracy:', accuracy_corrected)
print('Lemmatized text accuracy:', accuracy_lemmatized)

Default accuracy: 0.5995
Cleaned text accuracy: 0.6105
Corrected text accuracy: 0.6275
Lemmatized text accuracy: 0.6355


# Améliorer les y

In [25]:
def penntreebank_to_wordnet(tag):
  if tag.startswith('J'):
      return wn.ADJ
  elif tag.startswith('N'):
      return wn.NOUN
  elif tag.startswith('R'):
      return wn.ADV
  elif tag.startswith('V'):
      return wn.VERB
  return None

def apply_tagging(text):
    token = nltk.word_tokenize(text)
    tagging = nltk.pos_tag(token)

    return tagging

df['tagged_review'] = df['corrected_review'].apply(apply_tagging)

In [26]:
df.head()

Unnamed: 0,review,category,cleaned_review,corrected_review,lemmatized_review,tagged_review
0,I like the shepherd! Sure the acting wasn ' t ...,0,i like the shepherd sure the acting wasn ' t g...,i like the shepherd sure the acting wasn ' t g...,like shepherd sure act good fight scene nic3 v...,"[(i, NN), (like, IN), (the, DT), (shepherd, NN..."
1,"Girst off, I ' m an American - - I haven ' t s...",1,girst off i ' m an american i haven ' t sen an...,first off i ' m an american i haven ' t seen a...,first american see comment imdb series yet u v...,"[(first, RB), (off, IN), (i, NN), (', ''), (m,..."
2,I'm a sucker for a decent superhero movie. (I'...,0,i'm a sucker for a decent superhero movie i'm ...,i'm a sucker for a decent superhero movie i'm ...,sucker decent superhero movie count super bug ...,"[(i, NN), ('m, VBP), (a, DT), (sucker, NN), (f..."
3,D4ad Gentlemen Productionx has put togethdr a ...,1,d4ad gentlemen productionx has put togethdr a ...,d4ad gentlemen production has put together a f...,d4ad gentleman production put together film am...,"[(d4ad, JJ), (gentlemen, NNS), (production, NN..."
4,I have decided to flusJ this show from my m#mo...,1,i have decided to flusj this show from my mmor...,i have decided to flush this show from my memo...,decide flush show memory toilet bad tv show in...,"[(i, NNS), (have, VBP), (decided, VBN), (to, T..."


Pour améliorer les y nous utilisons une méthode non supervisée qui se base sur le calcul du sentiment en sommant les sentiments positifs et en soustrayant les sentiments négatifs

In [27]:
# Lemmatizer global pour wordnet
lemmatizer = WordNetLemmatizer()

# Calcul du sentiment
def compute_sentiment(tags):
  sentiment = 0.0

  # Pour chaque tag, calculer le sentiment
  for word, tag in tags:
    # les convertir en tag wordnet
    wordnet_tag = penntreebank_to_wordnet(tag)

    # Garder seulement les mots avec un sentiment intéressant
    if wordnet_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
      continue

    # Lemmatization
    lemma = lemmatizer.lemmatize(word, pos=wordnet_tag)
    if not lemma:
      continue

    # Obtenir le synset
    synsets = wn.synsets(lemma, pos=wordnet_tag)
    if not synsets:
      continue

    # Prendre le premier sens (le plus commun)
    sentiwordnet_synset = swn.senti_synset(synsets[0].name())

    # Sommer les sentiments pour obtenir le sentiment global
    sentiment += sentiwordnet_synset.pos_score() - sentiwordnet_synset.neg_score()

  return sentiment

df['computed_sentiment'] = df['tagged_review'].apply(compute_sentiment)

In [28]:
df.head()

Unnamed: 0,review,category,cleaned_review,corrected_review,lemmatized_review,tagged_review,computed_sentiment
0,I like the shepherd! Sure the acting wasn ' t ...,0,i like the shepherd sure the acting wasn ' t g...,i like the shepherd sure the acting wasn ' t g...,like shepherd sure act good fight scene nic3 v...,"[(i, NN), (like, IN), (the, DT), (shepherd, NN...",2.25
1,"Girst off, I ' m an American - - I haven ' t s...",1,girst off i ' m an american i haven ' t sen an...,first off i ' m an american i haven ' t seen a...,first american see comment imdb series yet u v...,"[(first, RB), (off, IN), (i, NN), (', ''), (m,...",0.5
2,I'm a sucker for a decent superhero movie. (I'...,0,i'm a sucker for a decent superhero movie i'm ...,i'm a sucker for a decent superhero movie i'm ...,sucker decent superhero movie count super bug ...,"[(i, NN), ('m, VBP), (a, DT), (sucker, NN), (f...",1.75
3,D4ad Gentlemen Productionx has put togethdr a ...,1,d4ad gentlemen productionx has put togethdr a ...,d4ad gentlemen production has put together a f...,d4ad gentleman production put together film am...,"[(d4ad, JJ), (gentlemen, NNS), (production, NN...",0.112
4,I have decided to flusJ this show from my m#mo...,1,i have decided to flusj this show from my mmor...,i have decided to flush this show from my memo...,decide flush show memory toilet bad tv show in...,"[(i, NNS), (have, VBP), (decided, VBN), (to, T...",-1.25


In [29]:
df['new_y'] = df['computed_sentiment'] > 0
df['new_y'] = df['new_y'].replace({True: 1, False: 0})

In [30]:
df.head()

Unnamed: 0,review,category,cleaned_review,corrected_review,lemmatized_review,tagged_review,computed_sentiment,new_y
0,I like the shepherd! Sure the acting wasn ' t ...,0,i like the shepherd sure the acting wasn ' t g...,i like the shepherd sure the acting wasn ' t g...,like shepherd sure act good fight scene nic3 v...,"[(i, NN), (like, IN), (the, DT), (shepherd, NN...",2.25,1
1,"Girst off, I ' m an American - - I haven ' t s...",1,girst off i ' m an american i haven ' t sen an...,first off i ' m an american i haven ' t seen a...,first american see comment imdb series yet u v...,"[(first, RB), (off, IN), (i, NN), (', ''), (m,...",0.5,1
2,I'm a sucker for a decent superhero movie. (I'...,0,i'm a sucker for a decent superhero movie i'm ...,i'm a sucker for a decent superhero movie i'm ...,sucker decent superhero movie count super bug ...,"[(i, NN), ('m, VBP), (a, DT), (sucker, NN), (f...",1.75,1
3,D4ad Gentlemen Productionx has put togethdr a ...,1,d4ad gentlemen productionx has put togethdr a ...,d4ad gentlemen production has put together a f...,d4ad gentleman production put together film am...,"[(d4ad, JJ), (gentlemen, NNS), (production, NN...",0.112,1
4,I have decided to flusJ this show from my m#mo...,1,i have decided to flusj this show from my mmor...,i have decided to flush this show from my memo...,decide flush show memory toilet bad tv show in...,"[(i, NNS), (have, VBP), (decided, VBN), (to, T...",-1.25,0


Une fois les nouveaux labels obtenus, nous calculons l'accuracy de chaque étape pour s'assurer de l'amélioration du score après chaque traitement

In [31]:
train, test = train_test_split(df, test_size=0.2)

accuracy_default_train, accuracy_default_test = train_test_split(df[['review', 'category']], test_size=0.2, random_state=0)
accuracy_cleaned_train, accuracy_cleaned_test = train_test_split(df[['cleaned_review', 'category']].copy(deep=True), test_size=0.2, random_state=0)
accuracy_corrected_train, accuracy_corrected_test = train_test_split(df[['corrected_review', 'category']].copy(deep=True), test_size=0.2, random_state=0)
accuracy_lemmatized_train, accuracy_lemmatized_test = train_test_split(df[['lemmatized_review', 'category']].copy(deep=True), test_size=0.2, random_state=0)
accuracy_new_label_train, accuracy_new_label_test = train_test_split(df[['lemmatized_review', 'new_y']].copy(deep=True), test_size=0.2, random_state=0)

# Modifier nom des colonnes
accuracy_cleaned_train['review'] = accuracy_cleaned_train['cleaned_review']
accuracy_cleaned_test['review'] = accuracy_cleaned_test['cleaned_review']
accuracy_corrected_train['review'] = accuracy_corrected_train['corrected_review']
accuracy_corrected_test['review'] = accuracy_corrected_test['corrected_review']
accuracy_lemmatized_train['review'] = accuracy_lemmatized_train['lemmatized_review']
accuracy_lemmatized_test['review'] = accuracy_lemmatized_test['lemmatized_review']

accuracy_new_label_train['review'] = accuracy_new_label_train['lemmatized_review']
accuracy_new_label_train['category'] = accuracy_new_label_train['new_y']
accuracy_new_label_test['review'] = accuracy_new_label_test['lemmatized_review']
accuracy_new_label_test['category'] = accuracy_new_label_test['new_y']

accuracy_default = train_model(accuracy_default_train, accuracy_default_test, './temporary')
accuracy_cleaned = train_model(accuracy_cleaned_train, accuracy_cleaned_test, './temporary')
accuracy_corrected = train_model(accuracy_corrected_train, accuracy_corrected_test, './temporary')
accuracy_lemmatized = train_model(accuracy_lemmatized_train, accuracy_lemmatized_test, './temporary')
accuracy_new_label = train_model(accuracy_new_label_train, accuracy_new_label_test, './temporary')

print('Default accuracy:', accuracy_default)
print('Cleaned text accuracy:', accuracy_cleaned)
print('Corrected text accuracy:', accuracy_corrected)
print('Lemmatized text accuracy:', accuracy_lemmatized)
print('New labels text accuracy:', accuracy_new_label)

Default accuracy: 0.5995
Cleaned text accuracy: 0.6105
Corrected text accuracy: 0.6275
Lemmatized text accuracy: 0.6355
New labels text accuracy: 0.795


Nous obtenons une nette amélioration en particulier suite aux nouveaux y