#### Importation des bibliothèques

In [37]:
import pandas as pd
import numpy as np


#### Lecture des donnees

In [38]:
df=pd.read_csv('movie_review.csv')
df

Unnamed: 0,fold_id,cv_tag,html_id,sent_id,text,tag
0,0,cv000,29590,0,films adapted from comic books have had plenty...,pos
1,0,cv000,29590,1,"for starters , it was created by alan moore ( ...",pos
2,0,cv000,29590,2,to say moore and campbell thoroughly researche...,pos
3,0,cv000,29590,3,"the book ( or "" graphic novel , "" if you will ...",pos
4,0,cv000,29590,4,"in other words , don't dismiss this film becau...",pos
...,...,...,...,...,...,...
64715,9,cv999,14636,20,that lack of inspiration can be traced back to...,neg
64716,9,cv999,14636,21,like too many of the skits on the current inca...,neg
64717,9,cv999,14636,22,"after watching one of the "" roxbury "" skits on...",neg
64718,9,cv999,14636,23,"bump unsuspecting women , and . . . that's all .",neg


#### Columns pour la classification

In [39]:
text=df["text"]
tag=df["tag"]

In [40]:
#creation dun nouveau dataframe et l'ajout du Text (features) et Tag (label)
NewDf = pd.DataFrame()
NewDf["text"] =text
NewDf["tag"] =tag
NewDf

Unnamed: 0,text,tag
0,films adapted from comic books have had plenty...,pos
1,"for starters , it was created by alan moore ( ...",pos
2,to say moore and campbell thoroughly researche...,pos
3,"the book ( or "" graphic novel , "" if you will ...",pos
4,"in other words , don't dismiss this film becau...",pos
...,...,...
64715,that lack of inspiration can be traced back to...,neg
64716,like too many of the skits on the current inca...,neg
64717,"after watching one of the "" roxbury "" skits on...",neg
64718,"bump unsuspecting women , and . . . that's all .",neg


## Pre-processing des données textuelles :

### Lower casing

In [41]:
NewDf["text"]=NewDf["text"].str.lower()


### Removal of Punctuations

In [42]:
import string

PUNCT_TO_REMOVE = string.punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', PUNCT_TO_REMOVE))

NewDf["text"] = NewDf["text"].apply(remove_punctuation)


### Removal of Stopwords

In [43]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

STOPWORDS = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

NewDf["text"] = NewDf["text"].apply(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
NewDf

Unnamed: 0,text,tag
0,films adapted comic books plenty success wheth...,pos
1,starters created alan moore eddie campbell bro...,pos
2,say moore campbell thoroughly researched subje...,pos
3,book graphic novel 500 pages long includes nea...,pos
4,words dont dismiss film source,pos
...,...,...
64715,lack inspiration traced back insipid characters,neg
64716,like many skits current incarnation saturdayni...,neg
64717,watching one roxbury skits snl come away chara...,neg
64718,bump unsuspecting women thats,neg


## Entraînement du modèle Word2Vec

In [45]:
from gensim.models import Word2Vec
# notre texte prétraité
preprocessed_text =NewDf["text"]
# Tokenize le texte prétraité
tokens = preprocessed_text.apply(lambda x: x.split())

tokens

0        [films, adapted, comic, books, plenty, success...
1        [starters, created, alan, moore, eddie, campbe...
2        [say, moore, campbell, thoroughly, researched,...
3        [book, graphic, novel, 500, pages, long, inclu...
4                     [words, dont, dismiss, film, source]
                               ...                        
64715    [lack, inspiration, traced, back, insipid, cha...
64716    [like, many, skits, current, incarnation, satu...
64717    [watching, one, roxbury, skits, snl, come, awa...
64718                   [bump, unsuspecting, women, thats]
64719    [watching, anightattheroxbury, youll, left, ex...
Name: text, Length: 64720, dtype: object

In [46]:
model = Word2Vec(tokens, vector_size=50,min_count=1, window=2)


In [47]:
# Accédez aux vecteurs de mots
word_vectors = model.wv

# Obtenez la représentation vectorielle d'un mot spécifique
vector_for_word = word_vectors['mot']

# Affichez la représentation vectorielle du mot
print(vector_for_word)


[-8.52241181e-03 -1.00752590e-02 -6.95133815e-03 -2.68813851e-03
  1.62945427e-02 -7.40428455e-03 -1.47035606e-02  5.07309334e-03
 -2.24050740e-03 -7.25701277e-04  1.12276282e-02  1.09838611e-02
  1.45027246e-02 -6.84975507e-03  1.11495443e-02 -1.74803678e-02
  3.65319639e-03 -1.09445974e-02 -2.56523900e-02  1.37499534e-03
  7.67212827e-03 -2.44254316e-03  2.63777729e-02  6.85338257e-03
  6.24917157e-04  2.35131830e-02 -2.06508264e-02  1.46928225e-02
 -2.27230281e-04  4.72025495e-05  2.09715944e-02  7.42070097e-03
  1.27475942e-02 -6.17375073e-04  1.17977976e-03  2.18528938e-02
  1.68753248e-02 -5.05381171e-03  2.36663688e-03 -1.14049744e-02
 -5.22657670e-03  8.18961486e-03  1.43480478e-02  8.09842162e-03
  1.99491694e-03  6.54826732e-03 -4.07608738e-03  1.10645173e-02
  1.44755738e-02 -1.30239083e-02]


In [48]:
# Calculer la similarité entre deux mots
similarity = model.wv.similarity('novel', 'book')
print("Similarité entre 'novel' et 'book': ", similarity)

Similarité entre 'novel' et 'book':  0.9664345


In [49]:
similarity = model.wv.similarity('page', 'book')
print("Similarité entre 'page' et 'book': ", similarity)

Similarité entre 'page' et 'book':  0.94731915


In [50]:
similarity = model.wv.similarity('charact', 'actor')
print("Similarité entre 'charact' et 'actor': ", similarity)

Similarité entre 'charact' et 'actor':  0.76376307


In [51]:
# Obtenez les représentations vectorielles des mots
word_vectors = model.wv
word_vectors

<gensim.models.keyedvectors.KeyedVectors at 0x1caa4259a10>

## Vectorisation des reviews de movies :

In [58]:
word_vectors = model.wv

# Liste pour stocker les vecteurs de chaque critique
reviews_vectorized = []

# Parcourir chaque liste de tokens dans tokens
for review_tokens in tokens:
    # Initialiser un vecteur pour la critique
    review_vector = np.zeros(model.vector_size)
    # Compter le nombre de mots présents dans le modèle Word2Vec pour cette critique
    words_count = 0
    # Parcourir chaque mot dans la critique
    for word in review_tokens:
        if word in model.wv:
            # Si le mot est présent dans le vocabulaire du modèle, ajoutez son embedding à la représentation de la critique
            review_vector += model.wv[word]
            # Incrémenter le compteur de mots
            words_count += 1
    # Prendre la moyenne des embeddings pour obtenir la représentation de la critique
    if words_count != 0:
        review_vector /= words_count
    # Ajouter le vecteur de critique à la liste des vecteurs de critiques
    reviews_vectorized.append(review_vector)



## Division des données :

In [53]:
from sklearn.model_selection import train_test_split

# Définir les features (text) et la cible (tag)
y = NewDf["tag"]

# Diviser les données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(reviews_vectorized, y, test_size=0.2, random_state=42)

# Afficher les tailles des ensembles
print("Taille de l'ensemble d'entraînement (features) :", len(X_train),"  Taille de l'ensemble de test (features) :", len(X_test))

print("Taille de l'ensemble d'entraînement (cibles) :", len(y_train),"   Taille de l'ensemble de test (cibles) :", len(y_test))



Taille de l'ensemble d'entraînement (features) : 51776   Taille de l'ensemble de test (features) : 12944
Taille de l'ensemble d'entraînement (cibles) : 51776    Taille de l'ensemble de test (cibles) : 12944


## Construction d'un classificateur :


In [54]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Convertir les étiquettes en valeurs numériques:"pos" en 1 et "neg" en 0
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.fit_transform(y_test)

y_train


array([0, 1, 0, ..., 1, 1, 0])

In [55]:
from sklearn.linear_model import LogisticRegression
# Instanciation du modèle LogisticRegression
reg = LogisticRegression(solver='saga')

# Entraînement du modèle
reg.fit(X_train, y_train)

# Prédiction sur l'ensemble de test
y_pred = reg.predict(X_test)

y_pred


array([1, 0, 0, ..., 0, 0, 1])

In [56]:
y_test

array([0, 0, 0, ..., 1, 0, 0])

## Évaluation du modèle :

In [57]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calcul de l'accuracy,la precision,recall,F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, )
recall = recall_score(y_test, y_pred, )
f1 = f1_score(y_test, y_pred, )

print("Accuracy:", accuracy,"\nPrecision", precision,"\nRecall:", recall,"\nF1-score:", f1)

Accuracy: 0.5686804697156984 
Precision 0.5622015581804474 
Recall: 0.6806633196409554 
F1-score: 0.6157869382699058
