In [29]:
# Section banner: prints the title of part 1 (environment setup).
print("Partie 1 — Préparation de l’environnement")

Partie 1 — Préparation de l’environnement


In [17]:
# Importation des bibliothèques nécessaires

import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import spacy

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer




In [3]:
# Load the data. header=None tells pandas the file has no header row,
# so the column names are supplied explicitly.
csv_columns = ['id', 'film', 'sentiment', 'review']
df = pd.read_csv(
    'twitter_training.csv',
    names=csv_columns,
    header=None,
    encoding='utf-8',
    low_memory=False,
)
df

Unnamed: 0,id,film,sentiment,review
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...,...
74677,9200,Nvidia,Positive,Just realized that the Windows partition of my...
74678,9200,Nvidia,Positive,Just realized that my Mac window partition is ...
74679,9200,Nvidia,Positive,Just realized the windows partition of my Mac ...
74680,9200,Nvidia,Positive,Just realized between the windows partition of...


In [4]:
# Data preparation: seed the 'polarity' column with the raw sentiment labels.
df['polarity'] = df.loc[:, 'sentiment']

In [5]:
# Convert the sentiment labels to numeric polarity values.
# One explicit mapping applied to the untouched 'sentiment' column replaces the
# four chained .replace() calls: it avoids pandas' FutureWarning about implicit
# downcasting in replace(), and yields the same result however many times the
# cell is re-run.
SENTIMENT_TO_POLARITY = {
    'Negative': -1,
    'Positive': 1,
    'Neutral': 0,
    'Irrelevant': 0,  # encoded like 'Neutral', as in the original conversion
}
# Labels absent from the mapping become NaN instead of silently staying strings.
df['polarity'] = df['sentiment'].map(SENTIMENT_TO_POLARITY)
df.head()

  df['polarity'] = df['polarity'].replace({'Irrelevant':0})


Unnamed: 0,id,film,sentiment,review,polarity
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,1
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,1
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,1
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,1
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...,1


In [6]:
# Section banner: prints the title of part 2 (text cleaning).
print("Partie 2 — Nettoyage du texte")

Partie 2 — Nettoyage du texte


In [7]:
# Text-cleaning function

def clean_text(text):
    """Lower-case ``text`` and strip its punctuation.

    Args:
        text: Value to clean. Non-string values are converted with ``str()``;
            missing values (``None`` or a float NaN) are treated as empty text.

    Returns:
        str: The lower-cased text with every character removed that is neither
        a word character (letter, digit or underscore) nor whitespace.
    """
    # Without this guard str() would turn a missing value into the literal
    # word "none" / "nan", which would then be processed as real content.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()
    text = re.sub(r'[^\w\s]', '', text)  # drop punctuation
    return text


In [12]:
# Apply the text cleaning; missing reviews are replaced by empty strings first.
reviews_filled = df['review'].fillna("")
df['cleaned_review'] = reviews_filled.apply(clean_text)
df.head(2)


Unnamed: 0,id,film,sentiment,review,polarity,cleaned_review
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,1,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,1,i am coming to the borders and i will kill you...


In [41]:
# Section banner: prints the title of part 3 (tokenisation and stop words).
print("Partie 3 — Tokenisation et Stopwords")

Partie 3 — Tokenisation et Stopwords


In [None]:
# Stop words
# Load NLTK's English stop-word list, downloading the corpus first when it is
# not installed so the cell also runs on a fresh environment.
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
stop_words.update(("a", "the", "this", "pic", "like", "one", "<br />"))
# The reviews have their punctuation stripped before tokenisation, so entries
# such as "don't" or "<br />" can never match a token in their raw form. Add
# the punctuation-free variant of every stop word ("dont", "br", ...) so they
# are filtered as intended.
# Caveat: a few stripped contractions may coincide with ordinary words
# (e.g. "we'll" -> "well"); after cleaning the two are indistinguishable anyway.
unpunctuated = {re.sub(r'[^\w\s]', '', word).strip() for word in stop_words}
unpunctuated.discard("")
stop_words |= unpunctuated
# NOTE(review): despite its name this pattern matches runs of word characters,
# not punctuation, and none of the visible cells use it; kept in case others do.
punct = re.compile(r'(\w+)')

In [None]:
# Tokenise and filter out the stop words
def tokenize_and_remove_stopwords(text):
    """Split ``text`` into tokens and drop those found in ``stop_words``.

    Args:
        text: String to tokenise with NLTK's ``word_tokenize``.

    Returns:
        list: The tokens, in their original order, that are not members of the
        module-level ``stop_words`` collection.
    """
    kept = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept.append(token)
    return kept


In [46]:
# Apply tokenisation and stop-word removal to the cleaned reviews
df['tokens'] = df['cleaned_review'].apply(tokenize_and_remove_stopwords)
df.loc[:, ['cleaned_review', 'tokens']].head(5)



Unnamed: 0,cleaned_review,tokens
0,im getting on borderlands and i will murder yo...,"[im, getting, borderlands, murder]"
1,i am coming to the borders and i will kill you...,"[coming, borders, kill]"
2,im getting on borderlands and i will kill you all,"[im, getting, borderlands, kill]"
3,im coming on borderlands and i will murder you...,"[im, coming, borderlands, murder]"
4,im getting on borderlands 2 and i will murder ...,"[im, getting, borderlands, 2, murder]"


In [47]:
# Section banner: prints the title of part 4 (lemmatisation).
print("Partie 4 — Lemmatisation")

Partie 4 — Lemmatisation


In [50]:
# Lemmatise the tokens
lemmatizer = WordNetLemmatizer()
def lemmatize_tokens(tokens, pos='n'):
    """Return the WordNet lemma of every token.

    Args:
        tokens: Iterable of token strings.
        pos: WordNet part-of-speech tag applied to every token ('n' noun,
            'v' verb, 'a' adjective, 'r' adverb, 's' satellite adjective).
            Defaults to 'n', the lemmatizer's own default, so existing
            single-argument calls behave exactly as before. With 'n', verb
            forms such as "getting" are returned unchanged; pass 'v' to
            reduce them to their base form.

    Returns:
        list: One lemma per input token, in the same order.
    """
    return [lemmatizer.lemmatize(token, pos=pos) for token in tokens]

In [51]:
# Apply lemmatisation to each token list
df['lemmatized_tokens'] = df['tokens'].apply(lemmatize_tokens)
df.loc[:, ['tokens', 'lemmatized_tokens']].head(5)

Unnamed: 0,tokens,lemmatized_tokens
0,"[im, getting, borderlands, murder]","[im, getting, borderland, murder]"
1,"[coming, borders, kill]","[coming, border, kill]"
2,"[im, getting, borderlands, kill]","[im, getting, borderland, kill]"
3,"[im, coming, borderlands, murder]","[im, coming, borderland, murder]"
4,"[im, getting, borderlands, 2, murder]","[im, getting, borderland, 2, murder]"


In [None]:
# Section banner: prints the title of part 5 (Bag-of-Words representation).
print("Partie 5 — Représentation Bag of Words (BoW)")

In [24]:
# Use the first four cleaned reviews as a small sample for the vectorizers.
sample_texts = df['cleaned_review'].iloc[:4].to_list()


In [25]:
# Learn the vocabulary from the sample texts, then encode them as count vectors.
vectorizer = CountVectorizer()
vectorizer.fit(sample_texts)
X_bow = vectorizer.transform(sample_texts)


In [26]:
# Bag-of-Words matrix
bow_counts = X_bow.toarray()
print("Matrice BoW (shape={}):\n".format(X_bow.shape))
print(bow_counts)

# Words extracted by the vectorizer
vocabulary = vectorizer.get_feature_names_out()
print("\nListe des mots extraits :")
print(vocabulary)


Matrice BoW (shape=(4, 15)):

[[1 0 1 1 0 0 1 1 0 1 1 0 0 1 1]
 [1 1 1 0 1 1 0 0 1 0 0 1 1 1 1]
 [1 0 1 1 0 0 1 1 1 0 1 0 0 1 1]
 [1 0 1 1 0 1 0 1 0 1 1 0 0 1 1]]

Liste des mots extraits :
['all' 'am' 'and' 'borderlands' 'borders' 'coming' 'getting' 'im' 'kill'
 'murder' 'on' 'the' 'to' 'will' 'you']


In [None]:
# Section banner: prints the title of part 6 (TF-IDF representation).
print("Partie 6 — Représentation TF-IDF")

In [18]:
# Fit the TF-IDF vectorizer on the sample texts, then compute their weights.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(sample_texts)
X_tfidf = tfidf_vectorizer.transform(sample_texts)


In [23]:

# Present the TF-IDF weights as a table: one row per sample review, one column per term.
review_labels = [f"Review {i+1}" for i, _ in enumerate(sample_texts)]
tfidf_terms = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(X_tfidf.toarray(), index=review_labels, columns=tfidf_terms)
print("Matrice TF-IDF (shape={}):\n".format(X_tfidf.shape))
print(tfidf_df)


Matrice TF-IDF (shape=(4, 15)):

               all        am       and  borderlands   borders    coming  \
Review 1  0.276782  0.000000  0.276782     0.338544  0.000000  0.000000   
Review 2  0.207373  0.397387  0.207373     0.000000  0.397387  0.313305   
Review 3  0.276782  0.000000  0.276782     0.338544  0.000000  0.000000   
Review 4  0.276782  0.000000  0.276782     0.338544  0.000000  0.418169   

           getting        im      kill    murder        on       the  \
Review 1  0.418169  0.338544  0.000000  0.418169  0.338544  0.000000   
Review 2  0.000000  0.000000  0.313305  0.000000  0.000000  0.397387   
Review 3  0.418169  0.338544  0.418169  0.000000  0.338544  0.000000   
Review 4  0.000000  0.338544  0.000000  0.418169  0.338544  0.000000   

                to      will       you  
Review 1  0.000000  0.276782  0.276782  
Review 2  0.397387  0.207373  0.207373  
Review 3  0.000000  0.276782  0.276782  
Review 4  0.000000  0.276782  0.276782  
