In [221]:
# import important modules
import numpy as np
import pandas as pd
# sklearn modules
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB  # classifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    plot_confusion_matrix,
)
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# text preprocessing modules
from string import punctuation
# text preprocessing modules
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re  # regular expression

# Download dependency
for dependency in (
        "brown",
        "names",
        "wordnet",
        "averaged_perceptron_tagger",
        "universal_tagset",
):
    nltk.download(dependency)

import warnings

warnings.filterwarnings("ignore")
# seeding
np.random.seed(123)

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\Administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     C:\Users\Administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [222]:
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [223]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Administrateur\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [224]:
# load data
data = pd.read_csv("allocine_inception_avis9.csv")

In [225]:
# show top five rows of data
data.head(15)

Unnamed: 0,Note,Description,key
0,50,Après le chef d'oeuvre super-héroïque The Dark...,0
1,50,C’est fou ce qu’on aime détester Christopher N...,1
2,50,CHEF D’ŒUVRE ! Le film est absolument parfait ...,2
3,50,"Un film aussi novateur que complexe, dont la m...",3
4,50,Christopher Nolan est sûrement l'un des seuls ...,4
5,50,Nolan est un vrai génie. On l'avait déjà entra...,5
6,35,Le meilleur blockbuster de 2010 a pour thème l...,6
7,40,""" inception "" Un film de science fiction très ...",7
8,50,Un film époustouflant(un de mes préférés) ! Le...,8
9,50,"Dans la catégorie des blockbusters, on a rarem...",9


In [226]:
# check the shape of the data
data.shape

(7214, 3)

In [227]:
# check missing values in data
data.isnull().sum()

Note           0
Description    1
key            0
dtype: int64

In [228]:
data.dropna(inplace=True)

In [229]:
data.shape

(7213, 3)

In [230]:
# evalute news sentiment distribution
data.Note.value_counts()

5,0    3513
4,5    1214
4,0     808
0,5     344
3,0     310
3,5     294
2,0     259
2,5     193
1,0     186
1,5      92
Name: Note, dtype: int64

In [231]:
def setClassBin(i):
    if (float(i.replace(',', '.')) > 4.5):
        return 1
    else:
        return 0

data['Note'] = [setClassBin(x) for x in data.Note]


In [232]:
# evalute news sentiment distribution
data.Note.value_counts()

0    3700
1    3513
Name: Note, dtype: int64

In [233]:
data.head(10)

Unnamed: 0,Note,Description,key
0,1,Après le chef d'oeuvre super-héroïque The Dark...,0
1,1,C’est fou ce qu’on aime détester Christopher N...,1
2,1,CHEF D’ŒUVRE ! Le film est absolument parfait ...,2
3,1,"Un film aussi novateur que complexe, dont la m...",3
4,1,Christopher Nolan est sûrement l'un des seuls ...,4
5,1,Nolan est un vrai génie. On l'avait déjà entra...,5
6,0,Le meilleur blockbuster de 2010 a pour thème l...,6
7,0,""" inception "" Un film de science fiction très ...",7
8,1,Un film époustouflant(un de mes préférés) ! Le...,8
9,1,"Dans la catégorie des blockbusters, on a rarem...",9


In [234]:
#The text_cleaning() function will handle all necessary steps to clean our dataset.
stop_words =  stopwords.words('french')
def text_cleaning(text, remove_stop_words=True, lemmatize_words=True):
    # Clean the text, with the option to remove stop_words and to lemmatize word
    # Clean the text
    text = re.sub(r"[^A-Za-z0-9]", " ", text)
    text = re.sub(r"\'s", " ", text)
    text =  re.sub(r'http\S+',' link ', text)
    text = re.sub(r'\b\d+(?:\.\d+)?\s+', '', text) # remove numbers
        
    # Remove punctuation from text
    text = ''.join([c for c in text if c not in punctuation])
    
    # Optionally, remove stop words
    if remove_stop_words:
        text = text.split()
        text = [w for w in text if not w in stop_words]
        text = " ".join(text)
    
    # Optionally, shorten words to their stems
    if lemmatize_words:
        text = text.split()
        lemmatizer = WordNetLemmatizer() 
        lemmatized_words = [lemmatizer.lemmatize(word) for word in text]
        text = " ".join(lemmatized_words)
    
    # Return a list of words
    return(text)

In [235]:
#clean the review
data["cleaned_review"] = data["Description"].apply(text_cleaning)

In [236]:
data.head()

Unnamed: 0,Note,Description,key,cleaned_review
0,1,Après le chef d'oeuvre super-héroïque The Dark...,0,Apr chef oeuvre super h ro The Dark Knight Chr...
1,1,C’est fou ce qu’on aime détester Christopher N...,1,C fou aime tester Christopher Nolan Plus film ...
2,1,CHEF D’ŒUVRE ! Le film est absolument parfait ...,2,CHEF D UVRE Le film absolument parfait acteurs...
3,1,"Un film aussi novateur que complexe, dont la m...",3,Un film aussi novateur complexe dont mise sc t...
4,1,Christopher Nolan est sûrement l'un des seuls ...,4,Christopher Nolan rement seuls r alisateurs av...


In [237]:
#split features and target from  data 
X = data["cleaned_review"]
y = data.Note.values

In [238]:
# split data into train and validate
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
    shuffle=True,
    stratify=y,
)

In [239]:
# Create a classifier in pipeline
sentiment_classifier = Pipeline(steps=[
                               ('pre_processing',TfidfVectorizer(lowercase=False)),
                                 ('clf',LogisticRegression())])

In [240]:
# train the sentiment classifier 
sentiment_classifier.fit(X_train,y_train)

Pipeline(steps=[('pre_processing', TfidfVectorizer(lowercase=False)),
                ('clf', LogisticRegression())])

In [241]:
# test model performance on valid data 
y_preds = sentiment_classifier.predict(X_valid)

In [242]:
accuracy_score(y_valid,y_preds)

0.7384473197781886

In [243]:
#save model 
import joblib 
joblib.dump(sentiment_classifier, 'sentimentsfrench_model_pipeline.pkl')

['sentimentsfrench_model_pipeline.pkl']