In [10]:
#IMPORTATION DES MODULES
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

import re
import unicodedata
from time import time
from datetime import datetime

import pandas as pd
import numpy as np
import contractions
import inflect

from collections import defaultdict

from nltk import pos_tag
from nltk import punkt
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import shuffle

##### UNCOMMENT THIS SECTION IF FIRT TIME RUNNING
# import nltk
# nltk.download('wordnet')
# nltk.download('stopwords')
#####

# Set seed for random results base calculation
np.random.seed(500)

In [2]:
#LECTURE DES FICHIERS
print("\nDataframe des avis")
df_avis = pd.read_csv('Dataset/dataset.csv', sep = '\t', header = None, names = ['Avis'], encoding ='utf-8')
display(df_avis.head())
display(df_avis.shape)

print("\nDataframe des scores")
df_score = pd.read_csv('Dataset/labels.csv', sep = '\t', header = None, names = ['Score'], encoding ='utf-8')
display(df_score.head())
display(df_score.shape)

print("\nDataframe merged")
df = df_avis.join(df_score)
display(df.head())
display(df.shape)


Dataframe des avis


Unnamed: 0,Avis
0,Obviously made to show famous 1950s stripper M...
1,This film was more effective in persuading me ...
2,Unless you are already familiar with the pop s...
3,From around the time Europe began fighting Wor...
4,Im not surprised that even cowgirls get the bl...


(10000, 1)


Dataframe des scores


Unnamed: 0,Score
0,-1
1,-1
2,-1
3,-1
4,-1


(10000, 1)


Dataframe merged


Unnamed: 0,Avis,Score
0,Obviously made to show famous 1950s stripper M...,-1
1,This film was more effective in persuading me ...,-1
2,Unless you are already familiar with the pop s...,-1
3,From around the time Europe began fighting Wor...,-1
4,Im not surprised that even cowgirls get the bl...,-1


(10000, 2)

In [3]:
# Copie du DF originale
df2 = shuffle(df)
# Réinitialisation des index
#df2.reset_index()
df2.reset_index(inplace = True, 
                drop = True)
display(df2.head())

Unnamed: 0,Avis,Score
0,After having read two or three negative review...,1
1,I recently (May 2008) discovered that this chi...,1
2,"Pathetic is the word. Bad acting, pathetic scr...",-1
3,Spencer Tracy and Katherine Hepburn would roll...,-1
4,This in my opinion is one of the best action m...,1


In [4]:
#DÉFINITION DES CONSTANTES UTILISÉES
STOP_WORDS = set(stopwords.words('english'))
STOP_WORDS_EXCEPTIONS = set(('not',))

#dictionnaire des POS-Tags
POS_TAG_MAP = defaultdict(lambda : wn.NOUN)
POS_TAG_MAP['J'] = wn.ADJ
POS_TAG_MAP['V'] = wn.VERB
POS_TAG_MAP['R'] = wn.ADV

#Seeds utilisé pour le CV score et la génération des training/test sets
TTS_SEED = 30 #training_test_split seed
CV_SEED = 7 #cross_val_score seed

#Métrique utilisé pour évaluer les classifieurs lors du CV score et GridSearchCV
SCORING = 'accuracy'

#DÉFINITION DES FONCTIONS DE PRÉTRAITEMENTS
def replace_contractions(document):
    """
    replaces contracted expressions in a document
    
    returns document with no contracted expressions
    """
    return contractions.fix(document)

def remove_urls(document):
    """
    removes all urls in the document
    
    return a document without any urls
    """
    return re.sub(r'https?://(www\.)?[-\w@:%.\+~#=]{2,256}\.[a-z]{2,6}\b([-\w@:%_\+.~#?&/=;]*)', '', document)

def clean_document(document):
    word_word = r'([a-zA-Z]+\.*)\.([a-zA-Z]+)'
    word_digit = r'([a-zA-Z]+\.*)\.(\d+)'
    digit_digit = r'(\d+\.*)\.([a-zA-Z]+)'
    patterns = [
        word_word, #word.word pattern
        word_digit, #word.digit pattern
        digit_digit, #digit.word pattern
    ]
    
    for pattern in patterns:
        if re.search(pattern, document):
            document = re.sub(pattern, r'\1. \2', document)
    
    return document

def remove_non_ascii(tokens):
    '''
    normalizes the tokens
    encodes tokens as ASCII characters from tokens
    and decodes as utf-8
    
    returns a list of normalized and encoded as ascii tokens
    '''
    return [unicodedata.normalize('NFKD', token)
           .encode('ascii', 'ignore')
           .decode('utf-8', 'ignore')
           for token in tokens]

def split_on_characterset(tokens, regex):
    '''
    splits a token in tokens upon matching with the characterset defined by the regex
    appends the tokens obtained from splitting the token to the tokens list
    
    returns a list of all tokens obtained after splitting problematic tokens
    '''
    new_tokens = []
    for token in tokens:
        if re.search(regex, token) :
            new_tokens += re.split(regex, token)
        else:
            new_tokens.append(token)
    return new_tokens

def to_lowercase(tokens):
    """returns a list of tokens in lowercase"""
    return [token.lower() for token in tokens]

def replace_numbers(tokens):
    """
    replaces tokens representing whole numeric values
    by their equivalent letter values
    
    returns a list of transformed tokens
    """
    engine = inflect.engine()
    new_tokens = []
    for token in tokens:
        new_token = token
        if token.isdigit():
            new_token = engine.number_to_words(token)
        new_tokens.append(new_token)
    return new_tokens

def remove_punctuation(tokens):
    """
    removes tokens not in \w and \s classes of characters.
    by extension, all punctuation characters will be removed
    
    returns a list of tokens only in \w and \s
    """
    new_tokens = []
    for token in tokens:
        new_token = re.sub(r'[^\w\s]', '', token)
        if new_token != '':
            new_tokens.append(new_token)
    return new_tokens

def remove_stopwords(tokens, stopwords, exceptions):
    '''
    removes all stopwords (a set) from tokens (a list)
    except those in exceptions (a set)
    
    returns a list of tokens that are not stopwords
    '''
    stop = stopwords - exceptions
    return [token for token in tokens if token not in stop]

def lemmatize(tokens, lemmatizer, pos_tag_map):
    '''
    lematizes all tokens using a lemmatizer and a POS-Tagging map
    
    returns the list of lemmatized tokens
    '''
    return [lemmatizer.lemmatize(token, pos_tag_map[tag[0]]) for token, tag in pos_tag(tokens)]
    
def normalize(tokens):
    '''
    normalizes all the tokens by using all preprocessing
    functions taking a list of tokens as input
    
    returns the list of normalized tokens
    '''
    tokens = remove_non_ascii(tokens)
    tokens = to_lowercase(tokens)
    tokens = split_on_characterset(tokens, r'[/\\~_-]')
    tokens = replace_numbers(tokens)
    tokens = remove_punctuation(tokens)
    tokens = remove_stopwords(tokens, STOP_WORDS, STOP_WORDS_EXCEPTIONS)
    tokens = lemmatize(tokens, WordNetLemmatizer(), POS_TAG_MAP)
    return tokens

def preprocess(document):
    document = replace_contractions(document) #remplacement des contractions dans le document
    document = remove_urls(document) #supprimer les urls dans le document
    document = clean_document(document)
    tokens = word_tokenize(document) #list des tokens du document
    tokens = normalize(tokens) #list des tokens normalisés du document
    document = ''.join([" " + token for token in tokens]).strip() #rejoindre les tokens normalisés pour obtenir le document nettoyé 
    return document

In [5]:
#Preprocessing dataset
df_transformed = df2.copy()
df_transformed['Avis'] = [preprocess(document) for document in df_transformed['Avis']]
display(df_transformed['Avis'].head())

0    read two three negative review main page imdb ...
1    recently may two thousand and eight discover c...
2    pathetic word bad act pathetic script cheezy d...
3    spencer tracy katherine hepburn would roll gra...
4    opinion one best action movie 1970s not featur...
Name: Avis, dtype: object

In [6]:
#splitting dataset
df_transformed1 = df_transformed.iloc[:4000]

#Vectorizing
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df_transformed1['Avis'])

In [7]:
#Choix des features et des classes
X = vectors.toarray()
y = df_transformed1['Score']

#Création du dictionnaire contenant les classifieurs et leurs paramètres par défaut
models = {
    #'LogisticRegression': LogisticRegression(),
    #'SGDClassifier': SGDClassifier(),
    #'DecisionTreeClassifier': DecisionTreeClassifier(),
    'RandomForestClassifier', RandomForestClassifier(),
    #'GaussianNB': GaussianNB(),
    #'KNeighborsClassifier': KNeighborsClassifier(),
    #'SVC': SVC(),
    'LinearSVC': LinearSVC()
}

#configuration des paramètres utilisés par la cross validation
k_fold = KFold(n_splits=10, shuffle=True, random_state=CV_SEED)

#cross validation sur l'ensemble des classifieurs choisis
#en utilisant la mesure accuracy
for name, model in models.items():
    print('Cross validation a commencé à {}'.format(datetime.now()))
    start_time = time()
    cv_score = cross_val_score(model, X, y, cv=k_fold, scoring=SCORING)
    output = """
    Temps pris pour la cross validation de {} : {} secondes
    Accuracy scores pour les 10 évaluations : {}
    Score moyen : {}
    Écart type des scores : {}
    """.format(name, time() - start_time, cv_score, cv_score.mean(), cv_score.std())
    print(output)


    Temps pris pour la cross validation de KNeighborsClassifier : 653.8217942714691 secondes
    Accuracy scores pour les 10 évaluations : [0.785  0.77   0.8025 0.7575 0.79   0.765  0.7875 0.7975 0.7925 0.7725]
    Score moyen : 0.7820000000000001
    Écart type des scores : 0.014133294025102577
    

    Temps pris pour la cross validation de SVC : 4856.297691583633 secondes
    Accuracy scores pour les 10 évaluations : [0.495  0.4925 0.4925 0.4875 0.45   0.47   0.485  0.4575 0.5    0.47  ]
    Score moyen : 0.48
    Écart type des scores : 0.016201851746019645
    


In [None]:
validation_size = 0.3
test_size = 1 - validation_size
seed = 30

Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    X,
    y,
    train_size = validation_size,
    test_size = test_size,
    random_state = seed
)