In [4]:
#IMPORTATION DES MODULES
import pandas as pd
import numpy as np
import unicodedata
import contractions
import inflect
import re

from collections import defaultdict

from nltk import pos_tag
from nltk import punkt
from nltk.corpus import stopwords, wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn import model_selection, naive_bayes, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

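##### UNCOMMENT THIS SECTION IF FIRST TIME RUNNING
# import nltk
# nltk.download('punkt')  # tokenizer models used by word_tokenize ('punkt_tab' on recent NLTK releases)
# nltk.download('wordnet')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
#####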

# Seed NumPy's global random number generator so that anything drawing from it
# produces the same values on every run started from a fresh kernel.
np.random.seed(500)

# Plotting library; "darkgrid" is passed as the seaborn style.
import seaborn as sns
sns.set(style = "darkgrid")

In [94]:
# LOAD THE INPUT FILES
# Both files are read without a header row into a single named column.
# The tab separator is presumably chosen so that commas inside a record are
# not treated as delimiters -- TODO confirm against the raw files.
print("\nDataframe des avis")
df_avis = pd.read_csv(
    'Dataset/dataset.csv',
    sep='\t',
    header=None,
    names=['Avis'],
    encoding='utf-8',
)
display(df_avis.head(), df_avis.shape)

print("\nDataframe des scores")
df_score = pd.read_csv(
    'Dataset/labels.csv',
    sep='\t',
    header=None,
    names=['Score'],
    encoding='utf-8',
)
display(df_score.head(), df_score.shape)

# DataFrame.join aligns on the index: row i of the reviews is paired with row i of the scores.
print("\nDataframe merged")
df = df_avis.join(df_score)
display(df.head(), df.shape)


Dataframe des avis


Unnamed: 0,Avis
0,Obviously made to show famous 1950s stripper M...
1,This film was more effective in persuading me ...
2,Unless you are already familiar with the pop s...
3,From around the time Europe began fighting Wor...
4,Im not surprised that even cowgirls get the bl...


(10000, 1)


Dataframe des scores


Unnamed: 0,Score
0,-1
1,-1
2,-1
3,-1
4,-1


(10000, 1)


Dataframe merged


Unnamed: 0,Avis,Score
0,Obviously made to show famous 1950s stripper M...,-1
1,This film was more effective in persuading me ...,-1
2,Unless you are already familiar with the pop s...,-1
3,From around the time Europe began fighting Wor...,-1
4,Im not surprised that even cowgirls get the bl...,-1


(10000, 2)

In [95]:
# Shuffle the rows of the merged frame into a new frame; `df` itself is left untouched.
# An explicit random_state (same value as the one given to np.random.seed above) makes
# this cell return the same order every time it is run, instead of depending on how
# much of NumPy's global random state has already been consumed by earlier executions.
# The index is then rebuilt as 0..n-1 and the shuffled one is discarded.
df2 = shuffle(df, random_state = 500).reset_index(drop = True)
display(df2.head())

Unnamed: 0,Avis,Score
0,It makes sense to me that this film is getting...,-1
1,Kate Beckinsale steals the show! Bravo! Too ba...,1
2,This is probably one of the most original love...,1
3,I saw the new redubbed and edited version yest...,1
4,I had seen this film way back in the 80's and ...,1


In [96]:
# CONSTANTS USED BY THE PREPROCESSING FUNCTIONS
# English stop words provided by NLTK, and the stop words that must be kept anyway.
STOP_WORDS = set(stopwords.words('english'))
STOP_WORDS_EXCEPTIONS = {'not'}

# Lookup from the first letter of a POS tag to a WordNet part of speech;
# any letter without an explicit entry resolves to noun.
POS_TAG_MAP = defaultdict(
    lambda: wn.NOUN,
    {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV},
)
# PREPROCESSING FUNCTIONS
def replace_contractions(document):
    """
    expands the contracted expressions found in a document

    returns the document as rewritten by contractions.fix
    """
    expanded = contractions.fix(document)
    return expanded

def remove_urls(document):
    """
    strips every http / https url out of the document

    returns the document with each matched url replaced by an empty string
    """
    url_pattern = re.compile(
        r'https?://(www\.)?[-\w@:%.\+~#=]{2,256}\.[a-z]{2,6}\b([-\w@:%_\+.~#?&/=;]*)'
    )
    return url_pattern.sub('', document)

def clean_document(document):
    """
    inserts a space after a dot that glues two pieces of text together

    handles, in this order, word.word, word.digit and digit.word;
    a dot between two numbers (e.g. 3.14) is left alone

    returns the document with the missing spaces added
    """
    glued_by_dot = (
        r'([a-zA-Z]+\.*)\.([a-zA-Z]+)',  # word.word
        r'([a-zA-Z]+\.*)\.(\d+)',        # word.digit
        r'(\d+\.*)\.([a-zA-Z]+)',        # digit.word
    )
    for regex in glued_by_dot:
        # re.sub returns the text unchanged when the pattern does not occur
        document = re.sub(regex, r'\1. \2', document)
    return document

def remove_non_ascii(tokens):
    '''
    reduces every token to its ASCII characters

    each token is NFKD-normalized (so accented letters are decomposed),
    encoded to ASCII while dropping whatever cannot be encoded,
    then decoded back to a string

    returns a list with one ASCII-only token per input token
    '''
    ascii_tokens = []
    for token in tokens:
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        ascii_tokens.append(ascii_bytes.decode('utf-8', 'ignore'))
    return ascii_tokens

def split_on_characterset(tokens, regex):
    '''
    cuts every token at each match of the characterset described by the regex

    a token without any match is kept as it is; the pieces of a token
    take its place, in order, in the resulting list

    returns the list of all tokens obtained after the splitting
    '''
    # re.split returns [token] when the regex does not match, so no separate check is needed
    return [piece for token in tokens for piece in re.split(regex, token)]

def to_lowercase(tokens):
    """
    converts every token to lowercase

    returns a list of the lowercased tokens, in the same order
    """
    lowered = []
    for token in tokens:
        lowered.append(token.lower())
    return lowered

def replace_numbers(tokens, engine=None):
    """
    replaces tokens made only of digits by their spelling in words

    the spelled-out number is emitted as one token per word
    (e.g. "21" gives "twenty", "one") instead of a single token
    such as "twenty-one": normalize() splits on hyphens before calling
    this function and remove_punctuation() deletes the punctuation left
    inside a token, so a single token would end up fused as "twentyone"
    and words like "and" inside it would escape the stopword removal

    tokens: list of tokens
    engine: object exposing number_to_words(token); when omitted a new
            inflect engine is created, pass one to reuse it across calls

    returns a list of tokens where every numeric token is replaced by its words
    """
    if engine is None:
        engine = inflect.engine()

    new_tokens = []
    for token in tokens:
        if token.isdigit():
            spelled = engine.number_to_words(token)
            # keep only the words, dropping the commas, hyphens and spaces between them
            new_tokens.extend(re.findall(r'[A-Za-z]+', spelled))
        else:
            new_tokens.append(token)
    return new_tokens

def remove_punctuation(tokens):
    r"""
    deletes from every token the characters that are neither in \w nor in \s,
    which covers all punctuation characters

    tokens left empty by the deletion are dropped

    returns a list of non-empty tokens made only of \w and \s characters
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in tokens)
    return [token for token in stripped if token != '']

def remove_stopwords(tokens, stopwords, exceptions):
    '''
    filters the stopwords (a set) out of tokens (a list),
    keeping the stopwords that also appear in exceptions (a set)

    returns the list of remaining tokens, in their original order
    '''
    to_drop = stopwords - exceptions
    kept = []
    for token in tokens:
        if token not in to_drop:
            kept.append(token)
    return kept

def lemmatize(tokens, lemmatizer, pos_tag_map):
    '''
    lemmatizes every token with the given lemmatizer

    the tokens are POS-tagged first; the first letter of each tag is
    looked up in pos_tag_map to get the part of speech handed to the lemmatizer

    returns the list of lemmatized tokens
    '''
    lemmas = []
    for word, tag in pos_tag(tokens):
        part_of_speech = pos_tag_map[tag[0]]
        lemmas.append(lemmatizer.lemmatize(word, part_of_speech))
    return lemmas
    
def normalize(tokens):
    '''
    runs, one after the other, every preprocessing function
    that works on a list of tokens

    order of the steps: ASCII reduction, lowercasing, splitting on
    / \\ ~ _ and -, number spelling, punctuation removal,
    stopword removal and lemmatization

    returns the list of normalized tokens
    '''
    steps = (
        remove_non_ascii,
        to_lowercase,
        lambda toks: split_on_characterset(toks, r'[/\\~_-]'),
        replace_numbers,
        remove_punctuation,
        lambda toks: remove_stopwords(toks, STOP_WORDS, STOP_WORDS_EXCEPTIONS),
        lambda toks: lemmatize(toks, WordNetLemmatizer(), POS_TAG_MAP),
    )
    # each step receives the output of the previous one
    for step in steps:
        tokens = step(tokens)
    return tokens

def preprocess(document):
    """
    cleans a raw document from end to end

    contractions are expanded, urls removed and dot-glued words separated;
    the text is then tokenized, the tokens normalized and joined back
    with single spaces

    returns the cleaned document as one string
    """
    text = replace_contractions(document)
    text = remove_urls(text)
    text = clean_document(text)
    normalized_tokens = normalize(word_tokenize(text))
    # space-separated tokens, without leading or trailing whitespace
    return " ".join(normalized_tokens).strip()

In [97]:
# Preprocess every review.
# assign() works on a copy, so df2 keeps the raw text.
df_transformed = df2.assign(
    Avis=[preprocess(document) for document in df2['Avis']]
)
display(df_transformed['Avis'].head())

0    make sense film get raf hollywood oftentimes h...
1    kate beckinsale steal show bravo bad knightly ...
2    probably one original love story see age espec...
3    saw new redubbed edit version yesterday love g...
4    see film way back eighty nearly forgotten noti...
Name: Avis, dtype: object

In [92]:
# Keep only the first 6500 preprocessed reviews for the vectorization
df_transformed1 = df_transformed.iloc[0:6500]

# TF-IDF vectorization of the selected reviews
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df_transformed1['Avis'])

# get_feature_names() no longer exists in recent scikit-learn releases, where
# get_feature_names_out() replaces it; fall back to the old name on releases
# that predate the new one.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# NOTE: toarray() materializes a dense (documents x vocabulary) matrix, which can be
# very large; `vectors` holds the same values in sparse form.
df_vectorized = pd.DataFrame(data=vectors.toarray(), columns=feature_names)

In [None]:
# Features and labels must describe the same rows: the TF-IDF matrix `vectors` was
# fitted on df_transformed1 (the first 6500 reviews), so the labels are taken from
# that frame as well. `X` was not defined anywhere before this cell.
X = vectors
y = df_transformed1['Score']

# NOTE(review): as written, the *training* split receives `validation_size` (30 %) of
# the rows and the other split the remaining 70 % -- confirm this is the intended
# proportion before relying on the scores.
validation_size = 0.3
test_size = 1 - validation_size
seed = 30

Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(
    X,
    y,
    train_size = validation_size,
    test_size = test_size,
    random_state = seed
)