In [0]:
from math import sqrt
from math import exp

import gensim
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.tokenize import RegexpTokenizer
from sklearn import svm
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.dummy import DummyRegressor
from sklearn.preprocessing import StandardScaler

nltk.download('stopwords')
nltk.download('sentiwordnet')
nltk.download('wordnet')
from nltk.corpus import sentiwordnet as swn


def pre_processing(reviews):
    reviews.index = range(len(reviews))
    reviews = lower_case(reviews, "reviews.text")
    reviews = punctuation_tokenizer(data=reviews, col='reviews.text')
    reviews = retire_small_words(data=reviews, col='reviews.tokenize', treshold=3)
    reviews = retire_stop_words(data=reviews, col_to_reduce='reviews.tokenize')
    reviews = retire_numbers(data=reviews, col='reviews.tokenize')
    reviews = stem_reviews(df=reviews, col='reviews.tokenize')
    reviews = lemmatize(data=reviews, col_to_lemmatize='reviews.tokenize')
    return reviews


# Pre Processing
def bag_of_word(df, col, dict_of_words):
    bag_of_words_df = pd.DataFrame(index=range(len(df)), columns=list(dict_of_words.keys()))
    bag_of_words_df.fillna(0, inplace=True)
    bag_of_words_df['sentence_length'] = 0

    index_current_sentence = 0

    for sentence in df[col]:
        bag_of_words_df.at[index_current_sentence, 'sentence_length'] = len(sentence)
        for word in sentence:
            # si le mot n'est pas dans le df --> mot pas important alors on va dans le except
            try:
                if bag_of_words_df.at[index_current_sentence, word] == 0:
                    bag_of_words_df.at[index_current_sentence, word] = 1
                else:
                    bag_of_words_df.at[index_current_sentence, word] = \
                        bag_of_words_df.at[index_current_sentence, word] + 1
            except:
                continue

        index_current_sentence = index_current_sentence + 1

    return bag_of_words_df


def stem_reviews(df, col):
    stemmer = PorterStemmer()
    stemme_list = []
    for sentence in df[col]:
        stemme_list.append([stemmer.stem(word) for word in sentence])

    df[col] = stemme_list
    return df


def retire_numbers(data, col):
    index_current_sentence = 0
    for sentence in data[col]:
        index_current_word = 0
        for word in sentence:
            if bool(re.findall(r'[0-9]+', string=word)):
                sentence.remove(word)

            index_current_word = index_current_word + 1

        index_current_sentence = index_current_sentence + 1
    return data


# lower_case
def lower_case(data, col):
    data[col] = data[col].apply(lambda x: x.lower())
    return data


def punctuation_tokenizer(data, col):
    punctuation_tokenizer = RegexpTokenizer(r'\w+')
    data['reviews.tokenize'] = data[col].apply(lambda review: punctuation_tokenizer.tokenize(review))
    return data


def retire_stop_words(data, col_to_reduce):
    reduced_reviews = []
    stops = set(stopwords.words('english'))

    for review in data[col_to_reduce]:
        reduced_reviews.append(list(word for word in review if word not in stops))

    data[col_to_reduce] = reduced_reviews

    return data


def retire_small_words(data, col, treshold):
    reduced_reviews = []

    for review in data[col]:
        reduced_reviews.append(list(word for word in review if len(word) > treshold))

    data[col] = reduced_reviews

    return data


# Lemmatize
def lemmatize_list(lemmatizer, list_of_word):
    lemmatized_words = []
    for word in list_of_word:
        lemmatized_words.append(lemmatizer.lemmatize(word))

    return lemmatized_words


def lemmatize(data, col_to_lemmatize):
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()

    lemmatize_review = []

    for review in data[col_to_lemmatize]:
        lemmatize_review.append(lemmatize_list(lemmatizer=lemmatizer, list_of_word=review))

    data['reviews.tokenize'] = lemmatize_review

    return data


def load_word2vec_model():
    # model = gensim.models.KeyedVectors.load_word2vec_format('./model/GoogleNews-vectors-negative300.bin', binary=True,limit=500000)
    model = gensim.models.KeyedVectors.load_word2vec_format('wiki-news-300d-1M.vec', binary=False)
    return model


def vectorize_sentence(vectorization_model, reviews, col_to_vectorize, dict_tfidf):
    data = pd.DataFrame(index=range(len(reviews)), dtype=np.float)
    word_tresh_hold = 500

    for word in range(word_tresh_hold):
        data[f'word_vector_{word}'] = 0

    index_current_sentance = 0
    for sentence in reviews[col_to_vectorize]:
        index_current_word = 0
        for word in sentence:
            if index_current_word < word_tresh_hold:
                # todo pca on sentence in stead of word2vec on word
                try:
                    data.at[index_current_sentance, f'word_vector_{index_current_word}'] = \
                        vectorization_model[word].mean() * 100000000 * dict_tfidf[word]
                    index_current_word = index_current_word + 1
                except:
                    index_current_word = index_current_word + 1
                    continue
            else:
                break

        index_current_sentance = index_current_sentance + 1

    normalizer = Normalizer().fit(data)
    normalized_data = normalizer.transform(data)

    normalized_df = pd.DataFrame(normalized_data, index=data.index, columns=data.columns)

    return normalized_df


def reduced_word_vectors_with_PCA(data, n_components):
    pca = PCA(n_components=n_components)
    comments = list(data.iterrows())

    reduced_df = pd.DataFrame(index=data.index)
    for i in range(len(comments)):
        reduced_df[i] = pca.fit_transform(comments[i])

    # pca.fit(list_of_word_vectors)
    return pca.singular_values_


def reduced_vocab_based_on_tfidf(dict_tf_idf, number_of_words_to_keep):
    tfidf_df = pd.DataFrame(dict_tf_idf.items(), index=range(len(dict_tf_idf)))
    tfidf_df.sort_values([1], axis=0, inplace=True, ascending=False)

    # keep only n words with best tfidf
    tfidf_df = tfidf_df[:number_of_words_to_keep]

    reduced_dict_tf_idf = {}

    for word in dict_tf_idf.keys():
        if word in list(tfidf_df[0]):
            reduced_dict_tf_idf[word] = dict_tf_idf[word]

    return reduced_dict_tf_idf


def filter_vocab_based_on_tf_idf(dict_tf_idf, df, col, tf_idf_treshold):
    for sentence in df[col]:
        for word in sentence:
            try:
                if dict_tf_idf[word] <= tf_idf_treshold:
                    sentence.remove(word)
            except:
                continue
    return df


def get_tfidf(df, col, tokenized_text=False):
    corpus = []
    list_of_word = []
    if not tokenized_text:
        df = punctuation_tokenizer(data=df, col=col)

    for sentence in df['reviews.tokenize']:
        joined_sentence = ""
        for word in sentence:
            # filter number
            if not bool(re.findall(r'[0-9]+', string=word)):
                joined_sentence = joined_sentence + " " + word
                list_of_word.append(word)

        corpus.append(joined_sentence)

    tfidf_vector = TfidfVectorizer()
    tfidf = tfidf_vector.fit_transform(corpus)

    tfidf_dict = dict(zip(tfidf_vector.get_feature_names(), tfidf.data))

    return tfidf_dict, corpus


def RMSE(prediction, target):
    rms = sqrt(mean_squared_error(target, prediction))
    return rms


def mean_absolute_percentage_error(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


def dict_words(reviews):
    list_word = {}
    for comment in reviews['reviews.tokenize']:
        for word in comment:
            list_word[word] = 0

    return list_word


def sentiWord(dict_word):
    reduced_dict_with_sent = {}

    for word in dict_word.keys():
        result = list(swn.senti_synsets(word))
        if result:
            reduced_dict_with_sent[word] = {f'pos_score': result[0].pos_score(),
                                            f'neg_score': result[0].neg_score(),
                                            f'obj_score': result[0].obj_score()}
        # else:
        #     dict_word[word] = {f'pos_score': 0,
        #                        f'neg_score': 0,
        #                        f'obj_score': 0}

    return reduced_dict_with_sent


def type_of_word(df):
    list_type = []
    res = nltk.pos_tag(df['Words'])
    for word in res:
        list_type.append(word[1])

    df['Type'] = list_type


def get_df_with_sentim_analysis(senti_dict, df, word2vec_model=None):
    if not word2vec_model:
        for word in df.columns:
            try:
                if senti_dict[word]['pos_score'] >= senti_dict[word]['neg_score']:
                    # on cherche dans le dataframe quand le mot est présent dans un commentaire
                    for index in df[df[word] != 0].index:
                        df.at[index, word] = df.at[index, word] + senti_dict[word]['pos_score'] + senti_dict[word]['obj_score']
                else:
                    for index in df[df[word] != 0].index:
                        df.at[index, word] = - (df.at[index, word] + senti_dict[word]['neg_score'] + senti_dict[word]['obj_score'])
            except:
                continue
    else:
        for word in df.columns:
            try:
                if senti_dict[word]['pos_score'] >= senti_dict[word]['neg_score']:
                    df[word] = df[word] + senti_dict[word]['pos_score'] + word2vec_model[word].mean()
                else:
                    df[word] = - df[word] + senti_dict[word]['neg_score'] + word2vec_model[word].mean()
            except:
                continue

    return df


def vectorize_reviews(word2vec_model, df, col):
    punctuation_tokenizer = RegexpTokenizer(r'\w+')
    df['reviews.to_vectorize'] = df[col].apply(lambda review: punctuation_tokenizer.tokenize(review))
    reviews_vectors = []
    pca = PCA(n_components=1)

    for sentence in df['reviews.to_vectorize']:
        vectorized_words = []
        for word in sentence:
            try:
                vectorized_words.append(word2vec_model[word])
            except:
                continue
        if vectorized_words:
            try:
                pca_result = pca.fit_transform(vectorized_words)
            except:
                pca_result = np.array([0] * len(sentence))

            reviews_vectors.append(sum(pca_result)[0])
        else:
            reviews_vectors.append(0)

    return reviews_vectors


def Normalize(df):
    normalizer = Normalizer().fit(df)
    normalized_data = normalizer.transform(df)
    df = pd.DataFrame(
        normalized_data,
        index=df.index,
        columns=df.columns)
    return df


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authentification Google
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)
# Download du fichier
id = '1jX82w9vyYjtqz_8p_6N5SnEVhI4kyQbR'
downloaded = drive.CreateFile({'id': id}) 
downloaded.GetContentFile('hotel_reviews.csv')

id_model = '1dbvPvUpzCPkN3kUXwm22KUsawvOxHRzq'
downloaded_model = drive.CreateFile({'id': id_model}) 
downloaded_model.GetContentFile('wiki-news-300d-1M.vec')

if __name__ == '__main__':
    reviews = pd.read_csv('http://christophe-rodrigues.fr/hotel_reviews.csv' ,sep=';',encoding='utf-8')
    reviews.drop("Unnamed: 0", inplace=True, axis=1)

    print("Split Train and Test")
    train_set, test_set = train_test_split(reviews, test_size=0.20, shuffle=True)
    train_set.index = range(len(train_set))
    test_set.index = range(len(test_set))

    print("Pre processing")
    pre_processed_trained_reviews = pre_processing(reviews=train_set)
    pre_processed_test_reviews = pre_processing(reviews=test_set)

    dict_tfidf_trained, corpus = get_tfidf(pre_processed_trained_reviews, 'reviews.tokenize', tokenized_text=True)
    dict_tfidf_test, _ = get_tfidf(pre_processed_test_reviews, 'reviews.tokenize', tokenized_text=True)

    dict_words_trained = reduced_vocab_based_on_tfidf(
        dict_tf_idf=dict_tfidf_trained,
        number_of_words_to_keep=1400)

    dict_words_test = reduced_vocab_based_on_tfidf(
        dict_tf_idf=dict_tfidf_test,
        number_of_words_to_keep=1400)

    pos_neg_trained_word_dict = sentiWord(dict_words_trained)
    pos_neg_test_word_dict = sentiWord(dict_words_test)

    train_bag_of_word_df = bag_of_word(
        df=pre_processed_trained_reviews,
        col="reviews.tokenize",
        dict_of_words=pos_neg_trained_word_dict)

    test_bag_of_word_df = bag_of_word(
        df=pre_processed_test_reviews,
        col="reviews.tokenize",
        dict_of_words=pos_neg_trained_word_dict)

    print("load Word2Vec model")
    model = load_word2vec_model()

    print('Vectorizing')
    trained_vectors = vectorize_reviews(df=pre_processed_trained_reviews, col='reviews.text', word2vec_model=model)
    test_vectors = vectorize_reviews(df=pre_processed_test_reviews, col='reviews.text', word2vec_model=model)

    print('Getting sentiment analysis')
    senti_analysis_trained_df = get_df_with_sentim_analysis(
        senti_dict=pos_neg_trained_word_dict,
        df=train_bag_of_word_df)
    senti_analysis_test_df = get_df_with_sentim_analysis(
        senti_dict=pos_neg_test_word_dict,
        df=test_bag_of_word_df)

    senti_analysis_trained_df['vectors'] = trained_vectors
    senti_analysis_test_df['vectors'] = test_vectors

    senti_analysis_trained_df = Normalize(df=senti_analysis_trained_df)
    senti_analysis_test_df = Normalize(df=senti_analysis_test_df)

    print('Training')
    reg = svm.SVR(C=20,epsilon=0.1)
    reg.fit(senti_analysis_trained_df, train_set['reviews.rating'])
    print('Predicting')
    prediction = reg.predict(senti_analysis_test_df)
    target = test_set["reviews.rating"]

    print(f'RMSE : {RMSE(prediction, target)}')
    print(f'MAPE : {mean_absolute_percentage_error(y_pred=prediction, y_true=target)}')
    print(f'Precision : {100 - mean_absolute_percentage_error(y_pred=prediction, y_true=target)}')


Split Train and Test
Pre processing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_gui

load Word2Vec model


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Vectorizing


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Getting sentiment analysis
Training




Predicting
RMSE : 1.136058161207676
MAPE : 36.437041376920284
Precision : 63.562958623079716


In [0]:
# function to process data to be able to train or predict with model
def data_prep(df):
  print("Pre processing")
  pre_processed_reviews = pre_processing(reviews=df)

  # dict_tfidf, corpus = get_tfidf(pre_processed_reviews, 'reviews.tokenize', tokenized_text=True)

  # dict_tfidf = reduced_vocab_based_on_tfidf(
  #     dict_tf_idf=dict_tfidf,
  #     number_of_words_to_keep=1400)

  # pos_neg_trained_word_dict = sentiWord(dict_tfidf)

  bag_of_word_df = bag_of_word(
      df=pre_processed_reviews,
      col="reviews.tokenize",
      dict_of_words=pos_neg_trained_word_dict)

  print('Vectorizing')
  review_vectors = vectorize_reviews(df=pre_processed_reviews, col='reviews.text', word2vec_model=model)

  print('Getting sentiment analysis')
  senti_analysis_df = get_df_with_sentim_analysis(
      senti_dict=pos_neg_trained_word_dict,
      df=bag_of_word_df)

  senti_analysis_df['vectors'] = review_vectors

  senti_analysis_df = Normalize(df=senti_analysis_df)

  return senti_analysis_df

In [0]:
# Evaluation
import pandas as pd
from sklearn.metrics import mean_squared_error
import random

#loading eval datas
column_names = ['reviews.rating','reviews.text']
eval_data = pd.read_csv('http://christophe-rodrigues.fr/eval_reviews.csv', usecols=column_names, sep=";")

#here an example of the simplest possible model
#take reviews in input and return ratings 
def my_random_model(reviews):
  res = []
  for review in reviews:
      res.append(1+4*random.random()) #any real between [1;5]
  return pd.DataFrame(res)

#you need to adapt this line replacing this random model by yours best model.
#take in input all the text reviews (not ratings) and returns predicted ratings.
pre_proccessed_data = data_prep(eval_data)
eval_predicted = reg.predict(pre_proccessed_data)

for i in range(len(eval_predicted)):
  eval_predicted[i] = round(eval_predicted[i])

print(mean_squared_error(eval_predicted,eval_data['reviews.rating']))
#your MSE must be lower that this one, otherwise your are worst than random :-p
#In case your model is a regresssion one, don't forget to round ratings before calling MSE. (It will allow me to compare classification and regression models fairly)