In [2]:
# Machine Learning
import joblib
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer

# Data manipulation
import pandas as pd
import numpy as np

# NLP
from gensim.models import Word2Vec, KeyedVectors
# Fichier nlp.py à étudier !
from nlp import text2tokens

# Graphs
import matplotlib.pyplot as plt

# Parallel apply on pandas dataframe
from pandarallel import pandarallel
pandarallel.initialize()






import json
import re
import string

from gensim.models import Word2Vec, KeyedVectors
from nltk.corpus import stopwords

from nlp import text2tokens, text2vec


INFO: Pandarallel will run on 40 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Read the "email" payload from disk.
# JSON files are UTF-8 by convention, so decode explicitly instead of relying
# on the platform's default text encoding.
with open("email.json", "r", encoding="utf-8") as fp:
    email = json.load(fp)

# Data prep

In [4]:
# Extra tokens (contraction fragments, quote marks, ...) that are filtered out
# together with the regular stop words.
GARBAGE = {
    "'s", "n't", "...", "oh", "'m", "'re",
    "'", "''", "'ve", "'ll", "'d", "``",
}
# NLTK's English stop words, extended with the entries above.
STOP_WORDS = set(stopwords.words('english')) | GARBAGE


def clean_tokens(tokens):
    """Lower-case the tokens and drop those made only of punctuation.

    A token is discarded when every one of its characters belongs to
    ``string.punctuation`` (this includes the empty token). Checking each
    character, rather than testing whether the whole token is a substring of
    ``string.punctuation``, also removes multi-character punctuation such as
    ``--``, ``!!`` or ``?!``.

    Args:
        tokens: iterable of string tokens.

    Returns:
        A list with the remaining tokens, lower-cased, in their input order.
    """
    punctuation = set(string.punctuation)
    return [
        token.lower()
        for token in tokens
        # all() is True for an empty token, so empty strings are dropped too.
        if not all(char in punctuation for char in token)
    ]

def remove_stop_words(tokens):
    """Return the tokens that are not in ``STOP_WORDS``, keeping their order."""
    kept = []
    for word in tokens:
        if word in STOP_WORDS:
            continue
        kept.append(word)
    return kept

def sentence2tokens(sentence):
    """Turn a raw sentence into a list of cleaned tokens.

    The sentence is tokenized with ``text2tokens``; the result goes through
    ``clean_tokens`` and then through ``remove_stop_words``.
    """
    return remove_stop_words(clean_tokens(text2tokens(sentence)))

# Chargeons le modèle de NLP

In [5]:
# Load the persisted bundle and unpack its three entries.
# NOTE(review): joblib.load unpickles the file, so only load bundles from a
# trusted source.
# Presumably `ml` is the classifier, `idf` the IDF weights and `wv` the word
# vectors - confirm against the code that wrote the bundle.
nlp_model = joblib.load('nlp_model.joblib')
ml, idf, wv = (nlp_model[key] for key in ("ml", "idf", "wv"))



## Simplifions le calcul du sentiment identifié dans les textes

In [6]:
def compute_sentiment(text, wv, idf, ml, threshold=0.55):
    """Tell whether the classifier ``ml`` rates ``text`` as positive.

    Args:
        text: raw text, handed to ``text2tokens``.
        wv: forwarded unchanged to ``text2vec``.
        idf: forwarded unchanged to ``text2vec``.
        ml: object exposing ``predict_proba``.
        threshold: value that the probability at index 1 must exceed.

    Returns:
        The result of comparing the index-1 probability with ``threshold``.
    """
    # Feature extraction: text -> tokens -> one vector.
    features = text2vec(wv, idf, text2tokens(text))
    # predict_proba receives a single-row 2-D input; keep that row only.
    class_probabilities = ml.predict_proba(features.reshape(1, -1))[0]
    # Index 1 is treated as the positive class.
    return class_probabilities[1] > threshold

# Chargeons le modèle Word2Vec

In [7]:
# Load the Word2Vec model stored in "word2vec.model" and keep its vectors.
# NOTE(review): this rebinds `wv`, which was already assigned from
# nlp_model["wv"]; every later use of `wv` sees these vectors instead.
# Confirm that both hold the same embeddings.
model = Word2Vec.load("word2vec.model")
wv = model.wv

In [8]:
# Score only the first entry of `email`: the loop stops after a single pass.
# `i` stays bound after the loop, and a later cell reads email[i].
for i, entry in enumerate(email):
    appreciation = entry["appreciation"]
    print(appreciation)
    sentiment = compute_sentiment(appreciation, wv, idf, ml)
    print(sentiment)
    break

Without giving too much away, there is a fade to white an hour into the film.
True


In [12]:
# Run one entry through the tokenize / clean / stop-word pipeline.
# NOTE(review): `i` is not defined in this cell; presumably it is the value
# left behind by the earlier `for` loop (0, since that loop breaks after its
# first pass) - confirm, or index explicitly.
appreciation = email[i]["appreciation"]
tokens = sentence2tokens(appreciation)
tokens

['without', 'giving', 'much', 'away', 'fade', 'white', 'hour', 'film']

In [21]:
# Query the word vectors with "bad" as the positive term and the first cleaned
# token as the negative term.
# NOTE(review): the next cell replaces tokens[1], not tokens[0] - confirm
# which token is meant to be used.
similar = wv.most_similar(positive = ["bad"], negative = [tokens[0]])
similar

[('terrible', 0.44631943106651306),
 ('awful', 0.4452797770500183),
 ('good', 0.41658562421798706),
 ('horrible', 0.41330885887145996),
 ('dreadful', 0.3839857578277588),
 ('okay', 0.3715684413909912),
 ('poor', 0.35615599155426025),
 ('dumb', 0.35115286707878113),
 ('dolls', 0.34781792759895325),
 ('scary', 0.3412056863307953)]

In [22]:
# Swap the chosen token for its closest neighbour, then re-score the text.
# The tokens were lower-cased by clean_tokens, while `appreciation` keeps its
# original casing, so the match is case-insensitive. The lookarounds restrict
# the match to whole words; a plain str.replace would also rewrite the token
# when it appears inside a longer word.
target_pattern = r"(?<!\w)" + re.escape(tokens[1]) + r"(?!\w)"
replacement = similar[0][0]
new_appreciation = re.sub(
    target_pattern,
    # A callable keeps backslashes in the replacement from being interpreted.
    lambda _match: replacement,
    appreciation,
    flags=re.IGNORECASE,
)
print(new_appreciation)
sentiment = compute_sentiment(new_appreciation, wv, idf, ml)
sentiment

Without terrible too much away, there is a fade to white an hour into the film.


True