In [None]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from collections import Counter
import numpy as np
import os
import warnings
import re
import time
import itertools
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

import gensim #word2vec
from gensim.models import Word2Vec

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential 
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM 
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras import utils


In [None]:
# List the variables currently defined in the kernel namespace
whos

In [None]:
# NOTE(review): in the visible notebook `df` is first assigned in the data-loading
# cell further down, so running this cell on a fresh kernel raises NameError.
len(df)

# Load the data

In [None]:

# Column names are passed explicitly through `names`, so pandas reads the
# first line of the file as data rather than as a header.
names = ["target", "ids", "date", "flag", "user", "text"]
df = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    names = names,
    encoding = "latin-1",
)
print("Dataset size :", len(df))

In [None]:
df.head()

Map target label to String
- 0 -> NEGATIVE
- 2 -> NEUTRAL
- 4 -> POSITIVE

In [None]:

# Numeric target code -> human-readable sentiment label.
decode_map = {
    0: "NEGATIVE",
    2: "NEUTRAL",
    4: "POSITIVE",
}
# A code that is not a key of decode_map raises KeyError (this also happens if
# the cell is run a second time, because the column then already holds strings).
df.target = df.target.apply(lambda code: decode_map[code])
df.head()

In [None]:
# Number of rows per sentiment label.
target_cnt = Counter(df.target)

# One bar per label, bar height = number of rows with that label.
plt.figure(figsize = (10, 6))
plt.bar(target_cnt.keys(), target_cnt.values())
plt.title("Dataset labels distributions")
plt.show()

## Pre-process data 

Removing stop words can reduce model performance. Words such as 'not' are part of the stop-word list, and dropping them would make sentences like 'this was good' and 'this was not good' receive the same prediction. For that reason 'not' is removed from the stop-word list below.

In [None]:
# Kept as a set: preprocess() tests membership once per token, and a set
# lookup is O(1) whereas scanning the original list is O(len(stop_words)).
stop_words = set(stopwords.words("english"))
# Keep the negation word so "good" and "not good" stay distinguishable.
stop_words.remove("not")
stemmer = SnowballStemmer("english")

In [None]:

def preprocess(text,stem = False) :
    """Clean one text for tokenisation.

    The input is converted with ``str``, lower-cased, and every match of the
    pattern (``@``-mentions, ``http:``/``https:`` URLs, and any run of
    characters outside ``A-Za-z0-9``) is replaced by a single space. Tokens
    found in the module-level ``stop_words`` are then dropped.

    Parameters
    ----------
    text : any
        Value to clean; it is passed through ``str`` first.
    stem : bool, default False
        When true, each remaining token is stemmed with the module-level
        ``stemmer``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    # NOTE(review): `https?:\S+` already matches anything starting with
    # "http:", so the `http?:\S` alternative only fires on an "htt:" prefix.
    pattern = r'@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+'
    cleaned = re.sub(pattern, " ", str(text).lower()).strip()

    tokens = [token for token in cleaned.split() if token not in stop_words]
    if stem :
        tokens = [stemmer.stem(token) for token in tokens]

    return " ".join(tokens)
    
# Clean every tweet in place (no stemming: `stem` keeps its default of False).
df.text = df.text.apply(preprocess)
df.head()

# split train/test 

In [None]:
# 80/20 split with a fixed seed so the partition is reproducible.
df_train, df_test = train_test_split(
    df,
    test_size = 0.2,
    random_state = 42,
)
print("Train size :", len(df_train))
print("Test size :", len(df_test))

# Word2Vec

Word2Vec va nous servir pour la couche d'embedding, donc pas tout de suite

In [None]:

# Word2Vec expects an iterable of token lists: one list of words per training text.
documents = [tweet.split() for tweet in df_train.text]

# Word2Vec hyper-parameters.
W2V_SIZE = 300       # dimension of each word vector
W2V_WINDOW = 7       # context window
W2V_EPOCH = 32       # passed as `epochs` to w2v_model.train in a later cell
W2V_MIN_COUNT = 10   # words seen fewer times than this are ignored

# Build the model without a corpus, then scan the training texts to build its vocabulary.
w2v_model = Word2Vec(
    size = W2V_SIZE,
    window = W2V_WINDOW,
    min_count = W2V_MIN_COUNT,
    workers = 8,
)
w2v_model.build_vocab(documents)

In [None]:
# Words retained by Word2Vec.
# NOTE: `vocab_size` is rebound to the tokenizer vocabulary size in the tokenizer cell below.
words = w2v_model.wv.vocab.keys()
vocab_size = len(words)
print("Vocab size :", vocab_size)

In [None]:

w2v_model.train(documents, total_examples = len(documents), epochs = W2V_EPOCH)

In [None]:
# Query through the KeyedVectors object (`.wv`): the model-level `most_similar`
# shortcut is deprecated in gensim 3 and no longer exists in gensim 4.
w2v_model.wv.most_similar("love")

# Tokenize Text

In [None]:

# Fit the tokenizer on the training text only: every distinct word gets an
# integer id (for example "good" -> 65645).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train.text)

# One extra slot on top of the known words. The original comment attributes it
# to unknown (UNK) words. NOTE(review): Keras word ids start at 1 with 0 left
# for padding, so the extra row is presumably the padding row - confirm.
vocab_size = 1 + len(tokenizer.word_index)
print("Total words", vocab_size)

In [None]:


SEQUENCE_LENGTH = 300

# Each text becomes the sequence of its tokenizer word ids; every sequence is
# then padded to SEQUENCE_LENGTH so that all rows have the same length.
x_train = pad_sequences(
    tokenizer.texts_to_sequences(df_train.text),
    maxlen = SEQUENCE_LENGTH,
)
x_test = pad_sequences(
    tokenizer.texts_to_sequences(df_test.text),
    maxlen = SEQUENCE_LENGTH,
)

# Label Encoder

0 correspond au sentiment négatif et 1 au sentiment positif (le LabelEncoder trie les classes par ordre alphabétique : NEGATIVE → 0, POSITIVE → 1). Il faut que chaque élément soit sous forme de liste [1] ou [0] pour que Keras fonctionne.

In [None]:
# Learn the label -> integer mapping from the training labels only.
encoder = LabelEncoder()
encoder.fit(df_train.target.tolist())

# Encode both splits and reshape to column vectors of shape (n_samples, 1).
y_train = encoder.transform(df_train.target.tolist()).reshape(-1, 1)
y_test = encoder.transform(df_test.target.tolist()).reshape(-1, 1)

print("y_train", y_train.shape)
print("y_test", y_test.shape)

In [None]:
# Features and labels of a split must have the same number of rows.
for name, array in (("x_train", x_train), ("y_train", y_train)):
    print(name, array.shape)
print()
for name, array in (("x_test", x_test), ("y_test", y_test)):
    print(name, array.shape)

# Embedding layer

on se sert de word2vec

In [None]:
# One row per tokenizer word id, one column per Word2Vec dimension.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))

# tokenizer.word_index maps word -> integer id. Copy the Word2Vec vector of
# every word the Word2Vec model knows into the row of that id; words it does
# not know keep their all-zero row.
for word, word_id in tokenizer.word_index.items():
    if word not in w2v_model.wv:
        continue
    embedding_matrix[word_id] = w2v_model.wv[word]

print(embedding_matrix.shape)

In [None]:
# Embedding layer initialised with the Word2Vec matrix and frozen
# (trainable = False), so training does not update the word vectors.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights = [embedding_matrix],
    input_length = SEQUENCE_LENGTH,
    trainable = False,
)

# Model

In [None]:
# Frozen embeddings -> dropout -> LSTM -> one sigmoid unit (binary output).
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout = 0.2, recurrent_dropout = 0.2),
    Dense(1, activation = 'sigmoid'),
])

model.summary()

In [None]:
# Binary cross-entropy matches the single sigmoid output unit of the model.
model.compile(
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = ["accuracy"],
)

In [None]:
# The monitored key must match the name Keras logs for the metric. The history
# cell below reads 'accuracy' / 'val_accuracy', so the validation metric is
# "val_accuracy"; with the former "val_acc" the key is missing from the logs
# and early stopping cannot act on it.
# NOTE(review): with patience = 5 and the 5-epoch run configured in the
# training cell, early stopping will in practice not fire.
callbacks = [
    ReduceLROnPlateau(monitor = "val_loss", patience = 3, cooldown = 0),
    EarlyStopping(monitor = "val_accuracy", min_delta = 1e-4, patience = 5),
]

# Train

In [None]:

EPOCHS = 5
BATCH_SIZE = 1024

# 10% of the training rows are held out by Keras as validation data; the
# returned History object is used by the plotting cell further down.
history = model.fit(
    x_train,
    y_train,
    epochs = EPOCHS,
    batch_size = BATCH_SIZE,
    validation_split = 0.1,
    callbacks = callbacks,
    verbose = 1,
)

# Evaluate

In [None]:
%%time 
score = model.evaluate(x_test,y_test, batch_size = BATCH_SIZE)
print()
print("ACCURACY :",score[1])
print("LOSS :",score[0])

In [None]:
# Per-epoch curves recorded by model.fit.
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(len(acc))

# Figure 1: accuracy (training in blue, validation in red).
plt.plot(epochs, acc, 'b', label = 'Training acc')
plt.plot(epochs, val_acc, 'r', label = 'validation_acc')
plt.title('Training and validation accuracy')
plt.legend()

# Figure 2: loss (training in blue, validation in red).
plt.figure()
plt.plot(epochs, loss, 'b', label = 'Training loss')
plt.plot(epochs, val_loss, 'r', label = 'Validation loss')
plt.title("Training and validation loss")
plt.legend()

plt.show()

In [None]:
# (upper bound of NEGATIVE, lower bound of POSITIVE); read at call time, so
# rebinding this tuple later changes what decode_sentiment returns.
SENTIMENT_THRESHOLDS = (0.4, 0.7)
SEQUENCE_LENGTH = 300


def decode_sentiment(score, include_neutral = True):
    """Map a model score to a sentiment label.

    Parameters
    ----------
    score : number
        Model output to classify.
    include_neutral : bool, default True
        If true, use the module-level ``SENTIMENT_THRESHOLDS``: scores at or
        below the first threshold are "NEGATIVE", scores at or above the
        second are "POSITIVE", everything else is "NEUTRAL". If false, split
        at 0.5 into "NEGATIVE" (strictly below) and "POSITIVE".

    Returns
    -------
    str
        "NEGATIVE", "NEUTRAL" or "POSITIVE".
    """
    if not include_neutral :
        if score < 0.5 :
            return "NEGATIVE"
        return "POSITIVE"

    if score <= SENTIMENT_THRESHOLDS[0] :
        return "NEGATIVE"
    if score >= SENTIMENT_THRESHOLDS[1] :
        return "POSITIVE"
    return "NEUTRAL"
    
    

In [None]:
# Raw sentences must go through the same preprocessing as the training data
# before they are given to the model; this helper takes care of it.
def predict(text, include_neutral = True) :
    """Predict the sentiment of one raw text.

    The text is cleaned with ``preprocess`` (as the training tweets were),
    converted to word ids with the fitted ``tokenizer``, padded to
    ``SEQUENCE_LENGTH`` and scored by ``model``.

    Parameters
    ----------
    text : str
        Raw, un-preprocessed sentence.
    include_neutral : bool, default True
        Forwarded to ``decode_sentiment``.

    Returns
    -------
    dict
        ``{"label": <label from decode_sentiment>, "score": <float model
        output>, "elapsed time": <seconds spent in this call>}``.
    """
    start_at = time.time()

    # Same cleaning as at training time, so mentions, URLs and punctuation are
    # handled identically for training and inference inputs.
    cleaned = preprocess(text)

    # Tokenize the sentence, then pad the sequence to the model's input length.
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen = SEQUENCE_LENGTH)

    # One input row gives one output row holding a single value; extract it as
    # a plain Python float instead of passing a 1-element array around.
    score = float(np.ravel(model.predict(padded))[0])

    label = decode_sentiment(score, include_neutral = include_neutral)

    return {"label" : label , "score": score,
           "elapsed time": time.time()-start_at}


In [None]:
predict("I love the music")

In [None]:
predict("I hate the rain")


In [None]:
predict("i don't know what i'm doing")

In [None]:
predict("The euroscepticism gain european countries",include_neutral = False)

# Confusion matrix

In [None]:
%%time 
y_pred_1d = []
y_test_1d = list(df_test.target)
scores = model.predict(x_test, verbose = 1, batch_size = 8000) #donne une proba
y_pred_1d = [decode_sentiment(score, include_neutral = False) for score in scores] #transforme en positive/negative/neutral

In [None]:
def plot_confusion_matrix(cm, classes, title = 'Confusion matrix', cmap = plt.cm.Blues) :
    """Draw a row-normalised confusion matrix on the current matplotlib figure.

    Each row of ``cm`` is divided by its sum, the result is shown as an image
    and every cell is annotated with its value to two decimals.

    Parameters
    ----------
    cm : numpy.ndarray
        Confusion matrix of counts; rows are labelled "True label" and
        columns "Predict label".
    classes : sequence
        Tick labels, in the same order as the rows/columns of ``cm``.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colour map passed to ``imshow``.
    """
    # Normalise each row by its total.
    row_totals = cm.sum(axis = 1)[:, np.newaxis]
    cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation = 'nearest', cmap = cmap)
    plt.title(title, fontsize = 30)
    plt.colorbar()

    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation = 90, fontsize = 32)
    plt.yticks(positions, classes, fontsize = 22)

    # Cells above half of the maximum get white text, the others black.
    half_max = cm.max() / 2
    for row in range(cm.shape[0]) :
        for col in range(cm.shape[1]) :
            value = cm[row, col]
            text_color = "white" if value > half_max else "black"
            plt.text(col, row, format(value, '.2f'),
                     horizontalalignment = 'center',
                     color = text_color)

    plt.ylabel("True label", fontsize = 25)
    plt.xlabel("Predict label", fontsize = 25)

In [None]:
# NOTE(review): `cnf_matrix` is assigned in the next cell, so this cell raises
# NameError when the notebook is run top to bottom on a fresh kernel.
cnf_matrix

In [None]:
%%time 

cnf_matrix = confusion_matrix(y_test_1d,y_pred_1d) #donne le résultat de la matrice 
plt.figure(figsize = (12,12))
plot_confusion_matrix(cnf_matrix, classes = df_train.target.unique(), title = "Confusion matrix")
plt.show()

# Classification report

In [None]:
print(classification_report(y_test_1d, y_pred_1d))

# Accuracy Score

In [None]:
accuracy_score(y_test_1d, y_pred_1d)

# Save model

The models are saved for use later.

You can load the models and then use the predict function to predict sentiment for the text.

Keep in mind that you need to preprocess the text and encode it before prediction.

In [None]:
# EXPORT
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

model.save(KERAS_MODEL)
w2v_model.save(WORD2VEC_MODEL)

# Context managers close (and flush) the files even if pickling fails; the
# bare open(...) calls used before left the handles open.
with open(TOKENIZER_MODEL, "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol = 0)
with open(ENCODER_MODEL, "wb") as encoder_file:
    pickle.dump(encoder, encoder_file, protocol = 0)

# Load model 

We load the saved model, tokenizer, label encoder and Word2Vec model so that we can keep using them (or continue training) without redoing the steps above.

In [None]:
KERAS_MODEL = "model.h5"
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"

# Paths come from the constants above so that the save and load cells cannot
# drift apart.
model = keras.models.load_model(KERAS_MODEL)

# SECURITY: pickle.load executes code embedded in the file; only load pickles
# that were produced by this notebook. The `with` blocks close the handles.
with open(TOKENIZER_MODEL, "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)
with open(ENCODER_MODEL, "rb") as encoder_file:
    encoder = pickle.load(encoder_file)

w2v_model = Word2Vec.load(WORD2VEC_MODEL)

In [None]:
predict("The country is losing a lot of money unfortunately")

In [None]:
predict("UAE is in good shape")

In [None]:
predict("How the EU is helping railways ride out Covid-19", False)

In [None]:
predict("France chaos: Macron faces Frexit demands after EU 'abandoned' states during pandemic",False)

In [None]:
# Next step: predict the tweets of my own dataset and see how well it works


In [None]:
df_test = pd.read_csv("sentiment_analysis_english.csv")

In [None]:
df_test.shape

In [None]:
df_test.head()

In [None]:
# (upper bound of class 0, lower bound of class 4). The decoders read this
# tuple at call time, so rebinding it affects every later call.
SENTIMENT_THRESHOLDS = (0.4, 0.7)
SEQUENCE_LENGTH = 300


def decode_sentiment_test(score, include_neutral = True):
    """Map a model score to a numeric sentiment code.

    Parameters
    ----------
    score : number
        Model output to classify.
    include_neutral : bool, default True
        If true, scores at or below ``SENTIMENT_THRESHOLDS[0]`` give 0,
        scores at or above ``SENTIMENT_THRESHOLDS[1]`` give 4 and everything
        else gives 2. If false, scores strictly below 0.5 give 0, the rest 4.

    Returns
    -------
    int
        0, 2 or 4.
    """
    if not include_neutral :
        if score < 0.5 :
            return 0
        return 4

    if score <= SENTIMENT_THRESHOLDS[0] :
        return 0
    if score >= SENTIMENT_THRESHOLDS[1] :
        return 4
    return 2
    
    
def predict_test(text, include_neutral = True) :
    """Predict the numeric sentiment code of one raw text.

    The text is cleaned with ``preprocess`` (as the training tweets were),
    converted to word ids with the fitted ``tokenizer``, padded to
    ``SEQUENCE_LENGTH`` and scored by ``model``.

    Parameters
    ----------
    text : str
        Raw, un-preprocessed sentence.
    include_neutral : bool, default True
        Forwarded to ``decode_sentiment_test``.

    Returns
    -------
    The code returned by ``decode_sentiment_test`` for the model score.
    """
    # Same cleaning as at training time, so mentions, URLs and punctuation are
    # handled identically for training and inference inputs.
    cleaned = preprocess(text)

    # Tokenize the sentence, then pad the sequence to the model's input length.
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen = SEQUENCE_LENGTH)

    # One input row gives one output row holding a single value; decode it as
    # a plain Python float rather than a 1-element array.
    score = float(np.ravel(model.predict(padded))[0])

    return decode_sentiment_test(score, include_neutral = include_neutral)


In [None]:
predict_test(df_test['text'].iloc[2])

In [None]:
predict_test('I am fine')

In [None]:
# Number of rows whose predicted code equals the reference label.
Good_predictions_total = 0

# Iterate over every row of df_test instead of a hard-coded 250, so the cell
# neither overruns a shorter file nor silently skips rows of a longer one.
for i in range(len(df_test)):
    predicted = predict_test(df_test['text'].iloc[i])
    if df_test['label'].iloc[i] == predicted:
        Good_predictions_total += 1

In [None]:
Good_predictions_total

In [None]:
# Wider neutral band than before. The decoders read this tuple at call time,
# so rebinding it is what changes the classification.
SENTIMENT_THRESHOLDS = (0.2, 0.8)
SEQUENCE_LENGTH = 300


def decode_sentiment_test(score, include_neutral = True):
    """Map a model score to a numeric sentiment code.

    Parameters
    ----------
    score : number
        Model output to classify.
    include_neutral : bool, default True
        If true, scores at or below ``SENTIMENT_THRESHOLDS[0]`` give 0,
        scores at or above ``SENTIMENT_THRESHOLDS[1]`` give 4 and everything
        else gives 2. If false, scores strictly below 0.5 give 0, the rest 4.

    Returns
    -------
    int
        0, 2 or 4.
    """
    if not include_neutral :
        if score < 0.5 :
            return 0
        return 4

    if score <= SENTIMENT_THRESHOLDS[0] :
        return 0
    if score >= SENTIMENT_THRESHOLDS[1] :
        return 4
    return 2
    
    
def predict_test(text, include_neutral = True) :
    """Predict the numeric sentiment code of one raw text.

    The text is cleaned with ``preprocess`` (as the training tweets were),
    converted to word ids with the fitted ``tokenizer``, padded to
    ``SEQUENCE_LENGTH`` and scored by ``model``.

    Parameters
    ----------
    text : str
        Raw, un-preprocessed sentence.
    include_neutral : bool, default True
        Forwarded to ``decode_sentiment_test``.

    Returns
    -------
    The code returned by ``decode_sentiment_test`` for the model score.
    """
    # Same cleaning as at training time, so mentions, URLs and punctuation are
    # handled identically for training and inference inputs.
    cleaned = preprocess(text)

    # Tokenize the sentence, then pad the sequence to the model's input length.
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen = SEQUENCE_LENGTH)

    # One input row gives one output row holding a single value; decode it as
    # a plain Python float rather than a 1-element array.
    score = float(np.ravel(model.predict(padded))[0])

    return decode_sentiment_test(score, include_neutral = include_neutral)


In [None]:
# Number of rows whose predicted code equals the reference label.
Good_predictions_total = 0

# Predicted code for every row of df_test, in row order.
predicted_list = []

# Iterate over every row of df_test instead of a hard-coded 250: the cells
# below pair predicted_list with the full df_test['label'] column, which
# requires one prediction per row.
for i in range(len(df_test)):
    predicted = predict_test(df_test['text'].iloc[i])
    predicted_list.append(predicted)
    if df_test['label'].iloc[i] == predicted:
        Good_predictions_total += 1

In [None]:
Good_predictions_total

In [None]:
print(classification_report(df_test['label'], predicted_list))

In [None]:
# Text, reference label and predicted code side by side, one row per tweet.
df_f = pd.DataFrame({
    'text': df_test.text,
    'label': df_test.label,
    'pred': predicted_list,
})

In [None]:
# None disables column-width truncation so full tweets are visible; the former
# value -1 is deprecated and rejected by current pandas versions.
pd.set_option('display.max_colwidth', None)
# Rows where the prediction differs from the reference label.
df_f[df_f.label != df_f.pred]

In [None]:
predict_test('I am happy')

In [None]:
predict_test('I am angry')

In [None]:
predict_test('what are you talking about ?')

In [None]:
predict_test('I have to be fine, but actually I am not')

In [None]:
predict('We are not safe anymore')

In [None]:
predict_test('We are not safe anymore')

In [None]:
# Higher bar for class 4 than before. The decoders read this tuple at call
# time, so rebinding it is what changes the classification.
SENTIMENT_THRESHOLDS = (0.2, 0.85)
SEQUENCE_LENGTH = 300


def decode_sentiment_test(score, include_neutral = True):
    """Map a model score to a numeric sentiment code.

    Parameters
    ----------
    score : number
        Model output to classify.
    include_neutral : bool, default True
        If true, scores at or below ``SENTIMENT_THRESHOLDS[0]`` give 0,
        scores at or above ``SENTIMENT_THRESHOLDS[1]`` give 4 and everything
        else gives 2. If false, scores strictly below 0.5 give 0, the rest 4.

    Returns
    -------
    int
        0, 2 or 4.
    """
    if not include_neutral :
        if score < 0.5 :
            return 0
        return 4

    if score <= SENTIMENT_THRESHOLDS[0] :
        return 0
    if score >= SENTIMENT_THRESHOLDS[1] :
        return 4
    return 2
    
    
def predict_test(text, include_neutral = True) :
    """Predict the numeric sentiment code of one raw text.

    The text is cleaned with ``preprocess`` (as the training tweets were),
    converted to word ids with the fitted ``tokenizer``, padded to
    ``SEQUENCE_LENGTH`` and scored by ``model``.

    Parameters
    ----------
    text : str
        Raw, un-preprocessed sentence.
    include_neutral : bool, default True
        Forwarded to ``decode_sentiment_test``.

    Returns
    -------
    The code returned by ``decode_sentiment_test`` for the model score.
    """
    # Same cleaning as at training time, so mentions, URLs and punctuation are
    # handled identically for training and inference inputs.
    cleaned = preprocess(text)

    # Tokenize the sentence, then pad the sequence to the model's input length.
    padded = pad_sequences(tokenizer.texts_to_sequences([cleaned]), maxlen = SEQUENCE_LENGTH)

    # One input row gives one output row holding a single value; decode it as
    # a plain Python float rather than a 1-element array.
    score = float(np.ravel(model.predict(padded))[0])

    return decode_sentiment_test(score, include_neutral = include_neutral)

In [None]:
# Number of rows whose predicted code equals the reference label.
Good_predictions_total = 0

# Iterate over every row of df_test instead of a hard-coded 250, so the cell
# neither overruns a shorter file nor silently skips rows of a longer one.
for i in range(len(df_test)):
    predicted = predict_test(df_test['text'].iloc[i])
    if df_test['label'].iloc[i] == predicted:
        Good_predictions_total += 1

In [None]:
Good_predictions_total

In [None]:
# translator

from googletrans import Translator, constants
from pprint import pprint

# init the Google API translator
translator = Translator()


translation = translator.translate("والله من الضروري يكون فيه اصلاحات عشان المواطن يتنفس شوي دخيل الله ، الرواتب م تكفى تفاقم الاسعار اللي حاصل كيف ، و العالم ف تطور و نهضه اجتماعيه و اقتصاديه و محليه و اغلب الرواتب ٤٠٠٠ ريال م تعيش فرد ف ما بالك عوائل عايشه ع كذا ، الله المستعان ..", dest="en", src="ar")
print(f"{translation.origin} ({translation.src}) --> {translation.text} ({translation.dest})")

In [None]:
translation.text

In [None]:
predict(translation.text)

In [None]:
translation = translator.translate("العراق لو استغل موارده الطبيعية فقط دون إصلاحات اقتصادية مستدامه لكان قائدًا لاهم منطقة في العالم وهي الشرق الاوسط ، اجتمع غباء سياسييه مع غدر جيرانه ايران وتركيا .. حالهم مؤسف واقصى امانيهم كهرباء تعمل لنصف يوم فقط !", dest="en", src="ar")

In [None]:
translation.text

In [None]:
predict(translation.text)

In [None]:
SENTIMENT_THRESHOLDS = (0.37, 0.85)