In [1]:
import pandas as pd
import numpy as np
import re
import emoji2
from nltk.tokenize import TweetTokenizer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import sys
from unidecode import unidecode

In [2]:
tknzr = TweetTokenizer()
lemmatizer = WordNetLemmatizer()
notstopwords = set(('not', 'no'))
stopwords = set(stopwords.words('english')) - notstopwords

In [3]:
def standardization(tweet):
    tweet = re.sub(r"\\u2019", "'", tweet)
    tweet = re.sub(r"\\u002c", "'", tweet)
    tweet=' '.join(emoji2.str2emoji(unidecode(tweet).lower().split()))
    tweet = re.sub(r"(http|https)?:\/\/[a-zA-Z0-9\.-]+\.[a-zA-Z]{2,4}(/\S*)?", " ", tweet)
    tweet = re.sub(r"\'ve", " have", tweet)
    tweet = re.sub(r" can\'t", " cannot", tweet)
    tweet = re.sub(r"n\'t", " not", tweet)
    tweet = re.sub(r"\'re", " are", tweet)
    tweet = re.sub(r"\'d", " would", tweet)
    tweet = re.sub(r"\'ll", " will", tweet)
    tweet = re.sub(r"\'s", "", tweet)
    tweet = re.sub(r"\'n", "", tweet)
    tweet = re.sub(r"\'m", " am", tweet)
    tweet = re.sub(r"@\w+", r' ',tweet)
    tweet = re.sub(r"#\w+", r' ',tweet)
    tweet = re.sub(r" [0-9]+ "," ",tweet)
    tweet = [lemmatizer.lemmatize(i,j[0].lower()) if j[0].lower() in ['a','n','v']  else lemmatizer.lemmatize(i) for i,j in pos_tag(tknzr.tokenize(tweet))]
    tweet = [ i for i in tweet if (i not in stopwords) and (i not in punctuation ) ]
    tweet = ' '.join(tweet)
    return tweet

In [4]:
def data_preprocessing_test (path_tweets):
    data = pd.read_csv(path_tweets,encoding='utf-8',sep='\t',names=['id','class','tweets'])
    data['class'] = data['class'].apply(lambda x:0 if x=='negative' else (1 if x=='neutral' else 2))
    data['tweets'] = data['tweets'].apply(lambda x:standardization(x))
    return data['tweets'],data['class']

In [5]:
tweets_train_3, sentiments_train_3 = data_preprocessing_test('D:/EPITA/DSA_Classes/NLP/data_train_3.csv')

In [6]:
lemmatizer.lemmatize('Paris','v')

'Paris'

In [7]:
pos_tag(['Paris','is'])

[('Paris', 'NNP'), ('is', 'VBZ')]

In [8]:
from collections import Counter
#from preprocessing import standardization, data_preprocessing
import sys
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from gensim.models import KeyedVectors
from keras.layers.embeddings import Embedding
from keras.models import Sequential, Model
from keras.utils.np_utils import to_categorical
from sklearn.model_selection import train_test_split
from keras.layers import LSTM, Dropout, Dense, Bidirectional,  Flatten, Input, GRU
import matplotlib as mpl
from keras.optimizers import Adam
#mpl.use('TkAgg')  # or whatever other backend that you want
#import matplotlib.pyplot as plt
np.random.seed(1337)

Using TensorFlow backend.


In [9]:
tokenizer = Tokenizer(filters=' ')
tokenizer.fit_on_texts(tweets_train_3)
sequences = tokenizer.texts_to_sequences(tweets_train_3)
word_index = tokenizer.word_index

In [10]:
sequences_train_3=tokenizer.texts_to_sequences(tweets_train_3)

In [11]:
sequences_train_3

[[42, 227, 780, 2, 1051, 6, 3031, 6, 426, 12042, 426, 1],
 [1234, 24, 1064, 97, 8, 49, 12043, 75, 57],
 [23, 691, 1547, 797, 588, 1587, 96, 14, 619, 503, 191, 816, 107],
 [906, 21, 252, 2606, 200, 2607, 70, 536, 16818, 24, 1065, 1715, 1],
 [941, 143, 588, 1587, 830, 486, 798, 16819, 378, 97, 591, 12, 9],
 [1214, 3399, 3, 16820, 582, 2678, 2608, 52],
 [16821, 16822, 2, 1744, 64, 50, 30, 9, 4143, 37, 16823],
 [3032, 3256, 684, 4144, 606, 1417, 2107, 352, 162, 1243, 27],
 [1125, 24, 1548, 629, 1, 4881, 719, 1, 335, 30, 5, 39],
 [16824, 486, 2482, 1691, 83, 60, 12, 1, 1745, 922, 1, 28, 16825],
 [568, 672, 923, 34, 1673, 564, 8618, 4882, 3, 89, 7, 1472, 1281, 64, 50],
 [1434, 1513, 112, 316, 42, 88, 225, 451, 1066, 373, 233, 5697, 18],
 [5, 1103, 20, 3400, 76, 53, 36],
 [8619, 110, 2940, 72, 3989, 12, 2420, 395, 854],
 [715, 1764, 2483, 26, 279, 1764, 2483, 29, 40, 8620, 59, 428, 64, 50],
 [5, 620, 1, 410, 647, 20, 2],
 [206, 378, 171, 7, 285, 1201, 22, 41, 537, 102, 15, 23, 102, 1403],
 [3

In [12]:
MAX_SEQUENCE_LENGTH = 0
for elt in sequences_train_3:
    if len(elt) > MAX_SEQUENCE_LENGTH:
        MAX_SEQUENCE_LENGTH = len(elt)

In [13]:
data_train_3 = pad_sequences(sequences_train_3, maxlen=MAX_SEQUENCE_LENGTH)

In [14]:
import numpy as np
indices_train_3 = np.arange(data_train_3.shape[0])
data_train_3 = data_train_3[indices_train_3]

In [15]:
labels_train_3 = to_categorical(np.asarray(sentiments_train_3),3)
labels_train_3 = labels_train_3[indices_train_3]

In [16]:
split_idx = int(len(data_train_3)*0.85)
x_train_3, x_val_3 = data_train_3[:split_idx], data_train_3[split_idx:]
y_train_3, y_val_3 = labels_train_3[:split_idx], labels_train_3[split_idx:]
x_train_3.shape, y_train_3.shape

((42783, 32), (42783, 3))

In [17]:
nb_words=len(word_index)+1
oov=[]
EMBEDDING_DIM=300
oov.append((np.random.rand(EMBEDDING_DIM) * 2.0) - 1.0)
oov = oov / np.linalg.norm(oov)
EMBEDDING_FILE="D:/EPITA/DSA_Classes/NLP/GoogleNews-vectors-negative300.bin/GoogleNews-vectors-negative300.bin"
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [18]:
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
    else:
        embedding_matrix[i] = oov

embedding_layer = Embedding(len(word_index) + 1,EMBEDDING_DIM,weights=[embedding_matrix],input_length=MAX_SEQUENCE_LENGTH,trainable=False, name ='embedding_layer')
            

In [20]:
model1 = Sequential()
model1.add(embedding_layer)
model1.add(Bidirectional(LSTM(64, return_sequences=True)))
model1.add(Dropout(0.2))
model1.add(Bidirectional (LSTM(64, return_sequences=True)))
model1.add(Flatten())
model1.add(Dense(3, activation='softmax'))
model1.compile(loss='categorical_crossentropy', optimizer='Adam', metrics=['acc'])
model1.summary()
history=model1.fit(x_train_3,y_train_3,validation_data=(x_val_3,y_val_3),epochs=7,batch_size=50)
model1.save("./model1.h5")

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_layer (Embedding)  (None, 32, 300)           11090400  
_________________________________________________________________
bidirectional_3 (Bidirection (None, 32, 128)           186880    
_________________________________________________________________
dropout_2 (Dropout)          (None, 32, 128)           0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 32, 128)           98816     
_________________________________________________________________
flatten_1 (Flatten)          (None, 4096)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 3)                 12291     
Total params: 11,388,387
Trainable params: 297,987
Non-trainable params: 11,090,400
__________________________________________________________

In [34]:
ypred = model1.predict(x_val_3)
ypred

for i in range(0,7550):
    for j in (0,1,2):
        if ypred[i,j] >= 0.5:
            ypred[i,j] = 1
        else:
            ypred[i,j] = 0

array([[0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.]], dtype=float32)

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
asc = accuracy_score(y_val_3,ypred)
ps = precision_score(y_val_3,ypred, average='macro')
rs = recall_score(y_val_3,ypred, average='macro')
print("Accuracy Score:",asc,"Precision Score:",ps,"Recall:",rs,'\n')

Accuracy Score: 0.5667549668874172 Precision Score: 0.6206130140306425 Recall: 0.5483616079287944 

