In [None]:
# General-purpose library imports (data handling, randomness, regex, NLP)
import pandas as pd
import numpy as np
import random
import re
import nltk

pd.options.display.float_format = '{:20,.2f}'.format # show floats with thousands separators and two decimals instead of scientific notation

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')


In [None]:
# Load the train and test CSVs and preview the first training rows.
# NOTE(review): both paths are hard-coded under one user's home directory, so the
# notebook only runs on a machine with that exact folder layout.
train_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/train_featured.csv')
test_data = pd.read_csv('~/Documents/Datos/DataSets/TP2/test_featured.csv')
train_data.head()

In [None]:
import tensorflow as tf
# Display the name of the GPU device TensorFlow reports as the cell output.
tf.test.gpu_device_name()

In [None]:
import gensim, logging

# Log at INFO level with timestamp, level name and message
# (presumably so gensim's training progress is visible — confirm).
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import time

from gensim.models.word2vec import Word2Vec
from glob import glob

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Reshape, Flatten, Dropout, Concatenate
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix

In [None]:
# Labels and cleaned texts pulled out of the training frame as plain Python lists.
# NOTE(review): assumes `clean_text` holds strings with no missing values — TODO confirm.
target = train_data['target_label'].values.tolist()
data = train_data['clean_text'].values.tolist()

In [None]:
# Hold out 20% of the labelled rows for evaluation.
# The seed is fixed so that a fresh "Restart & Run All" reproduces the same
# partition, and therefore the same reported accuracy.
SPLIT_SEED = 42
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, random_state=SPLIT_SEED
)

In [None]:
# Build the word -> integer-id vocabulary from the texts.
# NOTE(review): the vocabulary is fitted on `data`, which still contains the rows
# that were split off as x_test.
tokenizer = Tokenizer()             # Tokenizer(num_words=5000) would keep only the 5000 most frequent words
tokenizer.fit_on_texts(data)
tokenizer

In [None]:
# Number of distinct words in the tokenizer's vocabulary.
print("len(tokenizer) :", len(tokenizer.word_index))

In [None]:
# Encode every training text as a list of integer word ids.
x_train_tokens = tokenizer.texts_to_sequences(x_train)


In [None]:
# Encode every held-out text with the same tokenizer.
x_test_tokens = tokenizer.texts_to_sequences(x_test)


In [None]:
# Length (in tokens) of every encoded text, train and held-out together, as one array.
num_tokens = np.array(list(map(len, x_train_tokens + x_test_tokens)))

In [None]:
# Sequence-length cut-off: mean length plus two standard deviations, truncated to an int.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

In [None]:
# Share of texts that fit into max_tokens without losing any token.
# `<=` rather than `<`: a sequence of exactly max_tokens tokens is kept whole
# when padding to maxlen=max_tokens, so it must count as covered.
np.mean(num_tokens <= max_tokens)


In [None]:
# Bring every training sequence to the common length max_tokens.
x_train_pad = pad_sequences(x_train_tokens, maxlen=max_tokens)


In [None]:
# Padding puts the zeros in front of the original token ids; compare the first
# training sample before and after padding.
for label, sample in (("x_train_tokens :", x_train_tokens[0]),
                      ("x_train_pad :", x_train_pad[0])):
    print(label, sample)

In [None]:
# Bring every held-out sequence to the same length as the training sequences.
x_test_pad = pad_sequences(x_test_tokens, maxlen=max_tokens)


In [None]:
# Shapes of the padded matrices. The second line reports the held-out matrix,
# so it is labelled x_test_pad (it was previously mislabelled as x_train_pad).
print("x_train_pad.shape :", x_train_pad.shape)
print("x_test_pad.shape :", x_test_pad.shape)

In [None]:
# Reverse lookup (integer id -> word) derived from the tokenizer's word -> id table.
idx = tokenizer.word_index
inverse_map = {token_id: word for word, token_id in idx.items()}

In [None]:
def tokens_to_string(tokens):
    """Rebuild a space-separated text from a sequence of token ids.

    Ids equal to 0 are skipped; every other id is looked up in the
    module-level ``inverse_map`` (a missing id raises ``KeyError``).
    """
    return ' '.join(inverse_map[token] for token in tokens if token != 0)

In [None]:
# Raw text of the training sample at index 800.
x_train[800]


In [None]:
# The same sample as integer token ids.
print(x_train_tokens[800])


In [None]:
# Round trip: decode the token ids of that sample back into words.
tokens_to_string(x_train_tokens[800])


In [None]:
# Prepare the corpus for Word2Vec, which expects every sentence as a list of
# word tokens. Iterating a string yields single characters, so each text is
# split on whitespace first; entries that are already token lists are copied as-is.
# NOTE(review): the Keras tokenizer may normalise text differently (case,
# punctuation); if clean_text is not already normalised the two vocabularies
# will not match exactly — confirm.
filtered_stopwords = []          # one whitespace-joined string per text
filtered_stopwords_list = []     # one list of word tokens per text

for text in data:
    tokens = text.split() if isinstance(text, str) else list(text)
    filtered_stopwords_list.append(tokens)
    filtered_stopwords.append(" ".join(tokens))

In [None]:
#Save word2vec format (not binary)
# Train Word2Vec on the prepared sentences and export the vectors to a text file.
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` — confirm against the installed version.
# NOTE(review): the literal 1000 has to stay equal to `embedding_size`, which is
# assigned separately in the following cell; `model` is later rebound to the Keras network.

model = Word2Vec(filtered_stopwords_list, size=1000)
model_save_location = "3000tweets_notbinary"
model.wv.save_word2vec_format(model_save_location)

In [None]:
# Width of each word vector; duplicates the literal passed as size= when Word2Vec was trained.
embedding_size = 1000


In [None]:

#Word2vec load(2.option) example
# Read the text-format vectors back into a plain {word: vector} dict.
# The export starts with a "<vocab_size> <vector_length>" header line; it is
# recognised and skipped so it is not stored as if it were a word, and rows
# whose vector length disagrees with the header are ignored.

word2vec = {}
with open('3000tweets_notbinary', encoding='UTF-8') as f:
    expected_length = None
    for line_number, line in enumerate(f):
        values = line.split()
        if line_number == 0 and len(values) == 2 and all(v.isdigit() for v in values):
            expected_length = int(values[1])
            continue
        if len(values) < 2:
            continue  # blank line or a token without any vector component
        if expected_length is not None and len(values) - 1 != expected_length:
            continue  # malformed row, e.g. a token that itself contained whitespace
        word = values[0]
        vec = np.asarray(values[1:], dtype='float32')
        word2vec[word] = vec

In [None]:

# Show one held-out text next to its padded id sequence. Both lines now refer
# to the same sample (index 0), matching their labels.
print("x_test[0] :", x_test[0])
print("x_test_pad[0] :", x_test_pad[0])

In [None]:
# Rows needed in the embedding matrix: one per vocabulary word plus one extra,
# so that id 0 (skipped as padding when decoding) also has a row.
num_words = len(tokenizer.word_index) + 1


In [None]:
# Every row starts as uniform noise in [-1, 1) and is overwritten with the
# trained Word2Vec vector whenever the word has one.
embedding_matrix = np.random.uniform(-1, 1, (num_words, embedding_size))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        continue
    embedding_vector = word2vec.get(word)
    # Accept only vectors of exactly the embedding width: a shorter array (such
    # as the 1-element one parsed from the vector file's header line) would
    # otherwise be silently broadcast across the whole row.
    if embedding_vector is not None and np.shape(embedding_vector) == (embedding_size,):
        embedding_matrix[i] = embedding_vector

In [None]:
# Sanity check: should be (num_words, embedding_size).
embedding_matrix.shape


In [None]:
# Inspect the vector stored for token id 3.
embedding_matrix[3]


In [None]:
# Aliases and hyperparameters for the network.
# NOTE(review): `num_filters` is reassigned to 16 in the CNN cell below, and
# `sequence_length`, `vocabulary_size`, `embedding_dim`, `filter_sizes` and `drop`
# are not referenced by the model code visible in this notebook — confirm they are still needed.
sequence_length = max_tokens
vocabulary_size = num_words
embedding_dim = embedding_size
filter_sizes = [3,4,5]
num_filters = 512
drop = 0.5

In [None]:
# NOTE(review): the CNN cell below reassigns `batch_size` to 64 and trains with
# `num_epochs`, so neither value set here reaches model.fit.
epochs = 5
batch_size = 30

In [None]:
# Labels as 1-D float64 arrays, built in a single conversion. float64 is the
# dtype the previous element-by-element np.append loop produced for numeric
# labels; that loop re-allocated the whole array on every iteration.
y_train2 = np.asarray(y_train, dtype=np.float64)
y_test2 = np.asarray(y_test, dtype=np.float64)

In [None]:
#CNN architecture
# Builds, compiles and trains a 1-D convolutional text classifier on the padded
# training sequences, with the embedding layer initialised from embedding_matrix.

num_classes = 2

#Training params
batch_size = 64 
num_epochs = 25

#Model parameters
num_filters = 16  # number of filters in each Conv1D layer
embed_dim = embedding_size 
weight_decay = 1e-4

print("training CNN ...")
model = Sequential()

#Model add word2vec embedding

model.add(Embedding(input_dim=num_words,
                    output_dim=embedding_size,
                    weights= [embedding_matrix],
                    input_length=max_tokens,        
                    trainable=True,              # the initial vectors are fine-tuned during training
                    name='embedding_layer'))
model.add(Conv1D(num_filters, 7, activation='tanh', padding='same'))
model.add(MaxPooling1D(2))
model.add(Conv1D(num_filters, 7, activation='tanh', padding='same'))
model.add(GlobalMaxPooling1D())
# NOTE(review): 0.9 looks unusually high for a dropout rate — confirm it is intended.
model.add(Dropout(0.9))
model.add(Dense(32, activation='relu', kernel_regularizer=regularizers.l2(weight_decay)))
model.add(Dense(num_classes, activation='softmax'))  # softmax over num_classes; trained with the sparse (integer-label) cross-entropy set below

# NOTE(review): `lr` and `decay` are legacy argument names; newer Keras releases
# use `learning_rate` — confirm against the installed version.
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()



#define callbacks
# Stop when val_loss has not improved by at least 0.001 for 8 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.001, patience=8, verbose=1)
callbacks_list = [early_stopping]


# 10% of the training matrix is set aside as validation data for the callback above.
hist = model.fit(x_train_pad, y_train2, batch_size=batch_size, epochs=num_epochs, callbacks=callbacks_list, validation_split=0.1, shuffle=True, verbose=2)

In [None]:
# Pick the most probable class for every held-out sample, then display the
# fraction of samples where it equals the true label (accuracy).
predicted = model.predict(x_test_pad).argmax(axis=1)
(predicted == y_test2).mean()