In [None]:
# Show the NVIDIA GPUs (if any) visible to this runtime.
!nvidia-smi

In [None]:
!pip install textblob 'keras-nlp' 'keras-preprocessing' 'tensorflow-text==2.15.0'' np_utils swifter

In [None]:
import multiprocessing
import tensorflow as tf
import sys
import keras.backend as K
from keras.models import Sequential
from keras.layers import Dense, Embedding, Lambda, ELU, Conv1D, MaxPooling1D, Dropout
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from textblob import TextBlob, Word
from keras_preprocessing.sequence import pad_sequences
from keras.initializers import Constant
import numpy as np
import random
import os
import pandas as pd
import gensim
import warnings
import nltk
import pickle
from tensorflow.nn import leaky_relu

import re
import warnings
from sklearn.model_selection import train_test_split
from textblob import TextBlob

# --- Notebook configuration ---
TRACE = False          # when True, set_seeds_and_trace() turns on TF device-placement logging
embedding_dim = 300    # word2vec vector size
epochs = 100           # number of epochs passed to model.fit
batch_size = 250       # mini-batch size passed to model.fit
corpus_size = 25000    # number of rows sampled from news.csv
BATCH = True           # NOTE(review): not referenced in the visible cells

def set_seeds_and_trace():
  """Seed NumPy, TensorFlow and `random` with 42; optionally enable device tracing.

  Also writes PYTHONHASHSEED=0 into the process environment.
  NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
  assignment presumably only influences child processes - confirm.
  When the module-level TRACE flag is truthy, TensorFlow device-placement
  logging is switched on.
  """
  os.environ['PYTHONHASHSEED'] = '0'
  seed = 42
  for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
    seed_fn(seed)
  if TRACE:
    tf.debugging.set_log_device_placement(True)

def set_session_with_gpus_and_cores():
  """Register a TF1-compat session, sized to the detected devices, with Keras.

  The session config advertises every physical GPU TensorFlow lists and every
  CPU core reported by `multiprocessing`, and pins both intra-op and inter-op
  thread pools to a single thread (presumably for run-to-run reproducibility).
  """
  gpu_count = len(tf.config.list_physical_devices('GPU'))
  cpu_count = multiprocessing.cpu_count()
  session_config = tf.compat.v1.ConfigProto(
      device_count={'GPU': gpu_count, 'CPU': cpu_count},
      intra_op_parallelism_threads=1,
      inter_op_parallelism_threads=1,
  )
  tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=session_config))

# One-time runtime setup: seeds, session, silence warnings, fetch the NLTK 'punkt' resource.
set_seeds_and_trace()
set_session_with_gpus_and_cores()
warnings.filterwarnings('ignore')
nltk.download('punkt')


def textblob_tokenizer(x):
  """Return `TextBlob(x).words`, the word list TextBlob extracts from `x`."""
  return TextBlob(x).words

In [None]:
%%writefile get_data.sh
# Fetch the dataset once; later runs reuse the local copy.
if [ ! -f news.csv ]; then
  # Download under a temporary name and rename only on success, so a failed or
  # interrupted transfer never leaves a stub news.csv that blocks the next attempt.
  # The URL is quoted because it contains '?', which the shell treats as a glob character.
  wget -O news.csv.part 'https://www.dropbox.com/s/352x7xzivf60zgc/news.csv?dl=0' \
    && mv news.csv.part news.csv \
    || { rm -f news.csv.part; exit 1; }
fi

In [None]:
# Run the download script written by the %%writefile cell.
!bash get_data.sh

In [None]:
# Read the CSV (first row is the header), draw corpus_size random rows,
# and renumber them 0..n-1 so positions and index labels coincide.
path = './news.csv'
news_pre = (
    pd.read_csv(path, header=0)
    .sample(n=corpus_size)
    .reset_index(drop=True)
)

In [None]:
# Display the sampled frame.
news_pre

In [None]:
import re
def preprocess_text(text, should_join=True):
    """Normalise a piece of text into cleaned tokens.

    Steps: tokenise and lowercase with `gensim.utils.tokenize`, replace any
    of `. , ! ?` with a space, then run `gensim.utils.simple_preprocess`.

    Args:
        text: the raw string to clean.
        should_join: when True (default) return the tokens joined by single
            spaces; when False return the token list itself.
    """
    lowered = ' '.join(gensim.utils.tokenize(text, lowercase=True))
    without_punct = re.sub(r"[.,!?]", r" ", lowered)
    tokens = gensim.utils.simple_preprocess(without_punct)
    return ' '.join(tokens) if should_join else tokens

In [None]:
# Sanity check on a sample sentence; with the default should_join=True the result is a single string.
preprocess_text('This is the best night of my life! Is it? Well, maybe')

In [None]:
import swifter

# Clean every title with preprocess_text through the swifter accessor.
news = news_pre['title'].swifter.apply(preprocess_text)

In [None]:
# Persist the cleaned titles without header or index column; MyCorpus streams this file back for word2vec.
news.to_csv('news_pre.csv', header=False, index=False)

In [None]:
# Peek at the first five lines of the saved file.
!head -n 5 news_pre.csv

In [None]:

class MyCorpus:
    """A re-iterable corpus that yields one tokenised document (list of str) per file line.

    Args:
        corpus_path: text file with one document per line; defaults to
            'news_pre.csv', the file written by the preprocessing step.
    """

    def __init__(self, corpus_path='news_pre.csv'):
        self.corpus_path = corpus_path

    def __iter__(self):
        # Open the file anew on every pass so the corpus can be iterated several
        # times, and close it deterministically instead of leaking the handle.
        # UTF-8 is pandas' default `to_csv` encoding, so read it back explicitly
        # rather than relying on the platform locale.
        with open(self.corpus_path, encoding='utf-8') as corpus_file:
            for line in corpus_file:
                # assume there's one document per line, tokens separated by whitespace
                yield preprocess_text(line, should_join=False)

from gensim.models import Word2Vec

# Train word2vec on the sentences streamed by MyCorpus(); each word vector has embedding_dim components.
word2vec_model = Word2Vec(
    sentences=MyCorpus(),
    vector_size=embedding_dim,
)

In [None]:
# Keep a handle on the trained model's `wv` attribute; later cells read its vectors and index maps.
wv_model = word2vec_model.wv

In [None]:
# Embedding matrix = trained word vectors followed by two all-zero rows.
# The encoding cells tag out-of-vocabulary words with index `vocab_size + 1`. A table
# holding only the `vocab_size` trained rows makes that index an out-of-range look-up
# (TensorFlow raises on CPU and silently returns zeros on GPU). The two extra rows make
# indices `vocab_size` and `vocab_size + 1` valid and map them to a zero vector.
oov_rows = np.zeros((2, wv_model.vector_size), dtype=wv_model.vectors.dtype)
weights = tf.constant(np.vstack([wv_model.vectors, oov_rows]))
vocab_size = len(wv_model.index_to_key)  # number of trained words; their indices are 0..vocab_size-1

In [None]:
# Inspect the shape of the embedding matrix.
weights.shape

In [None]:
# Pair each cleaned title with its integer class id (category name -> 0..3).
category_to_label = {'Business': 0, 'Sports': 1, 'Sci/Tech': 2, 'World': 3}
news_preprocessed = pd.DataFrame({
    'label': news_pre.category.map(category_to_label),
    'title': news,
})
news_preprocessed

In [None]:
def get_maximum_review_length(df):
    """Return the largest word count among the `title` values of `df`.

    Each title is split with `textblob_tokenizer`; a frame with no rows
    gives 0.
    """
    word_counts = (len(textblob_tokenizer(row.title)) for _, row in df.iterrows())
    return max(word_counts, default=0)


# Titles differ in word count, so every title is later laid out in a row of this
# (maximum) width, with shorter titles leaving their trailing positions at 0.
maximum = get_maximum_review_length(news_preprocessed)

In [None]:
# Display the longest title length, in words.
maximum

In [None]:
# One row per title, one column per word position; zero-initialised so positions
# past the end of a shorter title stay 0.
X = np.zeros((len(news_preprocessed), maximum))

for index, row in news_preprocessed.iterrows():
  for word_ix, word in enumerate(textblob_tokenizer(row.title)):
    # Word2vec index of the word, or the sentinel vocab_size + 1 when the word is not in the vocabulary.
    token = wv_model.key_to_index.get(word, vocab_size + 1)
    X[index, word_ix] = token

# Targets: the integer class id of every title.
y = news_preprocessed.label

In [None]:
# Inspect the first two encoded titles.
X[:2]

In [None]:
# Hold out 20% of the samples (fixed random_state for a repeatable split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Inputs become tensors; class ids 0-3 become one-hot rows of width 4.
X_train, X_test = tf.constant(X_train), tf.constant(X_test)
y_train, y_test = tf.one_hot(y_train, 4), tf.one_hot(y_test, 4)



In [None]:
# Classifier: frozen word2vec look-up -> per-token dense layers -> mean over tokens -> class probabilities.
model = Sequential()
# Embedding initialised from `weights` and kept frozen (trainable=False).
model.add(Embedding(input_dim=weights.shape[0], output_dim=weights.shape[1], input_length=maximum, trainable=False, embeddings_initializer=Constant(weights)))
# Dense layers act on the last axis, i.e. on every token vector independently.
model.add(Dense(100, activation='relu'))
model.add(Dense(50, activation='tanh'))
# Average over the token axis (axis=1), leaving one vector per sample.
# No `output_shape` is passed: the earlier value `(None, embedding_dim)` included a batch
# axis and declared the embedding width, while the tensor averaged here comes from
# Dense(50). Letting Keras infer the shape avoids carrying a wrong declaration.
model.add(Lambda(lambda x: K.mean(x, axis=1)))
model.add(Dense(25, activation=leaky_relu))
# One softmax unit per class (4 categories).
model.add(Dense(4, activation='softmax'))

In [None]:
# Targets are one-hot rows over the 4 classes, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
from keras.callbacks import EarlyStopping

# The fit was documented as using early stopping but no callback was ever passed, so all
# `epochs` always ran. Stop once validation loss has not improved for 5 consecutive epochs
# and roll back to the best weights seen; `epochs` is now an upper bound.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(x=X_train, y=y_train, epochs=epochs, batch_size=batch_size, workers=5, validation_data=(X_test, y_test), callbacks=[early_stopping])

In [None]:
import matplotlib.pyplot as plt

# function for plotting loss
def plot_metrics(train_metric, val_metric=None, metric_name=None, title=None, ylim=5):
    """Plot a training curve and, optionally, its validation counterpart.

    Args:
        train_metric: sequence of per-epoch training values (drawn in blue).
        val_metric: optional sequence of per-epoch validation values (drawn in green).
        metric_name: legend label for the training curve; the validation curve is
            labelled 'val_' + metric_name, or plain 'val' when no name is given.
        title: figure title.
        ylim: upper bound of the y-axis (the lower bound is 0).
    """
    plt.title(title)
    plt.ylim(0, ylim)
    plt.plot(train_metric, color='blue', label=metric_name)
    if val_metric is not None:
        # Concatenating 'val_' with the default metric_name=None raised TypeError.
        val_label = 'val' if metric_name is None else 'val_' + metric_name
        plt.plot(val_metric, color='green', label=val_label)
    plt.legend(loc="upper right")

# Plot training vs. validation loss per epoch, with the y-axis capped at 2.0.
plot_metrics(history.history['loss'], history.history['val_loss'], "Loss", "Loss", ylim=2.0)


In [None]:
# Plot training vs. validation accuracy per epoch, with the y-axis capped at 1.0.
plot_metrics(history.history['accuracy'], history.history['val_accuracy'], "Accuracy", "Accuracy", ylim=1.0)

In [None]:
# Test with the following two sentences:
# - 'supercomputer will put workers jobless soon'
# - 'patriots goes winning super bowl'

# Encode the two hand-written sentences the same way as the training titles:
# one row per sentence, one column per word position, unused positions left at 0.
x_val = np.zeros((2, maximum))
for index, row in enumerate(['supercomputer will put workers jobless soon', 'patriots goes winning super bowl']):
    for word_ix, word in enumerate(textblob_tokenizer(row)):
      # Word2vec index of the word, or the sentinel vocab_size + 1 when it is not in the vocabulary.
      token = wv_model.key_to_index.get(word, vocab_size + 1)
      x_val[index, word_ix] = token
# Expected classes 0 and 1 for the two sentences, one-hot encoded over 4 classes.
y_val = tf.one_hot([0,1], depth=4)

In [None]:
# Convert the encoded sentences to a tensor (rebinds the same name).
x_val = tf.constant(x_val)

In [None]:
# Inspect the encoded sentences.
x_val

In [None]:
# Inspect the one-hot expected labels.
y_val

In [None]:
# Model outputs (softmax over the 4 classes) for the two hand-written sentences.
model.predict(x_val)

In [None]:
# Loss and accuracy on the held-out split (the same split was passed as validation_data to fit).
model.evaluate(X_test, y_test)