# Import Libraries

In [1]:
from unicodedata import normalize
import pandas as pd
import numpy as np
import string, os, re
import psutil
import pickle
from sklearn.model_selection import train_test_split

# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint
import keras.utils as ku 

# for pre-trained embeddings
import gensim
from gensim.models.word2vec import Word2Vec
from gensim.models.phrases import Phraser, Phrases
from gensim.models import KeyedVectors

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


## Load Datasets

In [None]:
# Mount Google Drive at /content/drive; the data files read below live under
# the relative path "drive/My Drive" (presumably resolved from /content, the
# usual Colab working directory -- confirm if run elsewhere).
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# The train and test frames were pickled back-to-back into a single file, so
# they are read with two consecutive pickle.load calls (train first, then test).
# NOTE(security): pickle.load can execute arbitrary code -- only open trusted files.
# The `with` block guarantees the handle is closed even if unpickling fails.
with open(os.path.join("drive/My Drive", "data/reddit_train_test_capped.pkl"), 'rb') as final_file:
  train_df = pickle.load(final_file)
  test_df = pickle.load(final_file)

train_df.shape

In [None]:
# keep only the columns that the rest of the notebook reads
needed_columns = ['score', 'body', 'is_popular']
train_df, test_df = train_df[needed_columns], test_df[needed_columns]

In [None]:
# Remove rows whose 'body' (the model input) is NaN.
# The result is rebound instead of using inplace=True: the frames come from a
# column selection, where in-place mutation triggers pandas' chained-assignment
# warning, and rebinding keeps the cell safe to re-run.
train_df = train_df.dropna(subset=['body'])
test_df = test_df.dropna(subset=['body'])

In [None]:
# Google News embeddings based on 3M words in 300 dimensions
filename = os.path.join("drive/My Drive", "data/GoogleNews-vectors-negative300.bin")
gensim_embeddings = KeyedVectors.load_word2vec_format(filename, binary=True)

# `.vectors` is the supported accessor for the raw embedding matrix; the old
# `.wv.syn0` spelling was a deprecated alias and no longer exists in gensim 4.
pretrained_weights = gensim_embeddings.vectors
# Rows are vocabulary entries (in gensim's own order), columns are dimensions.
vocab_size, embedding_size = pretrained_weights.shape

In [None]:
# Load the tweets CSV and hold out 30% of its rows as an extra test set.
df_data = pd.read_csv(os.path.join("drive/My Drive", "data/all_tweets_clean_v2.csv"))

# A fixed random_state makes the held-out rows identical on every run, so the
# metrics and predictions derived from t_test_df are reproducible.
_, t_test_df = train_test_split(df_data, test_size=0.3, random_state=42)

# Dataset Prep

### Dataset Cleaning

In [None]:
def clean_text(txt):
  """Normalise one text string before tokenisation.

  Steps, in order: remove a t.co short link sitting at the very end of the
  string, turn newlines into spaces, delete ASCII punctuation, lowercase,
  discard every non-ASCII character, and collapse runs of spaces into one.
  Leading/trailing spaces are not stripped.

  Args:
    txt: the raw text (a str).

  Returns:
    The cleaned str.
  """
  # Only a link anchored at the end of the text is removed by this pattern.
  without_link = re.sub(r'https:\/\/t[.]co\/[A-Za-z0-9]*$', '', txt)
  single_line = re.sub(r'\n', ' ', without_link)

  # Delete every ASCII punctuation character in one pass, then lowercase.
  drop_punctuation = str.maketrans('', '', string.punctuation)
  lowered = single_line.translate(drop_punctuation).lower()

  # Round-trip through bytes so that anything outside ASCII is silently dropped.
  ascii_only = lowered.encode("utf8").decode("ascii", 'ignore')

  return re.sub(' +', ' ', ascii_only)

# Run clean_text over the 'body' column of each of the three frames.
train_corpus, test_corpus, t_test_corpus = (
    frame['body'].apply(clean_text)
    for frame in (train_df, test_df, t_test_df)
)

### Tokenize

In [None]:
# The vocabulary is learned from the training corpus only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_corpus)

# Encode every corpus with that single fitted vocabulary.
train_sequences, test_sequences, t_test_sequences = (
    tokenizer.texts_to_sequences(corpus)
    for corpus in (train_corpus, test_corpus, t_test_corpus)
)

### Padding Sequences and Obtaining Variables: Predictors and Targets

In [None]:
# Bring every token sequence to one common length so they stack into a matrix.
max_len = 100
x_train, x_test, t_x_test = (
    pad_sequences(sequences, maxlen = max_len)
    for sequences in (train_sequences, test_sequences, t_test_sequences)
)
print(t_x_test.shape)

# Targets: the 'is_popular' column of each frame, as plain Python lists.
y_train, y_test, t_y_test = (
    frame['is_popular'].tolist()
    for frame in (train_df, test_df, t_test_df)
)

## Model

### Architecture

In [None]:
def build_embedding_matrix(word_index, keyed_vectors):
  """Build an embedding matrix whose row numbers are the tokenizer's word indices.

  Args:
    word_index: mapping of word -> non-negative integer index (for example
      the `word_index` dict of a fitted Keras Tokenizer).
    keyed_vectors: object supporting `word in kv`, `kv[word]` and
      `kv.vector_size` (for example gensim KeyedVectors).

  Returns:
    A float32 array of shape (largest index + 1, vector_size). Row i holds the
    vector of the word whose index is i; rows for indices with no word (such
    as 0, the padding value) and for words absent from `keyed_vectors` are
    all zeros.
  """
  n_rows = max(word_index.values(), default=0) + 1
  matrix = np.zeros((n_rows, keyed_vectors.vector_size), dtype='float32')
  for word, idx in word_index.items():
    if word in keyed_vectors:
      matrix[idx] = keyed_vectors[word]
  return matrix


def create_model2(embedding_vectors):
  """Build and compile the LSTM binary classifier on frozen pre-trained embeddings.

  The integer inputs fed to the model are Keras Tokenizer indices, which are
  ranked by frequency in the training corpus. The raw pre-trained matrix is
  ordered by gensim's own vocabulary, so using it directly would make each
  token look up an unrelated word's vector. The embedding weights are
  therefore rebuilt so that row i is the vector of the tokenizer's word i.

  Reads the notebook globals `tokenizer` (already fitted) and
  `gensim_embeddings` (the loaded pre-trained vectors).

  Args:
    embedding_vectors: not used; kept so existing calls keep working.

  Returns:
    A compiled keras Sequential model: Embedding -> LSTM(100) -> Dense(1, sigmoid).
  """
  embedding_matrix = build_embedding_matrix(tokenizer.word_index, gensim_embeddings)

  model = Sequential()

  # Frozen lookup table, sized to the tokenizer vocabulary (plus padding row 0).
  model.add(Embedding(input_dim=embedding_matrix.shape[0],
                      output_dim=embedding_matrix.shape[1],
                      weights=[embedding_matrix],
                      trainable=False,
                      name='embedding_layer'))

  model.add(LSTM(100))

  # Single sigmoid unit: output is the predicted probability of the positive class.
  model.add(Dense(1, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

  return model

# Build the compiled classifier. Note: the body of create_model2 never reads
# its `embedding_vectors` parameter, so the value 100 passed here has no
# effect on the resulting architecture.
model = create_model2(embedding_vectors=100)

# Display the layer-by-layer summary of the model.
model.summary()

### Train

In [None]:
# Train for 3 epochs with batches of 64. The test split is passed as
# validation_data, so the per-epoch validation metrics are computed on the
# same data that the evaluation cell below scores.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=3, batch_size=64)

In [None]:
# Score the trained model on the test split. With the single 'accuracy' metric
# given at compile time, index 1 of the result is the accuracy (index 0 is the loss).
scores = model.evaluate(x_test, y_test, verbose=0)
accuracy_pct = scores[1]*100
print("Accuracy:{}".format(accuracy_pct))

In [None]:
# Each pickle holds two frames written back-to-back: train first, then test.
reddit_pickle_path = os.path.join("drive/My Drive", "data/reddit_train_test_capped.pkl")
twitter_pickle_path = os.path.join("drive/My Drive", "data/twitter_train_test_smaller_v2.pkl")


def load_test_frame(pickle_path):
  """Return the second object stored in `pickle_path` (the test frame).

  NOTE(security): pickle.load can execute arbitrary code -- only open trusted files.
  """
  # `with` closes the handle even if unpickling raises.
  with open(pickle_path, 'rb') as handle:
    pickle.load(handle)  # the train frame is stored first; read past it
    return pickle.load(handle)


def predict_popularity(frame):
  """Return one model prediction per row of `frame`, computed from frame['body'].

  Applies the same preparation as the training data: clean_text, the fitted
  tokenizer, and padding to length 100.
  """
  corpus = frame['body'].apply(clean_text)
  padded = pad_sequences(tokenizer.texts_to_sequences(corpus), maxlen=100)
  # model.predict returns an (n, 1) array; flatten it so it fits one column.
  return model.predict(padded).ravel()


mod = "lstm_sentence_classification"

# Rows without text cannot be cleaned or scored, so drop them first.
# NOTE(review): assumes the twitter test frame also stores its text in a
# 'body' column, like the reddit frame and the tweets CSV -- confirm.
reddit_test_df = load_test_frame(reddit_pickle_path).dropna(subset=['body'])
twitter_test_df = load_test_frame(twitter_pickle_path).dropna(subset=['body'])

# Predictions are computed from the very frame they are written to, so every
# score is guaranteed to belong to its row. Previously the twitter scores came
# from t_x_test, which is built from a random sample of a different file and
# is not row-aligned with twitter_test_df.
reddit_test_df[mod] = predict_popularity(reddit_test_df)
print("twitter_test_df", twitter_test_df.shape)
twitter_test_df[mod] = predict_popularity(twitter_test_df)

reddit_test_df.to_csv('drive/My Drive/models/reddit_test_predictions_mj.csv',index=False)
twitter_test_df.to_csv('drive/My Drive/models/twitter_test_predictions_mj.csv',index=False)