In [1]:
""" Prepare Notebook for Google Colab """
# Mount Google Drive
# The mount point '/content/drive' is the prefix used by the Drive paths in
# the cells below (module_dir and the saved model path).
# NOTE(review): force_remount=True presumably re-mounts even when the drive is
# already mounted - confirm against the google.colab documentation.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [11]:
##### Google Colab #####
# Specify directory of course materials in Google Drive
module_dir = '/content/drive/My Drive/Colab Notebooks/COMP3359/Project/'

# Install necessary packages
!pip install pyspellchecker 

import sys
import pandas as pd
import string
import os
import numpy as np
import nltk
import math
import tqdm
import tensorflow as tf
import re as regex

sys.path.append(module_dir)
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import casual_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
porter = PorterStemmer()
from spellchecker import SpellChecker
# Module-level spell checker used by TwitterCleanuper.correct_spelling.
# NOTE(review): language=None presumably means no bundled dictionary is loaded,
# in which case correction() has no vocabulary to correct against - confirm
# against the pyspellchecker docs, or load a word-frequency list explicitly.
spell = SpellChecker(language=None, case_sensitive=True)
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, GlobalAveragePooling1D, Dropout
from tensorflow.keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow import keras

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
##### Initialize Classes #####
class TwitterData_Initialize:
    """First pipeline stage: keeps the raw tweets and the working data.

    Later stage classes are built from an earlier stage object and read its
    ``processed_data`` attribute.
    """

    # Class-level fallbacks; initialize() replaces both on the instance.
    data = []
    processed_data = []

    def initialize(self, twitter_data):
        """Bind ``twitter_data`` as both the raw and the working data.

        No copy is made: ``data`` and ``processed_data`` refer to the very
        same object that was passed in.
        """
        self.data = self.processed_data = twitter_data

class TwitterCleanuper:
    """Clean-up steps for the ``"tweet"`` column of a tweets DataFrame.

    Every step takes the DataFrame, rewrites its ``"tweet"`` column on that
    same object and returns it, so the steps can be chained. ``iterate``
    yields the steps in pipeline order.
    """

    # Characters deleted by remove_special_chars(). Dropping "#" turns a
    # hashtag into a plain word.
    _SPECIAL_CHARS = (",", ":", "\"", "=", "&", ";", "%", "$",
                      "@", "^", "*", "(", ")", "{", "}",
                      "[", "]", "|", "/", "\\", ">", "<", "-",
                      "!", "?", ".", "'", "#")

    def iterate(self):
        """Yield the bound clean-up methods in the order they must run.

        URLs and usernames are removed before the special characters because
        their patterns rely on ":", "/" and "@" still being present.
        """
        yield from (self.remove_urls,
                    self.remove_usernames,
                    self.remove_numbers,
                    self.remove_special_chars,
                    self.remove_duplicate_characters,
                    self.correct_spelling,
                    self.remove_stopwords)

    @staticmethod
    def remove_by_regex(tweets, regexp):
        """Delete every match of ``regexp`` from the ``"tweet"`` column.

        Args:
            tweets: DataFrame with a string ``"tweet"`` column.
            regexp: compiled regular expression (a pattern string also works).

        Returns:
            The same DataFrame, with the column reassigned.
        """
        # Assign the result back explicitly: an in-place replace on the
        # intermediate ``.loc[:, "tweet"]`` object is not guaranteed to write
        # through to the DataFrame.
        tweets["tweet"] = tweets["tweet"].str.replace(regexp, "", regex=True)
        return tweets

    def remove_urls(self, tweets):
        """Strip ``http://`` / ``https://`` style URLs plus one trailing whitespace."""
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"http.?://[^\s]+[\s]?"))

    def remove_special_chars(self, tweets):  # it unrolls the hashtags to normal words
        """Delete all punctuation listed in ``_SPECIAL_CHARS`` in a single pass."""
        char_class = "[" + "".join(map(regex.escape, TwitterCleanuper._SPECIAL_CHARS)) + "]"
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(char_class))

    def remove_usernames(self, tweets):
        """Strip ``@mention`` tokens plus one trailing whitespace."""
        return TwitterCleanuper.remove_by_regex(tweets, regex.compile(r"@[^\s]+[\s]?"))

    def remove_numbers(self, tweets):
        """Remove every whitespace-delimited token containing a digit, then strip the ends."""
        digit_token = regex.compile(r"\S*\d\S*")  # the whole word goes, not just the digits
        tweets["tweet"] = tweets["tweet"].apply(lambda x: digit_token.sub("", x).strip())
        return tweets

    def remove_duplicate_characters(self, tweets):
        """Collapse runs of three or more identical letters down to two."""
        tweets["tweet"] = tweets["tweet"].apply(lambda x: regex.sub(r'([a-zA-Z])\1\1+', r'\1\1', x))
        return tweets

    def correct_spelling(self, tweets):
        """Pass each tweet through the module-level ``spell`` checker and lowercase it.

        NOTE(review): the whole tweet string is handed to ``spell.correction``,
        which presumably expects a single word - confirm whether per-word
        correction was intended.
        """
        def _correct(text):
            corrected = spell.correction(text)
            # Some pyspellchecker releases return None when no candidate is
            # found; keep the original text instead of failing on None.lower().
            return (text if corrected is None else corrected).lower()

        tweets["tweet"] = tweets["tweet"].apply(_correct)
        return tweets

    def remove_stopwords(self, tweets):
        """Delete NLTK English stopwords (whole words) and the whitespace after them."""
        pattern = regex.compile(r'\b(' + r'|'.join(stopwords.words('english')) + r')\b\s*')
        tweets["tweet"] = tweets["tweet"].apply(lambda x: pattern.sub('', x))
        return tweets


class TwitterData_Cleansing(TwitterData_Initialize):
    """Pipeline stage that runs a cleanuper's steps over the working data."""

    def __init__(self, previous):
        """Take over ``processed_data`` from the ``previous`` stage object."""
        self.processed_data = previous.processed_data

    def cleanup(self, cleanuper):
        """Feed the working data through each step from ``cleanuper.iterate()``.

        Every step receives the value returned by the step before it; the
        final value becomes the new ``processed_data``.
        """
        current = self.processed_data
        for step in cleanuper.iterate():
            current = step(current)
        self.processed_data = current

class TwitterData_TokenStem(TwitterData_Cleansing):
    """Pipeline stage that tokenizes each tweet and stems the tokens."""

    def __init__(self, previous):
        """Take over ``processed_data`` from the ``previous`` stage object."""
        self.processed_data = previous.processed_data

    def stem(self, stemmer = porter ):
        """Lowercase and stem every token of every tweet.

        Expects ``tokenize`` to have run first, so each ``"tweet"`` cell is an
        iterable of token strings. ``processed_data`` is replaced by a new
        DataFrame; the previous frame object is left as it was.

        Args:
            stemmer: object exposing ``stem(word)``; defaults to the
                module-level Porter stemmer.
        """
        frame = self.processed_data
        # Only the "tweet" column changes, so transform that column directly
        # instead of rebuilding every row with DataFrame.apply(axis=1).
        stemmed = frame["tweet"].apply(
            lambda tokens: [stemmer.stem(token.lower()) for token in tokens])
        self.processed_data = frame.assign(tweet=stemmed)

    def tokenize(self, tokenizer = casual_tokenize):
        """Replace each tweet string with the list returned by ``tokenizer``.

        The ``"tweet"`` column of the current frame is overwritten in place.

        Args:
            tokenizer: callable mapping a tweet string to its tokens; defaults
                to ``nltk.tokenize.casual_tokenize``.
        """
        frame = self.processed_data
        frame["tweet"] = frame["tweet"].apply(tokenizer)
        self.processed_data = frame

class TwitterData_Padding(TwitterData_TokenStem):
    """Pipeline stage that converts token lists into integer sequences.

    Despite the name, no padding happens here; the sequences are padded later
    by ``truncating``.
    """

    def __init__(self, previous):
        """Take over ``processed_data`` from the ``previous`` stage object."""
        self.processed_data = previous.processed_data

    def padding(self, tokenizer=None):
        """Fit ``tokenizer`` on the tweets and store their index sequences.

        Adds a ``"token"`` column holding the integer sequence of each tweet
        and keeps the fitted tokenizer on ``self.tokenizer``.

        Args:
            tokenizer: Keras ``Tokenizer`` to fit and use. When omitted a new
                one is created for this call, so vocabulary from earlier
                calls no longer leaks in through a shared default instance.

        NOTE(review): a tokenizer fitted here on the incoming tweets will
        presumably assign different indices than the one used when the model
        was trained - confirm whether the training tokenizer should be passed
        in instead.
        """
        if tokenizer is None:
            tokenizer = Tokenizer()
        frame = self.processed_data
        self.tokenizer = tokenizer
        tokenizer.fit_on_texts(frame['tweet'])
        frame['token'] = tokenizer.texts_to_sequences(frame['tweet'])
        self.processed_data = frame

In [4]:
# Replace the emoticons
import re as regex

# Emoticons treated as positive sentiment markers.
# (";D" and ";-D" are listed twice; the entries are kept exactly as given.)
positive_emoticons = [
    ":)", ":-)", ":p", ":-p", ":P", ":-P", ";D", ";-D", ":D", ":-D",
    ":]", ":-]", ";)", ";-)", ";p", ";-p", ";P", ";-P", ";D", ";-D",
    ";]", ";-]", "=)", "=-)", "<3",
]
# Emoticons treated as negative sentiment markers.
negative_emoticons = [
    ":o", "D;", "D:", "D-:", ":-o", ":O", ":-O", ":(", ":-(",
    ":c", ":-c", ":C", ":-C", ":[", ":-[", ":/", ":-/", ":\\", ":-\\",
    ":n", ":-n", ":u", ":-u", "=(", "=-(", ":$", ":-$",
]

def make_emoticon_pattern(emoticons):
    """Build a regex source string matching any of ``emoticons`` between whitespace.

    Each emoticon is escaped and the alternatives are wrapped in look-arounds,
    so a match requires whitespace on both sides but does not consume it.

    Args:
        emoticons: iterable of literal emoticon strings.

    Returns:
        The pattern as a string (not compiled).
    """
    alternatives = "|".join(map(regex.escape, emoticons))
    # Raw strings keep "\s" a regex escape rather than an invalid string escape.
    return r"(?<=\s)(" + alternatives + r")(?=\s)"

def replace_emoticons(tweets, pattern, tag):
    """Replace every match of ``pattern`` in the ``"tweet"`` column with ``tag``.

    Each tweet is wrapped in one space on either side while matching, so that
    whitespace-delimited patterns also match at the very start and end of the
    text. The two helper spaces are removed again afterwards, so repeated
    calls no longer pile up leading and trailing blanks.

    Args:
        tweets: DataFrame with a string ``"tweet"`` column.
        pattern: regex source string or compiled pattern.
        tag: replacement text.

    Returns:
        The same DataFrame, with the column reassigned.
    """
    compiled = regex.compile(pattern)

    def _swap(text):
        replaced = compiled.sub(tag, " " + text + " ")
        # Remove at most one blank per side: exactly the padding added above.
        if replaced.startswith(" "):
            replaced = replaced[1:]
        if replaced.endswith(" "):
            replaced = replaced[:-1]
        return replaced

    tweets["tweet"] = tweets["tweet"].apply(_swap)
    return tweets


In [5]:
def truncating(dataframe):
  """Drop rows with empty token sequences and pad the rest to the mean length.

  The target length is the mean number of tokens over the non-empty rows,
  rounded up. Sequences are zero-padded / cut at the end ('post').

  Args:
    dataframe: DataFrame with a 'token' column of integer sequences. It is
      not modified; any index is accepted.

  Returns:
    (frame, padded): the non-empty rows without the 'token' (and 'count')
    columns, and the array returned by pad_sequences for those rows.

  Raises:
    ValueError: if no row has a non-empty token sequence.
  """
  # Element-wise lengths; positional, so it works for any index labels.
  token_counts = dataframe['token'].apply(len)
  non_empty = token_counts != 0
  kept_counts = token_counts[non_empty]
  if kept_counts.empty:
    raise ValueError("truncating() needs at least one non-empty token sequence")
  int_mean = math.ceil(kept_counts.sum() / len(kept_counts))
  postpad_maxlen_posttrunc = pad_sequences(dataframe.loc[non_empty, 'token'].tolist(), padding = 'post', maxlen = int_mean, truncating = 'post')
  # 'count' is dropped as well (when present) so the returned columns stay
  # the same as before this function stopped adding a helper column.
  truncated = dataframe.loc[non_empty].drop(columns=['token', 'count'], errors='ignore')
  return truncated, postpad_maxlen_posttrunc

In [6]:
def Preprocess(dataframe):
  """Run the full text pipeline on a tweets DataFrame.

  Steps: tag emoticons, clean the text, tokenize, stem, convert tokens to
  integer sequences, then drop empty rows and pad.

  Args:
    dataframe: DataFrame with a string 'tweet' column. It is copied first, so
      the caller's frame keeps its raw text and the function can be run again
      on the same input.

  Returns:
    (truncated_data, postpad_maxlen_posttrunc, word_index): the processed
    rows, the padded sequences for those rows, and the fitted tokenizer's
    word-to-index mapping.
  """
  data = TwitterData_Initialize()
  # The stages below reassign the 'tweet' column on the frame they are given;
  # working on a copy keeps that from leaking into the caller's DataFrame.
  data.initialize(dataframe.copy())
  data.processed_data = replace_emoticons(data.processed_data, make_emoticon_pattern(positive_emoticons), 'positive_emoticons')
  data.processed_data = replace_emoticons(data.processed_data, make_emoticon_pattern(negative_emoticons), 'negative_emoticons')
  data = TwitterData_Cleansing(data)
  data.cleanup(TwitterCleanuper())
  data = TwitterData_TokenStem(data)
  # Tokenize before stemming: stem() works on the token lists.
  data.tokenize()
  data.stem()
  data = TwitterData_Padding(data)
  data.padding()
  word_index = data.tokenizer.word_index
  truncated_data, postpad_maxlen_posttrunc = truncating(data.processed_data)
  return truncated_data, postpad_maxlen_posttrunc, word_index

In [7]:
def construct_embedding_matrix(glove_file, word_index, EMBEDDING_VECTOR_LENGTH = 100 ):
    """Build an embedding matrix for ``word_index`` from a GloVe-style text file.

    Each line of the file is expected to hold a word followed by its vector
    components, separated by whitespace. Blank lines are skipped.

    Args:
        glove_file: path to the UTF-8 vector file.
        word_index: mapping from word to integer row index.
        EMBEDDING_VECTOR_LENGTH: number of columns; longer vectors are cut to
            this length.

    Returns:
        Array of shape ``(len(word_index) + 1, EMBEDDING_VECTOR_LENGTH)``.
        Rows of out-of-vocabulary words (words missing from the file) stay
        all zeros.

    Raises:
        ValueError: if a stored vector is shorter than
            ``EMBEDDING_VECTOR_LENGTH``.
    """
    embedding_dict = {}
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            values = line.split()
            if len(values) < 2:
                # Blank line, or a word without any vector components.
                continue
            word = values[0]
            if word in word_index:
                embedding_dict[word] = np.asarray(values[1:], 'float32')

    # One extra row: presumably word_index is 1-based (as produced by the
    # Keras Tokenizer), leaving row 0 as an all-zero row - TODO confirm.
    num_words = len(word_index) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_VECTOR_LENGTH))

    for word, i in tqdm.tqdm(word_index.items()):
        if i >= num_words:
            continue
        vect = embedding_dict.get(word)
        if vect is None:
            continue
        if len(vect) < EMBEDDING_VECTOR_LENGTH:
            raise ValueError(
                "vector for %r has %d components, expected at least %d"
                % (word, len(vect), EMBEDDING_VECTOR_LENGTH))
        embedding_matrix[i] = vect[:EMBEDDING_VECTOR_LENGTH]
    return embedding_matrix

def Prediction(embedding_matrix, postpad_maxlen_posttrunc, weight_file):
  """Rebuild the bidirectional-LSTM classifier, load its weights and predict.

  Args:
    embedding_matrix: 2-D array used as the frozen Embedding weights; its
      first dimension sets the vocabulary size.
    postpad_maxlen_posttrunc: padded integer sequences to classify.
    weight_file: path handed to ``load_weights``.

  Returns:
    The model's predictions rounded with ``np.round``.
  """
  EMBEDDING_DIM = 100
  vocab_size = embedding_matrix.shape[0]

  # Layer stack, listed in the order the weights file is loaded against.
  layers = [
    Embedding(vocab_size, EMBEDDING_DIM, weights = [embedding_matrix], trainable = False),
    Bidirectional(LSTM(EMBEDDING_DIM, dropout=0.1, return_sequences=True)),
    Bidirectional(LSTM(EMBEDDING_DIM)),
    Dense(24, activation='relu'),
    Dense(24, activation='relu'),
    Dense(1, activation='sigmoid'),
  ]
  classifier = Sequential(layers)
  classifier.compile(
    loss = 'binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.01),
    metrics = ['accuracy'],
  )
  classifier.load_weights(weight_file)
  return np.round(classifier.predict(postpad_maxlen_posttrunc))

def modelpred(dataframe,postpad_maxlen_posttrunc,path):
  """Load a saved Keras model from ``path`` and return rounded predictions.

  Args:
    dataframe: unused; kept so existing calls keep working.
    postpad_maxlen_posttrunc: padded integer sequences to classify.
    path: location of the saved model passed to ``keras.models.load_model``.

  Returns:
    The model's predictions rounded with ``np.round``.
  """
  model = keras.models.load_model(path)
  # Predict with the model loaded above; ``model2`` only exists as a local
  # inside Prediction(), so referring to it here raised NameError.
  prediction = model.predict(postpad_maxlen_posttrunc)
  return np.round(prediction)

In [8]:
def Tweets_Analysis(dataframe, model_path='/content/drive/My Drive/COMP3359 Project/model.h5'):
  """Preprocess tweets, classify them with a saved model and attach the labels.

  Args:
    dataframe: DataFrame with a string 'tweet' column.
    model_path: saved Keras model to load; defaults to the project's model
      file on the mounted Google Drive.

  Returns:
    The processed rows with an added 'predict' column holding the rounded
    model output.
  """
  Processed_data, x_test, word_index = Preprocess(dataframe)
  model = keras.models.load_model(model_path)
  predicts = model.predict(x_test)
  # Flatten the (n, 1) model output so it is stored as a plain 1-D column.
  Processed_data['predict'] = np.round(predicts).reshape(-1)
  return Processed_data

In [12]:
# Run the whole pipeline and keep the annotated frame in `ans`.
# NOTE(review): `data` is not assigned in any cell shown here - presumably the
# tweets DataFrame was loaded in a cell that is no longer in the notebook;
# confirm and add the load step so a fresh kernel can run this cell.
ans=Tweets_Analysis(data)  # input the data frame here



In [14]:
# NOTE(review): `Processed_data` and `prediction` are only assigned as local
# variables inside functions in the visible cells, never at notebook level, so
# this cell presumably depends on state left over from an earlier session -
# confirm. Tweets_Analysis already returns the frame with a 'predict' column.
Processed_data['predict'] = np.round(prediction)
Processed_data

Unnamed: 0,target,tweet,predict
0,0,"[aww, that, bummer, shoulda, got, david, carr,...",0.0
1,0,"[upset, cant, updat, facebook, text, might, cr...",0.0
2,0,"[dive, mani, time, ball, manag, save, rest, go...",0.0
3,0,"[whole, bodi, feel, itchi, like, fire]",0.0
4,0,"[behav, im, mad, cant, see]",0.0
...,...,...,...
1599995,4,"[woke, school, best, feel, ever]",1.0
1599996,4,"[thewdbcom, cool, hear, old, walt, interview, ...",1.0
1599997,4,"[readi, mojo, makeov, ask, detail]",1.0
1599998,4,"[happi, birthday, boo, time, tupac, amaru, sha...",1.0


In [46]:
# Preview the first rows of the annotated result (at most 50).
ans.head(50)

Unnamed: 0,target,tweet,predict
0,0,"[aww, that, bummer, shoulda, got, david, carr,...",0.0
1,0,"[upset, cant, updat, facebook, text, might, cr...",0.0
2,0,"[dive, mani, time, ball, manag, save, rest, go...",0.0
3,0,"[whole, bodi, feel, itchi, like, fire]",0.0
4,0,"[behav, im, mad, cant, see]",0.0
5,0,"[whole, crew]",1.0
6,0,"[need, hug]",0.0
7,0,"[hey, long, time, see, ye, rain, bit, bit, lol...",0.0
8,0,"[nope, didnt]",0.0
9,0,"[que, muera]",0.0
