<a href="https://colab.research.google.com/github/benjamin-carter/NLP---preproc/blob/master/A6_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assignment 6: Twitter pre-processing and categorization


Connect to Google Drive to access the data

In [0]:
# Mount Google Drive at /content/gdrive; the data paths used later in this
# notebook are located under that mount point.
from google.colab import drive

drive.mount("/content/gdrive", force_remount=True)

In [0]:
# List the Drive folder that the input CSV is read from. The explicit `!`
# shell escape replaces the bare `ls`, which relied on IPython automagic
# (only applied to single-line cells, and it can be switched off).
!ls "/content/gdrive/My Drive/Current"

## Add necessary packages for analysis

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize  # Returns split words, while tf vectorizes after splitting.

!pip install pyspellchecker
from spellchecker import SpellChecker

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass
import tensorflow as tf

!pip install tensorflow-hub
!pip install tfds-nightly
import tensorflow_hub as hub
import tensorflow_datasets as tfds

# Report the TensorFlow / TF-Hub versions, eager-execution status and whether
# TensorFlow lists any GPU device.
gpu_devices = tf.config.experimental.list_physical_devices("GPU")
for caption, detail in (
    ("Version: ", tf.__version__),
    ("Eager mode: ", tf.executing_eagerly()),
    ("Hub version: ", hub.__version__),
    ("GPU is", "available" if gpu_devices else "NOT AVAILABLE"),
):
    print(caption, detail)

## Retrieve Data from Google Drive

In [0]:
# Read the tweet data set from the mounted Drive folder into a DataFrame.
TWEETS_CSV = "/content/gdrive/My Drive/Current/uglywords_v1.csv"
df = pd.read_csv(TWEETS_CSV)


## Remove bad symbols and usernames

In [0]:
# Substrings deleted from every tweet. They are applied in list order, so an
# entry that contains an earlier entry (e.g. '\x89ÛÏ' after '\x89') can no
# longer match by the time it is reached.
# Changes from the previous list: a missing comma had fused 'åÊ' and
# '\x89ÛªS' into the single literal 'åÊ\x89ÛªS'. 'åÊ' is now handled only by
# the explicit space replacement in clean_tweet(), and the no-op '' entry and
# the duplicate '*' were dropped.
pop_list = ['#', '@', '*', '\x89Û_', '\x89Ûªt', '\x89', '\x89ÛÏ', '\x89Ûªm', '\x89ÛÓ', '\x89Ûª', '\x89Û÷', '~', 'ÛÏ',
            '\x89ÛªS', 'RAZEDåÊ', '\x89ÛÒ', '...', '??', '|', '_', ' - ', 'Ûªs', '[', ']', '`', '(', ')', '=', '&amp;', 'Ã©',
            "!!", "!?", "?!", 'Ûª', 'ÛÓ', '&', ',', "'", "!", "?", "├"]


def clean_tweet(raw):
    """Return a lower-cased copy of ``raw`` without mentions, URL tail and junk symbols.

    Steps, in order:
      1. Drop whitespace-separated words that start with '@' (optionally
         preceded by '.', '(' or a quote character).
      2. Tokenize with ``word_tokenize`` and cut the tweet at the first
         'http' or 'https' token, discarding everything from there on.
      3. Delete every ``pop_list`` substring, replace 'åÊ' with a space and
         lower-case the result.

    Args:
        raw: The tweet text as a string.

    Returns:
        The cleaned tweet as a single string (tokens separated by spaces).
    """
    # word_tokenize splits "@user" into '@' and 'user', so a token-level
    # "starts with @" test only ever removes the '@' sign and keeps the
    # username. Filter mentions on the raw whitespace-separated words instead.
    words = [word for word in raw.split()
             if not word.lstrip('.("\'').startswith('@')]
    tokens = word_tokenize(' '.join(words))

    # Cut at whichever URL scheme token comes first. Testing 'http' before
    # 'https' used to cut at a later 'http' when a tweet contained both.
    url_starts = [pos for pos, token in enumerate(tokens) if token in ('http', 'https')]
    if url_starts:
        tokens = tokens[:url_starts[0]]

    text = ' '.join(tokens)
    for pop in pop_list:
        text = text.replace(pop, '')
    return text.replace('åÊ', ' ').lower()


# The raw text is read from column position 1, which the rest of the notebook
# treats as the 'Tweet' column -- NOTE(review): confirm against the CSV header.
clean_tweets = [clean_tweet(raw) for raw in df.iloc[:, 1]]
df['Tweet'] = clean_tweets
df['Tweet'] = df['Tweet'].astype(str)


## Spell Check, Stem content, Spell Check again, and Remove Stop Words

In [0]:
spell = SpellChecker()
porter_stemmer = PorterStemmer()
# Build the stop-word set once instead of calling stopwords.words('english')
# for every single token; set membership is also a constant-time lookup.
english_stopwords = set(stopwords.words('english'))
# Remembers the correction chosen for each distinct word so repeated words
# are only looked up once.
spelling_cache = {}


def correct_spelling(word):
    """Return the spell checker's suggestion for ``word``, or ``word`` itself.

    ``spell.correction`` may return None when it has no candidate; falling
    back to the original word keeps the later ``' '.join`` from raising
    TypeError.
    """
    if word not in spelling_cache:
        spelling_cache[word] = spell.correction(word) or word
    return spelling_cache[word]


def rewrite_tweets(frame, transform):
    """Tokenize each text in column position 1 of ``frame`` and rebuild it.

    Args:
        frame: DataFrame whose column at position 1 holds the tweet text
            (assumed to be the 'Tweet' column -- TODO confirm).
        transform: Callable taking a list of tokens and returning the list of
            tokens to keep.

    Returns:
        A list with one space-joined string per row of ``frame``.
    """
    return [' '.join(transform(word_tokenize(text))) for text in frame.iloc[:, 1]]


def spell_check_tokens(tokens):
    """Apply ``correct_spelling`` to every token."""
    return [correct_spelling(token) for token in tokens]


def stem_tokens(tokens):
    """Apply the Porter stemmer to every token."""
    return [porter_stemmer.stem(token) for token in tokens]


def drop_stopwords(tokens):
    """Keep only tokens that are not English stop words."""
    return [token for token in tokens if token not in english_stopwords]


# Pipeline order: spell check -> stem -> spell check again -> remove stop
# words. Each stage writes its result back to df['Tweet'] before the next
# stage reads the column again.
for stage in (spell_check_tokens, stem_tokens, spell_check_tokens, drop_stopwords):
    clean_tweets = rewrite_tweets(df, stage)
    df['Tweet'] = clean_tweets
    df['Tweet'] = df['Tweet'].astype(str)

# Code for Pretrained Model

## Converting data into Tensor format

In [0]:
def frame_to_dataset(frame):
    """Wrap a frame's 'Tweet' strings and 'Label' integers in a tf.data.Dataset.

    Each dataset element is a (tweet text as tf.string, label as tf.int32) pair.
    """
    texts = tf.cast(frame['Tweet'].values, tf.string)
    targets = tf.cast(frame['Label'].values, tf.int32)
    return tf.data.Dataset.from_tensor_slices((texts, targets))


# Keep the first 749 rows, shuffle them, then use the first 99 shuffled rows
# as the test split and the remainder as the training split.
df_new = df[:749].sample(frac=1).reset_index(drop=True)
df_test, df_train = df_new[:99], df_new[99:]

train_data = frame_to_dataset(df_train)
test_data = frame_to_dataset(df_test)

# Build Pre-Trained Model and set the parameters, optimization, and loss schemes.

In [0]:
# Pull one batch of ten examples out of the training set (useful for probing
# the hub layer by hand).
train_examples_batch, train_labels_batch = next(iter(train_data.batch(10)))

# Text-embedding layer loaded from TF-Hub; trainable=True lets its weights be
# updated together with the dense layers.
embedding = "https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1"
hub_layer = hub.KerasLayer(
    embedding,
    input_shape=[],
    dtype=tf.string,
    trainable=True,
)

# Embedding -> Dense(32) -> light dropout -> Dense(16) -> single logit.
model = tf.keras.Sequential([
    hub_layer,
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(.01),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1),
])

# The final layer has no activation, so the loss is told it receives logits.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

## Fit Model

In [0]:

# Train for 20 epochs; the held-out split is also used as validation data.
train_batches = train_data.shuffle(1000).batch(512)
validation_batches = test_data.batch(512)
history = model.fit(
    train_batches,
    validation_data=validation_batches,
    epochs=20,
    verbose=1,
)

## Display Results of Model with Test Set

In [0]:
# Score the model on the test split, then print each metric to three decimals.
results = model.evaluate(test_data.batch(50), verbose=2)

for metric_pair in zip(model.metrics_names, results):
    print("%s: %.3f" % metric_pair)

In [0]:
# Sum of column 0 of the test split divided by its row count -- presumably the
# share of positive labels, assuming column 0 holds the 0/1 label (TODO confirm).
test_rows = df_test.to_numpy()
np.sum(test_rows[:, 0]) / len(df_test)

# Code for In-House Model

## Encode Text into Numerical Format

In [0]:
# Tokenize every cleaned tweet exactly once. The vocabulary is built from
# these same per-tweet tokens, so every token looked up below is guaranteed
# to have an id.
tokenized_tweets = [word_tokenize(tweet) for tweet in clean_tweets]

# Sorted vocabulary mapped to integer ids starting at 1. Id 0 is reserved for
# padding (the zero-filled cells of numb_tweets); this is why the embedding
# layer needs len(vocab) + 1 input rows. Enumerating the sorted vocabulary
# replaces LabelEncoder, which was never imported in this notebook.
vocab = np.array(sorted({token for tokens in tokenized_tweets for token in tokens}))
word_dict = {word: index for index, word in enumerate(vocab, start=1)}

# Every tweet becomes one row, right-padded with zeros to the longest tweet.
max_length = max((len(tokens) for tokens in tokenized_tweets), default=0)
numb_tweets = np.zeros((len(clean_tweets), max_length))
print(max_length, numb_tweets.shape)
for row, tokens in enumerate(tokenized_tweets):
    if tokens:
        # Encode all tokens; the previous range(len(temp) - 1) silently
        # dropped the last token of every tweet.
        numb_tweets[row, :len(tokens)] = [word_dict[token] for token in tokens]

# Column 0 holds the label, the remaining columns hold the encoded tweet.
labels = df['Label'].to_numpy()
df_numpy = np.column_stack((labels, numb_tweets))

# Keep the first 738 rows, shuffle them in place, and hold out 99 for testing.
# NOTE(review): the pretrained-model section slices 749 rows -- confirm which
# count is intended.
df_new = df_numpy[:738, :]
np.random.shuffle(df_new)
df_test = df_new[:99, :]
df_train = df_new[99:, :]

feat_train = df_train[:, 1:].astype(int)
label_train = df_train[:, 0].astype(int)

feat_test = df_test[:, 1:].astype(int)
label_test = df_test[:, 0].astype(int)

train_data = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(feat_train, tf.int64),
            tf.cast(label_train, tf.int64)
        )
    )
)
test_data = (
    tf.data.Dataset.from_tensor_slices(
        (
            tf.cast(feat_test, tf.int64),
            tf.cast(label_test, tf.int64)
        )
    )
)

## Build Sequential Model and set parameters, optimization and loss schemes.


In [0]:
# Learned 50-dimensional embedding over the encoded tweets, flattened and fed
# through a stack of progressively narrower dense layers with light dropout,
# ending in a single logit.
vocab_rows = len(vocab) + 1
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_rows, 50, input_length=max_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(vocab_rows, activation='relu'),
    tf.keras.layers.Dropout(.1),
    tf.keras.layers.Dense(500, activation='relu'),
    tf.keras.layers.Dropout(.05),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dropout(.01),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1),
])

# The final layer has no activation, so the loss is told it receives logits.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    metrics=['accuracy'],
)



## Fit Model

In [0]:
# Train for 20 epochs; the held-out split is also used as validation data.
train_batches = train_data.shuffle(1000).batch(256)
validation_batches = test_data.batch(256)
history = model.fit(
    train_batches,
    validation_data=validation_batches,
    epochs=20,
    verbose=1,
)

## Display Results of Model with Test Set

In [0]:
# Score the model on the test split. Plain batch() is used instead of
# padded_batch(): the encoded tweets were already written into a fixed-width
# array, so there is nothing left to pad, and padded_batch() without
# padded_shapes is rejected by early TF 2.x releases. This also matches the
# batching used for validation during training.
results = model.evaluate(test_data.batch(256), verbose = 2)
for name, value in zip(model.metrics_names, results):
  print("%s: %.3f" % (name,value))

In [0]:
# Mean of the integer test labels (their sum divided by their count).
label_test.mean()