In [None]:
# Imports all needed libraries

import json
import csv
import gzip

import re
import texthero as hero
from texthero import preprocessing
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import matplotlib.pyplot as plt
import scipy
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Flatten, Dropout, PReLU, GRU, Bidirectional
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
import pickle

# Report the TensorFlow version, whether eager execution is active, and the
# Keras version, so the run environment is recorded in the cell output.
print("Version: ", tf.__version__)
print("Eager mode: ", tf.executing_eagerly())
print("Keras version:", keras.__version__)

# Creation and Training/Testing of Model

In [None]:
# Reads the labelled training/testing dataset from a CSV path typed by the user,
# then keeps only the columns the rest of the notebook works with.

df = pd.read_csv(input('input training/testing data'), encoding="UTF-8")

df = df.drop(columns=['Num', 'Date', 'Query', 'User_ID'])

In [None]:
# Render the frame (after the metadata columns were dropped) for a visual check
display(df)

In [None]:
# Shows a bar chart of how the data is divided between the sentiment classes

# One count per distinct value found in the Sentiment column
val_count = df.Sentiment.value_counts()
plt.figure(figsize=(8,4))
plt.bar(val_count.index, val_count.values)
plt.xlabel('Sentiment')
plt.ylabel('Number of Tweets')
# NOTE(review): the x-limits are given as the labels 'Negative'/'Positive',
# which presumably only lines up with the bars if the Sentiment column holds
# those strings; the same column is later used as the binary training target,
# so confirm its dtype. The y-axis is capped at a fixed 200000.
plt.axis(['Negative', 'Positive', 0, 200000])
plt.title("Sentiment Data Distribution")

# Writes the current figure to a JPEG in the working directory
plt.savefig('training_data_chart.jpeg')

In [None]:
# Strips @username mentions from a tweet (they are deleted, not replaced)

def pre_cleaning(text):
    """Remove Twitter-style ``@mention`` tokens from ``text``.

    A mention is an ``@`` that is not preceded by a word character, followed
    by one or more word characters, so e-mail addresses such as ``a@b.com``
    are left alone. Each mention is replaced by the empty string; surrounding
    whitespace is not touched.

    Args:
        text: The raw tweet text. Non-string values (e.g. ``None`` or a NaN
            float from a missing cell) are returned unchanged so that a later
            ``fillna`` step can deal with them instead of this function
            raising ``TypeError``.

    Returns:
        The text with mentions removed, or the original value when it is not
        a string.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r"\B@\w+", "", text)

# Uses TextHero package to (in this order):
# 1) Remove non-assigned values
# 2) Lowercase all text
# 3) Remove urls
# 4) Remove digits
# 5) Remove punctuation
# 6) Remove stopwords
# 7) Remove whitespace
#
# URL removal has to run BEFORE punctuation removal: once "://", "." and "/"
# have been stripped, a link is split into ordinary-looking tokens and the URL
# matcher no longer recognises it, leaving fragments such as "http" and "co"
# in the text. Whitespace cleanup stays last so it can collapse the gaps left
# behind by every earlier step.

custom_pipeline = [preprocessing.fillna,
                   preprocessing.lowercase,
                   preprocessing.remove_urls,
                   preprocessing.remove_digits,
                   preprocessing.remove_punctuation,
                   preprocessing.remove_stopwords,
                   preprocessing.remove_whitespace,
                  ]

# Strip @mentions first, then run the TextHero pipeline over the result
df.Text = df.Text.apply(pre_cleaning)
df['Text'] = hero.clean(df['Text'], custom_pipeline)
# Shuffle the rows; the seed keeps the order (and therefore the later
# train/test split) reproducible across kernel restarts.
df = df.sample(frac=1, random_state=42)

In [None]:
# Show full cell contents instead of truncating long tweets.
# None is the supported "no limit" value; the old -1 sentinel is deprecated
# and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
print(df.head(10))

In [None]:
# Uses natural language toolkit lemmatization library to lemmatize data

# One shared instance is enough; it was previously constructed twice in a row
# (WordNetLemmatizer and nltk.stem.WordNetLemmatizer are the same class).
lemmatizer = WordNetLemmatizer()

# Gets part of speech

def get_wordnet_pos(word):
    """Map a single word to the WordNet part-of-speech constant for it.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag selects the WordNet category: J -> adjective,
    V -> verb, R -> adverb. Anything else, including N, is treated as a noun.
    """
    penn_tag = nltk.pos_tag([word])[0][1]
    initial = penn_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag fall back to noun
    return wordnet.NOUN

# Lemmatizes

def lemmatize_text(text):
    """Lemmatize every whitespace-separated token of ``text``.

    Each token is lemmatized with the module-level ``lemmatizer`` using the
    part of speech reported by ``get_wordnet_pos``; the results are joined
    back together with single spaces.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return " ".join(lemmas)

# Replace every tweet with its lemmatized form, then preview the first rows
df['Text'] = df['Text'].apply(lemmatize_text)

print(df.head())

In [None]:
# Holds out 20% of the rows for testing; the remaining 80% is used for training

test_size = 0.2
train_data, test_data = train_test_split(
    df,
    test_size=test_size,
    shuffle=True,
    random_state=42,
)
print("Test Data size", test_data.shape[0])

In [None]:
# Turns the cleaned text into TF-IDF vectors. The vocabulary is learned from
# the training rows only and then applied to both splits.

vectorizer = TfidfVectorizer(
    max_features=7500,
    min_df=20,
    max_df=0.3,
    ngram_range=(1, 2),
)
vectorizer.fit(train_data['Text'].tolist())
x_train = vectorizer.transform(train_data['Text'].tolist())
x_test = vectorizer.transform(test_data['Text'].tolist())

print("Training X Shape:", x_train.shape)
print("Testing X Shape:", x_test.shape)

In [None]:
# Saves vectorizer

# The context manager guarantees the file is flushed and closed even if
# pickling fails; the previous bare open() left the handle dangling.
with open("vectorizer.pickle", "wb") as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

In [None]:
# Target labels for the two splits, taken from the Sentiment column

Y_train, Y_test = train_data['Sentiment'], test_data['Sentiment']

In [None]:
# Converts the sparse TF-IDF matrices to dense arrays and inserts a middle
# axis of length 1, giving the (rows, 1, features) layout the LSTM input expects

X_train = x_train.toarray().reshape(x_train.shape[0], 1, x_train.shape[1])
X_test = x_test.toarray().reshape(x_test.shape[0], 1, x_test.shape[1])

In [None]:
# Builds the Keras model: two stacked 32-unit LSTM layers, each followed by a
# PReLU activation and 50% dropout, ending in a single sigmoid output unit

Model_Final = Sequential([
    LSTM(32, input_shape=(1, X_train.shape[2]), return_sequences=True),
    PReLU(),
    Dropout(0.5),

    LSTM(32),
    PReLU(),
    Dropout(0.5),

    Dense(1, activation='sigmoid'),
])

opt = keras.optimizers.Adam(0.025)

Model_Final.compile(
    optimizer=opt,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Model_Final.summary()

In [None]:
# Training: 10 epochs, batches of 256, with 20% of the training rows held
# back for validation

history = Model_Final.fit(
    x=X_train,
    y=Y_train,
    batch_size=256,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

In [None]:
# Writes model history to csv: one row per recorded metric, one column per
# epoch (the metric names themselves are not written)

Model_Hist = 'Model_Hist.csv'

with open(Model_Hist, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerows(history.history.values())

In [None]:
# Graphs model history using matplotlib

# The model is compiled with metrics=['accuracy']. Current tf.keras records
# that metric under 'accuracy'/'val_accuracy', while older Keras releases used
# 'acc'/'val_acc'. Pick whichever key is present so the cell does not fail
# with a KeyError.
acc_key = 'accuracy' if 'accuracy' in history.history else 'acc'
acc = history.history[acc_key]
val_acc = history.history['val_' + acc_key]
loss = history.history['loss']
val_loss = history.history['val_loss']
plt.figure(figsize=(8, 8))

# Top panel: accuracy per epoch
plt.subplot(2, 1, 1)

plt.plot(acc, label='Training Accuracy')
plt.plot(val_acc, label='Validation Accuracy')
plt.legend(loc='lower right')
plt.ylabel('Accuracy')
# Keep the automatically chosen lower bound but pin the top of the axis at 1
plt.ylim([min(plt.ylim()),1])
plt.title('Training and Validation Accuracy')

# Bottom panel: loss per epoch
plt.subplot(2, 1, 2)

plt.plot(loss, label='Training Loss')
plt.plot(val_loss, label='Validation Loss')
plt.legend(loc='upper right')
plt.ylabel('Cross Entropy')
plt.ylim([0,1.0])
plt.title('Training and Validation Loss')
plt.xlabel('epoch')
plt.show()

In [None]:
# Saves the model and its parameters to disk.
# "Full_Model" is a relative path, so the artifact is written under the
# current working directory.

Model_Final.save("Full_Model")

In [None]:
# Testing: evaluates the trained model on the held-out test tensors in
# batches of 256. The returned metrics are the last expression of the cell,
# so the notebook shows them as the cell output.

Model_Final.evaluate(X_test, Y_test, batch_size=256)

# Use Model with Target Data

In [None]:
# Defines a function to load the Target Tweets (JSON Lines) file

def load_jsonl(input_path) -> list:
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of the file is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line at the end of the
    file) are skipped instead of aborting the load with a decode error.

    Args:
        input_path: Path of the UTF-8 encoded ``.jsonl`` file.

    Returns:
        The parsed records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            # strip() removes the line terminator and stray whitespace. The
            # previous rstrip('\n|\r') also treated '|' as a character to
            # strip, which was never intended.
            line = line.strip()
            if not line:
                continue
            data.append(json.loads(line))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [None]:
# Used an independent twitter hydrator app before this to get fully hydrated tweets from dataset
# This code retrieves the fully hydrated tweets from a local file

file = load_jsonl(input('input target dataset'))

# Creates dataframe from date and text values for each tweet.
# Records that lack 'created_at' or 'full_text' yield None in that column.
db_cols = ['Dates', 'Uncleaned_Text']
db_data = [[f.get('created_at'), f.get("full_text")] for f in file]

# The frame is bound to Twitter_Target_df because that is the name every
# later cell reads; it used to be assigned to `df`, which left
# Twitter_Target_df undefined and overwrote the training frame.
Twitter_Target_df = pd.DataFrame(db_data, columns=db_cols)

In [None]:
# Reloads vectorizer and model

# SECURITY: unpickling runs arbitrary code embedded in the file. Only load a
# vectorizer file that this notebook (or another trusted source) produced.
# The context manager closes the handle, which the bare open() never did.
with open(input('input vectorizer file path'), 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)
model = keras.models.load_model(input('input model file path'))

In [None]:
# Function to turn a tweet timestamp into a "<Mon> <day>/<year>" label

def date_parser(text, year='2020'):
    """Reduce a tweet timestamp to a per-day label such as ``"Mar 11/2020"``.

    The input is expected to start with a weekday name, a month abbreviation
    and a day number separated by single whitespace characters, e.g.
    ``"Wed Mar 11 20:19:24 +0000 2020"``. The month abbreviation is kept in
    the label: previously only the day number was captured, so the same day
    of different months collapsed into one label and no label ever contained
    a month name to filter on.

    Args:
        text: Timestamp string in the layout described above.
        year: Year appended after the slash. Defaults to ``'2020'``, the
            value that used to be hard-coded.

    Returns:
        ``"<month> <day>/<year>"`` as a string.

    Raises:
        IndexError: If ``text`` does not start with the expected layout.
    """
    # Raw string so that \s is a regex escape rather than a (deprecated)
    # string escape; the group captures "<month> <day>".
    p = re.compile(r'^[A-Za-z]+\s([A-Za-z]+\s[0-9]+)')
    return str(p.findall(text)[0]) + '/' + str(year)

In [None]:
# Overwrites the Dates column with the label produced by date_parser

Twitter_Target_df['Dates'] = Twitter_Target_df['Dates'].apply(date_parser)

In [None]:
# Cleans the target tweets exactly like the training text: strip @mentions,
# then run the shared TextHero pipeline. The raw text stays in Uncleaned_Text.

Twitter_Target_df['Cleaned_Text'] = hero.clean(
    Twitter_Target_df['Uncleaned_Text'].apply(pre_cleaning),
    custom_pipeline,
)

In [None]:
# Lemmatizes the target dataset text

# Fresh lemmatizer for this section; constructing it twice in a row (as
# before) had no effect beyond the extra work.
lemmatizer = WordNetLemmatizer()

Twitter_Target_df['Cleaned_Text'] = Twitter_Target_df.Cleaned_Text.apply(lemmatize_text)

In [None]:
# Vectorizes the cleaned target text with the reloaded vectorizer, makes it
# dense and adds the length-1 middle axis used for the model input

Tweets = vectorizer.transform(Twitter_Target_df['Cleaned_Text'].tolist()).toarray()

Tweets = Tweets.reshape(Tweets.shape[0], 1, Tweets.shape[1])

print("Tweets Shape:", Tweets.shape)

In [None]:
# Makes the predictions!

# Use `model`, the network reloaded from disk in the "Reloads vectorizer and
# model" cell. The in-memory Model_Final only exists if the training section
# was run in the same kernel, and it bypassed the file the user selected.
Predictions = model.predict(Tweets)

# predict() returns one column per output unit; flatten the single-unit
# output so that exactly one score is stored per tweet.
Twitter_Target_df['Sentiment_Confidence'] = Predictions.ravel()

Twitter_Target_df.head(10)

In [None]:
# Labels each tweet 'Positive' (score > 0.5) or 'Negative' (score <= 0.5)
# based on its predicted sentiment score.

confidences = Twitter_Target_df['Sentiment_Confidence']

# A NaN score satisfies neither comparison. The previous if/elif silently
# skipped such rows, producing a label list shorter than the frame and an
# opaque length-mismatch error on assignment. Fail with a clear message.
if confidences.isna().any():
    raise ValueError('Sentiment_Confidence contains NaN values; cannot assign sentiment labels')

Sents = ['Positive' if value > 0.5 else 'Negative' for value in confidences]

Twitter_Target_df['Sentiment'] = Sents
Twitter_Target_df.head(10)

In [None]:
# Writes the labelled target tweets to a CSV path typed by the user.
# The prompt text was previously passed as the file name itself, so the data
# always landed in a file literally called "input output file name".
Twitter_Target_df.to_csv(input('input output file name'))

# Representation of Results

In [None]:
# Groups the predicted sentiment labels by date: {date label: [sentiment, ...]}

tdict = {}
for date, sentiment in zip(Twitter_Target_df['Dates'], Twitter_Target_df['Sentiment']):
    # setdefault creates the list on first sight of a date and appends exactly
    # once. The earlier "create with [sentiment], then append" form recorded
    # the first tweet of every date twice.
    tdict.setdefault(date, []).append(sentiment)

In [None]:
# Finds number of tweets per day and displays

def Num_Of_Tweets_Per_Day(Dataset):
    """Return ``{key: number of items stored under that key}`` for ``Dataset``.

    ``Dataset`` maps each key to a sized collection; the result keeps the same
    keys in the same order.
    """
    return {day: len(entries) for day, entries in Dataset.items()}

# Count the tweets stored under each date and print one line per date
Nummin = Num_Of_Tweets_Per_Day(tdict)

for date, num in Nummin.items():
    print("".join([str(num), ' tweets on ', str(date)]))

In [None]:
# Computes, per date, the share of tweets labelled 'Positive'

def Sorter(Dataset):
    """Return the fraction of 'Positive' labels stored under each key.

    Every label other than the exact string ``'Positive'`` counts as negative.
    The value is a fraction between 0 and 1, not a percentage. A key with no
    labels raises ``ZeroDivisionError``.
    """
    odict = {}
    for day, labels in Dataset.items():
        labels = list(labels)
        positives = sum(1 for label in labels if label == 'Positive')
        odict[day] = positives / len(labels)
    return odict

# Accumulator for the per-date scores; filled in by a later cell
FINAL = {}

In [None]:
# Compute the positive share for every date in tdict and merge the result
# into the FINAL accumulator (existing keys are overwritten).
Fdict = Sorter(tdict)
FINAL.update(Fdict)

In [None]:
# Display the accumulated per-date scores as the cell output
FINAL

In [None]:
# Quick bar chart of the per-date scores: dates on x, positive share on y
plt.bar(list(FINAL.keys()), list(FINAL.values()))

In [None]:
# Writes daily sentiment scores out to csv, one "date,score" row per date

# The context manager flushes and closes the file (the bare open() never
# did, so rows could be lost), and newline='' stops the csv module from
# emitting blank lines between rows on Windows.
with open("FINAL.csv", "w", newline='') as final_file:
    w = csv.writer(final_file)
    w.writerows(FINAL.items())

In [None]:
# Rereads a previously written "date,score" csv into a dictionary

Wanted = {}

# The mode argument belongs to open(), not input(): the misplaced parenthesis
# made this cell a syntax error.
with open(input('input file path of wanted csv'), 'rt', newline='') as f:
    reader = csv.reader(f)
    for row in reader:
        # Skip blank rows (e.g. from files written without newline='')
        if not row:
            continue
        k, v = row
        # Values are kept as the strings read from the file
        Wanted[k] = v

In [None]:
# Sorts into data by month (for better visualization)

def Month_Sorter(Dataset, Month):
    """Return the entries of ``Dataset`` whose key contains ``Month``.

    The match is a plain substring test on the key, so ``Month`` is expected
    to be a fragment such as ``'Mar'`` that appears in the wanted keys.
    Entries keep their original order.
    """
    return {day: value for day, value in Dataset.items() if Month in day}

# One slice of FINAL per month, March through August, selected by the month
# abbreviation appearing in the key
(March_Data, April_Data, May_Data,
 June_Data, July_Data, August_Data) = (
    Month_Sorter(FINAL, month_abbr)
    for month_abbr in ('Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug')
)

In [None]:
# Bar chart of the final daily scores: one bar per date, height = share of
# tweets labelled positive

plt.bar(list(FINAL.keys()), list(FINAL.values()))