**References**

For inspiration, we consulted the following website, which discusses the use of Convolutional Neural Networks with PyTorch to analyse sentiment in Tweets: https://www.kaggle.com/youben/twitter-sentiment-analysis-using-cnn

The Twitter Sentiment Corpus used was accessed here: http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/

The following Python Notebook was used to help with CoreNLP setup: https://colab.research.google.com/github/stanfordnlp/stanza/blob/master/demo/Stanza_CoreNLP_Interface.ipynb

CoreNLP can be found at: https://stanfordnlp.github.io/CoreNLP/


# Mounting Google Drive


In [13]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Imports

In [12]:
import pandas as pd
import numpy as np
import re
import string
import datetime
import copy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
# ML Libraries
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Twitter
import tweepy

# Visualisation
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics

# Global Parameters
# Fetch the NLTK resources requested below and build the English stop-word set.
import nltk
nltk.download("stopwords")  # corpus read by stopwords.words('english') below
nltk.download('punkt')  # presumably the tokenizer models behind word_tokenize - TODO confirm
stop_words = set(stopwords.words('english'))  # a set, used for membership tests in preprocess_tweet_text
nltk.download('wordnet')  # NOTE(review): no WordNet use is visible in this notebook - confirm it is still needed

# CoreNLP Setup

In [11]:
!wget https://nlp.stanford.edu/software/stanford-corenlp-full-2018-10-05.zip https://nlp.stanford.edu/software/stanford-english-corenlp-2018-10-05-models.jar

In [None]:
# !unzip stanford-corenlp-full-2018-10-05.zip
!mv stanford-english-corenlp-2018-10-05-models.jar /content/drive/MyDrive/Colab\ Notebooks/cmhss/stanford-corenlp-full

In [5]:
# Install stanza; note that the prefix "!" is not needed if you are running in a terminal
!pip install stanza

# Import stanza
import stanza

In [10]:
import os

# Folder that receives the CoreNLP distribution fetched through Stanza's installer
corenlp_dir = './corenlp'
stanza.install_corenlp(dir=corenlp_dir)

# Publish the install location through the CORENLP_HOME environment variable
os.environ["CORENLP_HOME"] = corenlp_dir

In [9]:
# Examine the CoreNLP installation folder to make sure the installation is successful
!ls $CORENLP_HOME
 # Import client module
from stanza.server import CoreNLPClient

In [8]:
import time

# Build a CoreNLPClient with the tokenize and sentiment annotators, 4GB of memory,
# and an endpoint on local port 9002
client = CoreNLPClient(
    endpoint='http://localhost:9002',
    annotators=['tokenize','sentiment'],
    memory='4G',
    be_quiet=True,
)

# Launch the background server now and give it ten seconds to come up.
# Starting it explicitly is optional: by default the server is started when the
# first annotation is performed.
client.start()
time.sleep(10)

In [7]:
# Demo: annotate a two-sentence string, print each sentence's sentiment and the rounded mean score
text = "I don't understand british weather. Make up your mind!"
sentiment_properties = {
    'annotators': 'sentiment',
    'outputFormat': 'json',
}
result = client.annotate(text, properties=sentiment_properties)

total_sentiment = 0
for sentence in result["sentences"]:
    total_sentiment += int(sentence["sentimentValue"])
    sentence_words = " ".join(token["word"] for token in sentence["tokens"])
    print("{}: '{}': {} (Sentiment Value) {} (Sentiment)".format(
        sentence["index"],
        sentence_words,
        sentence["sentimentValue"],
        sentence["sentiment"]))
print("avg sentiment: ", round(total_sentiment/len(result["sentences"])))

# Functions

## Dataset

In [None]:
def load_dataset(cols, path='/content/drive/MyDrive/Colab Notebooks/cmhss/sentiment_dataset.csv', encoding='latin-1'):
    """Read the sentiment CSV and rename its columns.

    Parameters
    ----------
    cols : list of str
        Column names to assign, in file order. There must be one name per
        column in the file.
    path : str, optional
        Location of the CSV file. Defaults to the Google Drive location the
        notebook was written against; pass another path to run elsewhere.
    encoding : str, optional
        Text encoding handed to ``pandas.read_csv`` (default ``'latin-1'``).

    Returns
    -------
    pandas.DataFrame
        The loaded data with its columns renamed to ``cols``.

    Notes
    -----
    ``read_csv`` is called with its default header handling, so the first line
    of the file is consumed as a header row before the names are replaced.
    """
    dataset = pd.read_csv(path, encoding=encoding)
    dataset.columns = cols
    return dataset

def filter_columns(dataset, cols):
    """Drop every column named in ``cols`` from ``dataset`` and return it.

    The frame is modified in place; the returned object is the same frame
    that was passed in, not a copy.
    """
    for unwanted in cols:
        # pop() removes the column from the frame; the returned Series is discarded
        dataset.pop(unwanted)
    return dataset

## Processing Text

In [None]:
def remove_url_and_tags(tweet):
    """Strip URLs, @mentions and '#' characters from a tweet.

    Tokens starting with ``http``, ``https`` or ``www`` are removed up to the
    next whitespace. ``@name`` mentions are removed entirely, while a hashtag
    loses only its leading ``#`` (the word itself is kept). Surrounding
    whitespace is left untouched.
    """
    url_pattern = re.compile(r"http\S+|www\S+|https\S+", flags=re.MULTILINE)
    mention_or_hash_pattern = re.compile(r'\@\w+|\#')
    without_urls = url_pattern.sub('', tweet)
    return mention_or_hash_pattern.sub('', without_urls)

def preprocess_tweet_text(tweet):
    """Normalise a tweet into a space-joined string of content words.

    Steps, in order: lower-case the text, remove URLs, remove @mentions and
    '#' characters, delete all ``string.punctuation`` characters, tokenize with
    ``word_tokenize`` and drop tokens found in the module-level ``stop_words``.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).
    """
    # str.lower() returns a new string, so the result has to be re-bound.
    # Without this the stop-word filter (lower-case entries) misses "The", "I", ...
    tweet = tweet.lower()
    # URLs
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Hashtags and user tags
    tweet = re.sub(r'\@\w+|\#','', tweet)
    # Punctuation
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if w not in stop_words]
    return " ".join(filtered_words)


## TfIdf vectorisation

In [None]:
def get_feature_vector(train_fit):
    """Fit a TF-IDF vectorizer (sublinear term frequency) on ``train_fit`` and return it."""
    # fit() hands back the vectorizer itself, so it can be returned directly
    return TfidfVectorizer(sublinear_tf=True).fit(train_fit)

## CoreNLP Analysis

In [None]:
def corenlp_sentimentality(text):
    """Return the rounded mean CoreNLP sentiment score of ``text``.

    The text is sent to the module-level ``client``; each returned sentence
    carries a ``sentimentValue`` that is converted to ``int`` and averaged.

    Parameters
    ----------
    text : str
        Text to annotate.

    Returns
    -------
    int
        Mean sentence score rounded with Python's ``round`` (ties go to the
        even integer). When CoreNLP returns no sentences at all, for example
        for an empty string, the neutral score 2 is returned instead of
        dividing by zero.
    """
    # NOTE(review): only the sentiment output is read here; the 'ner' annotator
    # looks unnecessary - confirm before removing it.
    result = client.annotate(text,
                   properties={
                       'annotators': 'sentiment, ner',
                       'outputFormat': 'json',
                   })

    sentences = result["sentences"]
    if not sentences:
        # Nothing to average: report neutral rather than raising ZeroDivisionError
        return 2

    total_sentiment = sum(int(s["sentimentValue"]) for s in sentences)
    # Return Average Sentiment
    return round(total_sentiment/len(sentences))
# Example call on a two-sentence string; relies on the CoreNLP `client` created earlier.
corenlp_sentimentality("Good weather today! We should go to the park.")

2

# Main body of code

## Data processing

In [4]:
# Load dataset
dataset = load_dataset(['ItemID', 'Sentiment', 'SentimentText'])
# Remove unwanted columns from dataset
n_dataset = filter_columns(dataset, ['ItemID'])
#Preprocess data
dataset.text = dataset['SentimentText'].apply(preprocess_tweet_text)

## Machine Learning

In [6]:
# Tf-Idf vector
tf_vector = get_feature_vector(np.array(dataset.iloc[:, 1]).ravel())
tf_vector_test = copy.deepcopy(tf_vector)
print(type(tf_vector))
X = tf_vector.transform(np.array(dataset.iloc[:, 1]).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

# Training Naive Bayes model
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)
y_predict_nb = NB_model.predict(X_test)

# Training Logistics Regression model
LR_model = LogisticRegression(solver='lbfgs')
LR_model.fit(X_train, y_train)
y_predict_lr = LR_model.predict(X_test)


## Twitter

### Authentication

In [None]:
import getpass
import os


def _read_twitter_secret(env_name):
    """Return the credential held in environment variable ``env_name``, prompting for it when unset."""
    return os.environ.get(env_name) or getpass.getpass(env_name + ": ")


# SECURITY: credentials must never be written into the notebook. Keys that were
# ever stored in a notebook should be treated as compromised and regenerated in
# the Twitter developer portal.
consumerKey = _read_twitter_secret("TWITTER_CONSUMER_KEY")
consumerSecret = _read_twitter_secret("TWITTER_CONSUMER_SECRET")
accessToken = _read_twitter_secret("TWITTER_ACCESS_TOKEN")
accessTokenSecret = _read_twitter_secret("TWITTER_ACCESS_TOKEN_SECRET")
auth = tweepy.OAuthHandler(consumerKey, consumerSecret)
auth.set_access_token(accessToken, accessTokenSecret)
api = tweepy.API(auth)
# test authentication
try:
    verified_user = api.verify_credentials()
except Exception as auth_error:
    # Report the cause instead of swallowing it with a bare `except:`
    print("Error during authentication")
    print(repr(auth_error))
else:
    # Some tweepy releases return a falsy value for rejected credentials rather
    # than raising, so the result has to be inspected as well.
    if verified_user:
        print("Authentication OK")
    else:
        print("Error during authentication")

### Collecting Tweets

In [None]:
# Today's date as a 'YYYY-MM-DD' string.
# NOTE(review): not referenced anywhere else in the visible notebook - confirm it is still needed.
Current_Date = datetime.datetime.today().strftime('%Y-%m-%d') 

In [None]:
# Show today's date; subtracting a zero-day timedelta leaves it unchanged.
print(datetime.date.today() - datetime.timedelta(days=0))

In [3]:
# Result frames, one per view of the data:
#   df_pos_neg_over_time               - logistic regression, positive/negative share per day
#   df_pos_neg_over_time_corenlp       - CoreNLP, share per score bucket ("1".."4") per day
#   df_pos_neg_over_time_corenlp_trunc - CoreNLP, buckets merged into positive/negative per day
df_pos_neg_over_time = pd.DataFrame(columns=['date', 'sentiment', 'percentage'])
df_pos_neg_over_time_corenlp = pd.DataFrame(columns=['date', 'sentiment', 'percentage'])
df_pos_neg_over_time_corenlp_trunc = pd.DataFrame(columns=['date', 'sentiment', 'percentage'])
for i in range(8):

    days_before_today = i

    # Number of Tweets requested from the search API for each day
    count = 15

    # Date formatting: search up to `until_date_formatted`, keep tweets of the day before it
    until_date_formatted = datetime.date.today() - datetime.timedelta(days=days_before_today)
    day_before = until_date_formatted - datetime.timedelta(days=1)

    # API request
    tweets = tweepy.Cursor(api.search, q="london weather", tweet_mode="extended", until = until_date_formatted, lang = "en").items(count)

    # Extracting desired information from each tweet
    date_list = []
    tweets_list = []

    for tweet in tweets:
        tweet_day = tweet.created_at.date()
        # Filtering by date: only tweets created on `day_before` are kept
        if day_before == tweet_day:
            # The full text lives in a different place for retweets and normal tweets
            if 'retweeted_status' in tweet._json:
                full_text = tweet._json['retweeted_status']['full_text']
            else:
                full_text = tweet.full_text
            tweets_list.append(full_text)
            date_list.append(str(tweet_day))
        else:
            print("not valid date: ", tweet_day)
            print("attributes are ", tweet.full_text)

    # A day without any usable tweet would otherwise send an empty batch to the
    # model and divide by zero in the percentage calculations below.
    if not tweets_list:
        print("no tweets kept for: ", day_before)
        continue

    df_tweets_corenlp = pd.DataFrame(list(zip(date_list, tweets_list)), columns =['date', 'text'])
    df_tweets_preprocessed = pd.DataFrame(list(zip(date_list, tweets_list)), columns =['date', 'text'])

    # Creating text feature
    df_tweets_preprocessed.text = (df_tweets_preprocessed["text"]).apply(preprocess_tweet_text)
    test_feature = tf_vector_test.transform(np.array(df_tweets_preprocessed.iloc[:, 1]).ravel())

    # Using Logistic Regression model for prediction
    test_prediction_lr = LR_model.predict(test_feature)
    total_tweets = len(test_prediction_lr)
    total_positive_lr = (test_prediction_lr==1).sum()
    total_negative_lr = (test_prediction_lr==0).sum()
    positive_percentage_lr = (total_positive_lr / total_tweets) * 100
    negative_percentage_lr = (total_negative_lr / total_tweets) * 100

    # Two rows per day
    df_pos_neg_over_time.loc[2*i] = [day_before, "positive", positive_percentage_lr]
    df_pos_neg_over_time.loc[2*i+1] = [day_before, "negative", negative_percentage_lr]


    # CoreNLP: only URLs and tags are stripped, the rest of the text is left intact
    df_tweets_corenlp.text = (df_tweets_corenlp["text"]).apply(remove_url_and_tags)
    tweets_for_corenlp = df_tweets_corenlp['text'].tolist()
    total_score_1 = 0
    total_score_2 = 0
    total_score_3 = 0
    total_score_4 = 0
    for processed_tweet in tweets_for_corenlp:
        sentimentality_score = corenlp_sentimentality(processed_tweet)
        # Bucket "1" collects scores 0 and 1; buckets "2".."4" hold one score each
        if 0 <= sentimentality_score < 1.5:
            total_score_1 += 1
        elif 1.5 <= sentimentality_score < 2.5:
            total_score_2 += 1
        elif 2.5 <=sentimentality_score < 3.5:
            total_score_3 += 1
        elif 3.5 <= sentimentality_score <= 4:
            total_score_4 += 1
    score1_percentage_corenlp = (total_score_1 / total_tweets) * 100
    score2_percentage_corenlp = (total_score_2 / total_tweets) * 100
    score3_percentage_corenlp = (total_score_3 / total_tweets) * 100
    score4_percentage_corenlp = (total_score_4 / total_tweets) * 100
    # NOTE(review): bucket "2" is counted as negative here - confirm that is intended
    positive_percentage_corenlp = ((total_score_3 + total_score_4) / total_tweets) * 100
    negative_percentage_corenlp = ((total_score_1 + total_score_2) / total_tweets) * 100

    # Four rows per day in the per-score frame, two in the merged frame
    df_pos_neg_over_time_corenlp.loc[4*i] = [day_before, "1", score1_percentage_corenlp]
    df_pos_neg_over_time_corenlp.loc[4*i+1] = [day_before, "2", score2_percentage_corenlp]
    df_pos_neg_over_time_corenlp.loc[4*i+2] = [day_before, "3", score3_percentage_corenlp]
    df_pos_neg_over_time_corenlp.loc[4*i+3] = [day_before, "4", score4_percentage_corenlp]

    df_pos_neg_over_time_corenlp_trunc.loc[2*i] = [day_before, "positive", positive_percentage_corenlp]
    df_pos_neg_over_time_corenlp_trunc.loc[2*i+1] = [day_before, "negative", negative_percentage_corenlp]

# head must be called; printing the bare attribute shows the bound method instead
print(df_pos_neg_over_time_corenlp.head())



### Processing Tweets

## Tweet Statistics

In [None]:
# Accuracy of CoreNLP
# Evaluation inputs: the first 500000 tweets and their gold labels
test_text = n_dataset["SentimentText"].head(500000).tolist()
test_labels = np.array(n_dataset["Sentiment"].head(500000).tolist())
# Score every tweet with CoreNLP and collapse the score to a binary label:
# below 2.5 -> 0, otherwise -> 1
predict_labels = np.array([
    0 if corenlp_score_value < 2.5 else 1
    for corenlp_score_value in map(corenlp_sentimentality, test_text)
])

In [None]:
# Binary classification metrics for the CoreNLP predictions
corenlp_score = accuracy_score(test_labels, predict_labels)
print("Accuracy: %.3f" % corenlp_score) #LR_model.score(X_test, y_test)
corenlp_precision = precision_score(test_labels, predict_labels, average='binary')
print('Precision: %.3f' % corenlp_precision) # Appropriate when minimizing false positives is the focus.
corenlp_recall = recall_score(test_labels, predict_labels, average='binary')
print('Recall: %.3f' % corenlp_recall) # Appropriate when minimizing false negatives is the focus.
corenlp_f1 = f1_score(test_labels, predict_labels, average='binary')
print('F-Measure: %.3f' % corenlp_f1)

plt.figure(figsize=(9,9))
cm = metrics.confusion_matrix(test_labels, predict_labels)
# `cm` holds raw integer counts, for which a bare '.2' precision is not a
# meaningful annotation format. Plot each cell as its share of all evaluated
# tweets so the figure is comparable with the logistic regression heatmap.
cm_share = cm / cm.sum()
sns.heatmap(cm_share, annot=True, fmt='.2%', linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Stanford CoreNLP'
plt.title(all_sample_title, size = 15);

In [None]:
# Held-out accuracy for Naive Bayes, full metric set for logistic regression
print("Naive Bayes accuracy: %.3f" % accuracy_score(y_test, y_predict_nb))
print("\nLogistic Regression:")
lr_score = accuracy_score(y_test, y_predict_lr)
print("Accuracy: %.3f" % lr_score) #LR_model.score(X_test, y_test)
lr_precision = precision_score(y_test, y_predict_lr, average='binary')
print('Precision: %.3f' % lr_precision) # Appropriate when minimizing false positives is the focus.
lr_recall = recall_score(y_test, y_predict_lr, average='binary')
print('Recall: %.3f' % lr_recall) # Appropriate when minimizing false negatives is the focus.
lr_f1 = f1_score(y_test, y_predict_lr, average='binary')
print('F-Measure: %.3f' % lr_f1)

plt.figure(figsize=(9,9))
cm = metrics.confusion_matrix(y_test, y_predict_lr)
# The '.2%' format multiplies by 100, so it needs fractions: feeding it the raw
# counts in `cm` would label a cell of 250 samples as "25000.00%".
cm_share = cm / cm.sum()
sns.heatmap(cm_share, annot=True, fmt='.2%', linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Logistic Regression'
plt.title(all_sample_title, size = 15);

### Bar chart

In [None]:
def _index_by_date(frame):
    """Return ``frame`` indexed by its 'date' column; a frame without that column is returned as is."""
    # Lets the cell be re-run: after the first run 'date' is already the index
    return frame.set_index('date') if 'date' in frame.columns else frame


def _plot_sentiment_shares(frame_by_date, title):
    """Draw a stacked bar chart with one bar per date and one segment per sentiment label."""
    shares = frame_by_date.set_index('sentiment', append=True)['percentage'].unstack()
    # Rows were appended one at a time, so make sure the values are numeric for plotting
    return shares.astype(float).plot.bar(stacked=True, figsize=(10,8), title=title)


# Logistic regression: positive/negative share per day
df_pos_neg_over_time = _index_by_date(df_pos_neg_over_time)
_plot_sentiment_shares(df_pos_neg_over_time, "Overall Sentiment as Determined by a Logistic Regression Model")

# CoreNLP: share of each score bucket per day
df_pos_neg_over_time_corenlp = _index_by_date(df_pos_neg_over_time_corenlp)
_plot_sentiment_shares(df_pos_neg_over_time_corenlp, "Overall Sentiment as Determined by the CoreNLP Library")

# CoreNLP: buckets merged into positive/negative per day
df_pos_neg_over_time_corenlp_trunc = _index_by_date(df_pos_neg_over_time_corenlp_trunc)
_plot_sentiment_shares(df_pos_neg_over_time_corenlp_trunc, "Overall Sentiment as Determined by the CoreNLP Library (Positive vs. Negative)")
