In [None]:
import pandas as pd
from nltk.corpus import stopwords
import spacy
import string
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy as sp
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.naive_bayes import BernoulliNB
from tqdm import tqdm
# spaCy is used to remove punctuation, numbers, spaces, symbols, organizations
# conda install -c conda-forge spacy
# pip install spacy && python -m spacy download en_core_web_sm

#import nltk
#nltk.download('stopwords')
# Load the spaCy "en_core_web_sm" pipeline once at module level; tokenize()
# reads this shared `nlp` object instead of loading the model per call.
nlp = spacy.load("en_core_web_sm")

#### Read in data

In [None]:
# Reading in data, setting encoding and column names
def read_sentiment140(path):
    """Load a Sentiment140-style CSV as a two-column frame.

    The file is read without a header row, using ISO-8859-1 encoding and the
    column names ``target, ids, date, flag, user, text``. Only ``target`` and
    ``text`` are loaded.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        Columns ``target`` and ``text``. Every ``4`` in ``target`` is replaced
        by ``1``; all other values are left unchanged.
    """
    columns = ["target", "ids", "date", "flag", "user", "text"]
    # usecols avoids materialising the four unused columns and, unlike
    # slicing afterwards, returns a frame that owns its data, so the
    # assignment below cannot trigger SettingWithCopyWarning.
    df = pd.read_csv(path, encoding="ISO-8859-1", names=columns,
                     usecols=["target", "text"])
    # Replacing 4 with 1 in target/sentiment column
    df["target"] = df["target"].replace(4, 1)
    return df[["target", "text"]]

# Reading in data, setting encoding and column names
def read_stocks(path):
    """Load the stock-tweet CSV and expose the tweet body as ``text``.

    The first row of the file is used as the header. Rows the python parser
    cannot handle (e.g. too many fields) are skipped silently.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        The parsed frame with a ``body`` column, if present, renamed to
        ``text``. All other columns are returned as read.
    """
    read_kwargs = {"header": 0, "engine": "python"}
    try:
        # pandas >= 1.3: `on_bad_lines` supersedes error_bad_lines /
        # warn_bad_lines, which were removed entirely in pandas 2.0.
        df = pd.read_csv(path, on_bad_lines="skip", **read_kwargs)
    except TypeError:
        # Older pandas does not know `on_bad_lines`; use the legacy flags.
        df = pd.read_csv(path, error_bad_lines=False, warn_bad_lines=False,
                         **read_kwargs)
    # Return a renamed copy rather than mutating in place.
    return df.rename(columns={"body": "text"})

#### Data cleaning functions

In [None]:
# Remove urls from tweets like http:// or https://
def remove_urls(text, pattern):
    """Delete every match of ``pattern`` from ``text``.

    Parameters
    ----------
    text : str
        Input string.
    pattern : str or compiled regular expression
        Pattern describing what to remove (e.g. ``https?://[^\\s]+``).

    Returns
    -------
    str
        ``text`` with all non-overlapping matches removed.
    """
    # re.sub removes each match at its own position. The previous
    # findall + str.replace approach left fragments behind when one URL was a
    # prefix of another ("http://a" vs "http://ab") and misbehaved when the
    # pattern contained capture groups (findall then returns the groups).
    return re.sub(pattern, '', text)

def keep_alnum(text):
    '''Drop every character that is not alphanumeric, whitespace or a single quote.

    Each surviving character is lower-cased individually and the pieces are
    joined back into one string.'''
    kept = []
    for ch in text:
        if ch == "'" or ch.isspace() or ch.isalnum():
            kept.append(ch.lower())
    return ''.join(kept)

# Remove '@' and '#' from the beginning of the words
def remove_hashtags(text):
    """Strip one leading '@' or '#' from each whitespace-separated word.

    Words are re-joined with single spaces, so runs of whitespace in the
    input collapse to one space.
    """
    words = []
    for word in text.split():
        if word.startswith(('@', '#')):
            word = word[1:]
        words.append(word)
    return ' '.join(words)

# Count the number of negating words like not, none, don't ...
def count_negation(text, negating_w):
    '''Flag whether ``text`` contains a negating word.

    The text is normalised with ``keep_alnum`` and split on whitespace.
    Returns 1 if any resulting word is in ``negating_w``, otherwise 0.'''
    for word in keep_alnum(text).split():
        if word in negating_w:
            return 1
    return 0

def count_pos(text, pos):
    """Count the words of ``text`` that appear in ``pos``.

    The text is normalised with ``keep_alnum`` and split on whitespace first;
    repeated words are counted each time they occur.
    """
    hits = 0
    for word in keep_alnum(text).split():
        if word in pos:
            hits += 1
    return hits

def count_neg(text, neg):
    """Count the words of ``text`` that appear in ``neg``.

    The text is normalised with ``keep_alnum`` and split on whitespace first;
    repeated words are counted each time they occur.
    """
    words = keep_alnum(text).split()
    return len([word for word in words if word in neg])

def tokenize(text, pos_list, stop_words, symbols):
    """Lemmatise ``text`` with the module-level ``nlp`` pipeline and filter it.

    A token is dropped when its part-of-speech tag is in ``pos_list`` or its
    lemma is ``"-PRON-"``. The remaining lemmas are lower-cased and dropped
    when they are in ``stop_words``, occur in ``string.punctuation`` or are in
    ``symbols``. Survivors are stripped of surrounding whitespace and joined
    with single spaces.

    Returns the filtered lemmas as one space-separated string.
    """
    kept = []
    for tok in nlp(text):
        if tok.pos_ in pos_list:
            continue
        lemma = tok.lemma_
        if lemma == "-PRON-":
            continue
        lowered = lemma.lower()
        if lowered in stop_words:
            continue
        # string.punctuation is a str, so this is a substring test, not a
        # per-character membership test.
        if lowered in string.punctuation:
            continue
        if lowered in symbols:
            continue
        kept.append(lowered.strip())
    return ' '.join(kept)

In [None]:
def clean_df(df_orig):
    """Clean the ``text`` column and add lexicon-based features.

    For every row the raw text is used to compute three features, and the
    text itself is replaced by a cleaned version:

    * ``NEG``       - 1 if the text contains a negating word, else 0
    * ``count_neg`` - number of words found in the negative opinion lexicon
    * ``count_pos`` - number of words found in the positive opinion lexicon
    * ``text``      - URLs removed, leading '@'/'#' stripped, then tokenised

    The opinion lexicons are read from ``opinion_lexicon/positive-words.txt``
    and ``opinion_lexicon/negative-words.txt`` relative to the working
    directory.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Frame with a ``text`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        All original columns except ``text``, followed by ``NEG``,
        ``count_neg``, ``count_pos`` and the cleaned ``text``. Rows that fail
        to process are reported on stdout and get zeros and an empty string.
    """
    df = df_orig.copy()
    # Sets give O(1) membership tests; these collections are probed once per
    # word of every tweet.
    negating_w = {'no', 'not', 'none', 'nobody', 'nothing', 'neither', 'nor', 'nowhere', 'never', 'hardly',
             'scarcely', 'barely', "doesn't","doesn'", "don't","don'", "isn't","isn'", "wasn't", "wasn'",
             "shouldn't", "shouldn'", "wouldn't", "wouldn'", "couldn't", "couldn'", "won'", "hadn'",
             "won't", "can't", "cannot", "hadn't", "hasn't", "aren't", "didn't", "haven't", "mightn't",
             "needn't", "shan't", "hasn'", "didn'", "haven'", "mightn'", "needn'", "shan'", "aren'"}
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    pattern = re.compile(r'https?://[^\s]+')
    pos = set(pd.read_csv('opinion_lexicon/positive-words.txt', header=None)[0].tolist())
    neg = set(pd.read_csv('opinion_lexicon/negative-words.txt', header=None)[0].tolist())
    # https://universaldependencies.org/u/pos/
    pos_list = ['PART','PUNCT', 'NUM', 'CARDINAL NUM', 'SPACE', 'SYM', 'X']
    # Keep direction/negation words that carry sentiment in stock tweets.
    stop_words = (set(stopwords.words('english'))
                  - set(['more', 'up', 'down', 'most', 'over', 'above', 'under', 'no', 'not']))
    symbols = ['aapl', 'apple', 'googl', 'google', 'amzn', 'amazon', 'msft', 'microsoft', 'tesla', 'tsla']

    def clean(text):
        try:
            return {
                'NEG': count_negation(text, negating_w),
                'count_neg': count_neg(text, neg),
                'count_pos': count_pos(text, pos),
                'text': tokenize(remove_hashtags(remove_urls(text, pattern)), pos_list, stop_words, symbols)
            }
        except Exception:
            # `except Exception` lets KeyboardInterrupt/SystemExit through so
            # a long run can still be stopped. str() keeps the message from
            # raising when `text` is not a string (e.g. NaN from the CSV).
            print('error processing: ' + str(text))
            return {
                'NEG': 0,
                'count_neg': 0,
                'count_pos': 0,
                'text': ''
            }

    # progress bar
    tqdm.pandas()

    # Named `cleaned` so the local does not shadow this function's own name.
    cleaned = df.progress_apply(lambda row: clean(row.text), axis='columns', result_type='expand')
    return pd.concat([df.drop(columns='text'), cleaned], axis='columns')


#### TF-IDF Vectorizer

In [None]:
# Create TF-IDF Vectorizer
def create_feature_set(df_train, df_test, max_feat = 10000, test_size=0.1, random_state=6242):
    """Build TF-IDF (plus optional lexicon) feature matrices.

    A uni-/bi-gram ``TfidfVectorizer`` (``max_features=max_feat``,
    ``min_df=3``) is fitted on ``df_train.text`` only and then applied to both
    frames. When ``df_train`` has the columns ``NEG``, ``count_neg`` and
    ``count_pos``, those columns from both frames are appended to the TF-IDF
    matrices as extra sparse columns.

    ``test_size`` and ``random_state`` are accepted but not used by this
    function.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, vectoriser)`` where ``y_train`` is
        ``df_train.target`` cast to int.
    """
    lexicon_cols = ['NEG', 'count_neg', 'count_pos']

    # Fit on the training text only, then project both sets.
    vectoriser = TfidfVectorizer(ngram_range=(1,2), max_features = max_feat, min_df = 3)
    vectoriser.fit(df_train.text)
    x_train = vectoriser.transform(df_train.text)
    x_test = vectoriser.transform(df_test.text)

    # Append the lexicon features when the training frame carries them.
    if set(lexicon_cols) <= set(df_train.columns):
        train_lex = sp.sparse.csr_matrix(df_train[lexicon_cols])
        test_lex = sp.sparse.csr_matrix(df_test[lexicon_cols])
        x_train = sp.sparse.hstack([x_train, train_lex])
        x_test = sp.sparse.hstack([x_test, test_lex])

    y_train = df_train.target.astype(int)
    return x_train, x_test, y_train, vectoriser

In [None]:
# def plot_cfm(y_test, y_pred, ax, title):
#     cfm = confusion_matrix(y_test, y_pred)
#     categories  = ['Negative','Positive']
#     group_names = ['True Neg','False Pos', 'False Neg','True Pos']
#     group_percentages = ['{0:.2%}'.format(value) for value in cfm.flatten() / np.sum(cfm)]

#     labels = [f'{v1}\n{v2}' for v1, v2 in zip(group_names,group_percentages)]
#     labels = np.asarray(labels).reshape(2,2)
    
# #     fig, (ax1, ax2) = plt.subplots(1,2, figsize = (10,5), dpi=160)
#     acc = accuracy_score(y_test, y_pred)
    
#     sns.heatmap(cfm, annot = labels, cmap = 'Blues', fmt = '', ax = ax,
#                 xticklabels = categories, yticklabels = categories, cbar = False)
#     ax.set_xlabel("Predicted values", labelpad = 10)
#     ax.set_ylabel("Actual values"   , labelpad = 10)
#     ax.set_title (f"Confusion Matrix - {title}\naccuracy: {acc:.2%}", pad = 20, fontsize=8)

#### Train Models

In [None]:
def train_model(clf, xtrain, ytrain, xtest):
    """Fit ``clf`` on the training data and return its predictions for ``xtest``.

    ``clf`` is fitted in place via ``clf.fit(xtrain, ytrain)``; the value
    returned is whatever ``clf.predict(xtest)`` produces.
    """
    clf.fit(xtrain, ytrain)
    return clf.predict(xtest)

In [None]:
def get_sentiment(sample = False, sample_size = 1000, save = False,
                  train_path = "data/sentiment140.csv",
                  test_path = "data/final_tweets.csv",
                  out_path = "test_sentiment.csv",
                  random_state = None):
    """Train on Sentiment140 and predict sentiment for the stock tweets.

    Steps: read both CSVs, optionally down-sample them, clean both with
    ``clean_df``, build features with ``create_feature_set``, fit a
    ``BernoulliNB(alpha=0.1)`` and predict a label for every stock tweet.

    Parameters
    ----------
    sample : bool
        When true, work on a random subset of each dataset.
    sample_size : int
        Number of rows to draw from each dataset when ``sample`` is true.
        Capped at the size of each dataset.
    save : bool
        When true, write the result to ``out_path`` as CSV.
    train_path, test_path : str
        Locations of the training CSV and the stock-tweet CSV.
    out_path : str
        Destination used when ``save`` is true.
    random_state : int or None
        Seed forwarded to ``DataFrame.sample`` for reproducible sub-sampling;
        ``None`` keeps the draw unseeded.

    Returns
    -------
    pandas.DataFrame
        The (possibly sampled) stock-tweet frame with an added ``sentiment``
        column holding the predicted labels.
    """
    # Read in data
    print("Reading in data")
    train_df = read_sentiment140(train_path)
    test_df = read_stocks(test_path)

    if sample:
        # Get sample set. Clamp n so that a dataset smaller than sample_size
        # does not make DataFrame.sample raise ValueError.
        train_df = train_df.sample(n = min(sample_size, len(train_df)),
                                   random_state = random_state)
        test_df = test_df.sample(n = min(sample_size, len(test_df)),
                                 random_state = random_state)

    print("Performing data cleaning")
    # Performing data cleaning
    clean_train = clean_df(train_df)
    clean_test = clean_df(test_df)

    print("TF-IDF Vectorizer Training")
    # vectorization and train-test split
    x_train, x_test, y_train, v = create_feature_set(clean_train, clean_test)
    bayes = BernoulliNB(alpha = 0.1)

    print("Model Fit and Prediction")
    y_pred = train_model(bayes, x_train, y_train, x_test)
    # Predictions are in the row order of clean_test, which matches test_df.
    test_df = test_df.assign(sentiment = y_pred)
    if save:
        test_df.to_csv(out_path)
    return test_df

In [None]:
# Entry point: run the pipeline on the full datasets (sampling is off by
# default) and save the predictions to CSV.
if __name__ == "__main__":
    get_sentiment(save = True)