In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

import warnings
warnings.filterwarnings('ignore')

In [None]:
# word_tokenize needs the 'punkt' tokenizer models, and process() reads the
# English stopword list, so both corpora must be present on a fresh machine.
# NOTE(review): recent NLTK releases may load tokenizer data from 'punkt_tab'
# instead of 'punkt' - confirm against the installed NLTK version.
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Read the "sequence" column of each per-label CSV straight into a Python list.
yes = pd.read_csv("yes.csv")["sequence"].to_list()
no = pd.read_csv("no.csv")["sequence"].to_list()
neither = pd.read_csv("neither.csv")["sequence"].to_list()

In [None]:
# Order-preserving split (no shuffling): the first 85% of every class goes to
# training and the remainder is held out for testing.
TRAIN_FRACTION = 0.85

yes_cut = int(TRAIN_FRACTION * len(yes))
no_cut = int(TRAIN_FRACTION * len(no))
neither_cut = int(TRAIN_FRACTION * len(neither))

train_yes, test_yes = yes[:yes_cut], yes[yes_cut:]
train_no, test_no = no[:no_cut], no[no_cut:]
train_neither, test_neither = neither[:neither_cut], neither[neither_cut:]

# Concatenation order is yes, no, neither; any label vector built for these
# lists has to follow the same order.
train_x = train_yes + train_no + train_neither
test_x = test_yes + test_no + test_neither

In [None]:
# Label encoding: yes -> 1, neither -> 0.5, no -> 0.
# One constant vector per class and per split, sized from the sequence lists.
t_y = np.repeat(1, len(train_yes))
test_yes_labels = np.repeat(1, len(test_yes))

t_n = np.repeat(0.5, len(train_neither))
test_n = np.repeat(0.5, len(test_neither))

t_no = np.repeat(0, len(train_no))
test_no_ = np.repeat(0, len(test_no))

# Stack in yes, no, neither order so labels line up with train_x / test_x.
train_y = np.concatenate([t_y, t_no, t_n])
test_y = np.concatenate([test_yes_labels, test_no_, test_n])

In [None]:
# Reshape the label vectors into (m, 1) columns, the shape gradientDescent
# documents for y. reshape(-1, 1) leaves an (m, 1) array unchanged, so unlike
# indexing with [..., None] this cell is safe to run any number of times.
train_y = np.reshape(train_y, (-1, 1))
test_y = np.reshape(test_y, (-1, 1))

In [None]:
def process(sequence):
    """Tokenize a raw text sequence and return its cleaned, stemmed tokens.

    Input:
        sequence: a string of raw text
    Output:
        sequences_clean: list of stemmed tokens, with English stopwords and
        punctuation-only tokens removed
    """
    stemmer = PorterStemmer()
    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    stopwords_english = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    sequences_clean = []
    for word in word_tokenize(sequence):
        # Compare case-insensitively: the stopword list is lowercase, so a
        # capitalised "The" or "No" would otherwise slip through the filter.
        if word.lower() in stopwords_english:
            continue
        # Drop tokens made only of punctuation characters. A plain
        # `word in string.punctuation` is a substring test and misses
        # multi-character tokens such as "``", "''" or "...".
        if all(ch in punctuation_chars for ch in word):
            continue
        sequences_clean.append(stemmer.stem(word))  # stemming word

    return sequences_clean

In [None]:
def build_freqs(sequences, ys):
    """Count how often each processed word occurs under each label.

    Input:
        sequences: an iterable of raw text sequences
        ys: the label of each sequence (0, 0.5 or 1), given as a list, a flat
            array or an m x 1 array
    Output:
        freqs: a dictionary mapping each (word, label) pair to its frequency
    """
    # Flatten the labels to a plain list. reshape(-1) is used instead of
    # np.squeeze because squeeze turns a single label into a 0-d array, whose
    # tolist() is a bare scalar that zip cannot iterate over.
    yslist = np.asarray(ys).reshape(-1).tolist()

    # Walk every processed word of every sequence and bump its counter.
    freqs = {}
    for y, seq in zip(yslist, sequences):
        for word in process(seq):
            pair = (word, y)
            freqs[pair] = freqs.get(pair, 0) + 1

    return freqs

In [None]:
# Build the (word, label) -> count table from the training split.
freqs = build_freqs(train_x, train_y)

# Sanity check: report the container type and the number of distinct pairs.
print(f"type(freqs) = {type(freqs)}")
print(f"len(freqs) = {len(freqs)}")

In [None]:
# Show the first training sequence next to its processed (tokenized, filtered, stemmed) form.
sample_sequence = train_x[0]
print('This is an example of a positive tweet: \n', sample_sequence)
print('\nThis is an example of the processed version of the tweet: \n', process(sample_sequence))

In [None]:
def sigmoid(z):
    """Logistic function 1 / (1 + e^(-z)).

    Input:
        z: a scalar or numpy array
    Output:
        the sigmoid of z, computed element-wise for arrays
    """
    return 1 / (np.exp(-z) + 1)

In [None]:
def gradientDescent(x, y, theta, alpha, num_iters):
    '''
    Fit logistic-regression weights with batch gradient descent.

    Input:
        x: matrix of features which is (m,n+1)
        y: corresponding labels of the input matrix x; (m,1), or flat (m,),
           which is reshaped to line up with theta
        theta: initial weight vector of dimension (n+1,1) (a flat (n+1,)
               vector is also accepted)
        alpha: learning rate
        num_iters: number of iterations you want to train your model for
    Output:
        J: the cost evaluated just before the last weight update (or at the
           initial theta when num_iters <= 0), as a Python float
        theta: your final weight vector, same shape as the theta passed in
    Raises:
        ValueError: if y does not hold exactly one label per row of x
    '''
    x = np.asarray(x, dtype=float)
    theta = np.asarray(theta, dtype=float)
    # get 'm', the number of rows in matrix x
    m = len(x)

    # Give y the same trailing shape as x.dot(theta). Without this, flat (m,)
    # labels broadcast against an (m,1) prediction into an (m,m) matrix and
    # silently corrupt both the cost and the weights.
    y = np.asarray(y, dtype=float).reshape((m,) + theta.shape[1:])

    def cost(h):
        # Cross-entropy cost; squeeze first so float() receives a 0-d value
        # rather than a (1,1) array (that implicit conversion is deprecated).
        log_likelihood = np.dot(np.transpose(y), np.log(h)) + np.dot(np.transpose(1 - y), np.log(1 - h))
        return float(np.squeeze(-(1 / m) * log_likelihood))

    J = None
    for _ in range(num_iters):
        # get the sigmoid of z, the dot product of x and theta
        h = sigmoid(np.dot(x, theta))
        # calculate the cost function (before this iteration's update)
        J = cost(h)
        # update the weights theta
        theta = theta - (alpha / m) * np.dot(np.transpose(x), h - y)

    if J is None:
        # No iteration ran: report the cost of the untouched initial weights
        # instead of failing on an unbound J.
        J = cost(sigmoid(np.dot(x, theta)))
    return J, theta

In [None]:
np.random.seed(1)
# Synthetic 10 x 3 design matrix: a bias column of ones followed by two
# random feature columns scaled by 2000.
tmp_X = np.hstack((np.ones((10, 1)), 2000 * np.random.rand(10, 2)))
# Random binary labels as a 10 x 1 column (1.0 where the draw exceeds 0.35).
tmp_Y = (np.random.rand(10, 1) > 0.35).astype(float)

# Run gradient descent from all-zero weights and report the outcome.
tmp_J, tmp_theta = gradientDescent(tmp_X, tmp_Y, np.zeros((3, 1)), 1e-8, 700)
tmp_weights = [round(w, 8) for w in np.squeeze(tmp_theta)]
print(f"The cost after training is {tmp_J:.8f}.")
print(f"The resulting vector of weights is {tmp_weights}")

In [None]:
def extract_features(sequence, freqs):
    '''
    Turn one raw sequence into a [bias, positive count, negative count] row.

    Input:
        sequence: a raw text string; it is passed through process() here
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
    Output:
        x: a feature vector of dimension (1,3)
    '''
    positive_total = 0.0
    negative_total = 0.0

    for token in process(sequence):
        # Pairs missing from freqs contribute nothing to either total.
        positive_total += freqs.get((token, 1), 0)
        negative_total += freqs.get((token, 0), 0)

    # Column 0 is the bias term, fixed at 1.
    features = np.array([[1.0, positive_total, negative_total]])
    assert features.shape == (1, 3)
    return features

In [None]:
# Feature vector for a string of made-up words. Any word with no
# (word, 1) / (word, 0) entry in freqs adds nothing, so presumably only the
# bias column ends up non-zero here.
tmp2 = extract_features('blorb bleeeeb bloooob', freqs)
print(tmp2)

In [None]:
# One (bias, positive count, negative count) feature row per training sequence.
X = np.zeros((len(train_x), 3))
for row, seq in enumerate(train_x):
    X[row, :] = extract_features(seq, freqs)

# Training labels as an (m, 1) column. Flat (m,) labels would broadcast
# against the (m, 1) predictions inside gradientDescent and break training.
Y = np.asarray(train_y, dtype=float).reshape(-1, 1)

# Apply gradient descent
J, theta = gradientDescent(X, Y, np.zeros((3, 1)), 1e-9, 1500)
print(f"The cost after training is {J:.8f}.")
print(f"The resulting vector of weights is {[round(t, 8) for t in np.squeeze(theta)]}")

In [None]:
def predict(sequence, freqs, theta):
    '''
    Score one raw sequence with the logistic-regression weights.

    Input:
        sequence: a string
        freqs: a dictionary corresponding to the frequencies of each tuple (word, label)
        theta: (3,1) vector of weights
    Output:
        y_pred: sigmoid of the feature row times theta, i.e. the model's
        score for the sequence
    '''
    features = extract_features(sequence, freqs)
    logit = np.dot(features, theta)
    return sigmoid(logit)

In [None]:
# Score two hand-picked example sequences with the trained weights.
example_sequences = [
    'PHQ-2 Score: 0 Cognition Negative: no evidence of cognitive decline noted by patient or family; no memory problems causing dysfunction in daily activities Falls risk Time to rise from, walk 10 feet,',
    'depression, but certainly does not appear depressed on exam - Dementia: MMSE on 5/21/16 23/30 c/w Mild cognitive impairment, which is NOT c/w profound weight loss - Gastroparesis: Hx of diabetes',
]
for seq in example_sequences:
    # predict returns an array, not a scalar. Pull the single value out
    # explicitly, because letting '%f' convert an array with ndim > 0 to
    # float is deprecated in recent NumPy releases.
    score = float(np.squeeze(predict(seq, freqs, theta)))
    print('%s -> %f' % (seq, score))

In [None]:
def test_logistic_regression(test_x, test_y, freqs, theta):
    """
    Input: 
        test_x: a list of tweets
        test_y: (m, 1) vector with the corresponding labels for the list of tweets
        freqs: a dictionary with the frequency of each pair (or tuple)
        theta: weight vector of dimension (3, 1)
    Output: 
        accuracy: (# of tweets classified correctly) / (total # of tweets)
    """
        
    # the list for storing predictions
    y_hat = []
    
    for seq in test_x:
        # get the label prediction for the tweet
        y_pred = predict(seq, freqs, theta)
        
        if y_pred > 0.52:
            y_hat.append(1)
        elif y_pred < 0.52 and y_pred >= 0.5:
            y_hat.append(0.5)
        else:
            y_pred.append(0)
    
    y_hat = np.asarray(y_hat)
    test_y = np.squeeze(test_y)
    # With the above implementation, y_hat is a list, but test_y is (m,1) array
    # convert both to one-dimensional arrays in order to compare them using the '==' operator
    count = 0
    for i in range(len(test_y)):
        if (test_y[i] == y_hat[i]):
            count = count+ 1
        else:
            count

    accuracy = count/(len(test_y))
    return accuracy

In [None]:
# Evaluate the trained weights on the held-out test split and print the
# accuracy to four decimal places.
tmp_accuracy = test_logistic_regression(test_x, test_y, freqs, theta)
print(f"Logistic regression model's accuracy = {tmp_accuracy:.4f}")