In [4]:
import numpy as np
import nltk
from nltk import word_tokenize 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
# English stop-word list from NLTK.
# NOTE(review): no visible cell reads `stopword`; remove_stop_words builds its own list — confirm before relying on it.
stopword = stopwords.words("english")
# Cross-validation split ratios. For each ratio r, r*100 percent of the training file is used as training data and the remaining part is used as validation data.
k = [0.75,0.80, 0.9]

# Set process_type to `1` to pre-process the text, or to `2` to leave the text unprocessed.
# NOTE(review): none of the visible calls to precess_sentence pass this value, so it currently appears to have no effect — confirm.
process_type =1

# Corpora Sanitization


In [5]:
# Split the text into two lists based on positive and negative sentiment

def split_text(path="PS1.1A_training_data.txt"):
    '''
    Read a tab-separated corpus file and split its sentences by sentiment label.

    Input:
        path: file to read; defaults to the training file used by the notebook.
    Output:
        positiveText, negativeText: lists of lower-cased sentences (second column)
        whose third column is exactly "POSITIVE" / "NEGATIVE". Rows carrying any
        other label are ignored. The number of kept sentences is printed.
    '''
    positiveText = []
    negativeText = []
    # `with` guarantees the file handle is closed, even if parsing fails part-way.
    with open(path, "r") as handle:
        for line in handle:
            fields = line.split("\t")
            # Blank or truncated rows have no label column; skip them instead of
            # failing with IndexError.
            if len(fields) < 3:
                continue
            if fields[2]=="POSITIVE":
                positiveText.append(fields[1].lower())
            elif fields[2]=="NEGATIVE":
                negativeText.append(fields[1].lower())
    print(len(positiveText) + len(negativeText))

    return positiveText, negativeText
# Run the loader once as a sanity check; the returned lists are echoed as the cell output.
split_text()


2359


(['this is definitely a must have if your state does not allow cell phone usage while driving.',
  "it's a great place and i highly recommend it.",
  'their steaks are 100% recommended!',
  "before leaving mainland china, i visited one additional historic site, taking an overnight train from beijing to xi'an to see the terra-cotta warriors.",
  'this is simply the best bluetooth headset for sound quality!',
  'love this product.',
  'i remember years ago having a hairdresser who was very talented, very temperamental and wickedly funny.',
  'the entire audience applauded at the conclusion of the film.  ',
  'and i really did find them funny.  ',
  "i'm using it with an iriver spinn (with case) and it fits fine.",
  "over the past 20 years i've made a lot of money, if you look at the cheques.",
  'it was also the right balance of war and love.  ',
  'also makes it easier to hold on to.',
  'i use this product in a motor control center where there is a lot of high voltage humming from the

# Train Test(Validation) Split

In [6]:
def train_validation_split(k_percent, segment=1):
    '''
    Split the training corpus into a training part and a validation part.

    Input:
        k_percent: fraction of each class kept as training data; the remaining
            (1 - k_percent) fraction forms the validation window.
        segment: 1-based position of the validation window. 1 takes the window from
            the start of each class list; larger values shift it to the right.
    Output:
        train_x, train_y, test_x, test_y: sentences and 1/0 labels for the training
        and validation parts. Positive sentences come first in both parts.
    '''
    positiveText, negativeText = split_text()
    holdout = 1-k_percent

    def carve(corpus):
        # Cut the validation window out of one class; everything around it is training data.
        size = len(corpus)
        start = int(holdout*(segment-1)*size)
        stop = int(holdout*segment*size)
        return corpus[start:stop], corpus[0:start] + corpus[stop:]

    test_neg, train_neg = carve(negativeText)
    test_pos, train_pos = carve(positiveText)

    print(len(train_pos))

    print(len(train_neg))

    # Labels are built from the actual list lengths, so no class size is assumed.
    train_y = np.append(np.ones(len(train_pos)), np.zeros(len(train_neg)))
    test_y = np.append(np.ones(len(test_pos)), np.zeros(len(test_neg)))

    return train_pos + train_neg, train_y, test_pos + test_neg, test_y


# Text Preprocessing

In [7]:

def convert_lower_case(data):
    '''
    Input:
        data: a string or array of strings
    Output:
        the lower-cased text as returned by np.char.lower (a numpy array)
    '''
    lowered = np.char.lower(data)
    return lowered

def remove_stop_words(data):
    '''
    Input:
        data: text (converted with str() before tokenizing)
    Output:
        a string of the kept tokens, each preceded by a single space. Tokens that are
        English stop words or only one character long are dropped.
    '''
    blocked = set(stopwords.words('english'))
    kept = [token for token in word_tokenize(str(data))
            if len(token) > 1 and token not in blocked]
    return "".join(" " + token for token in kept)

def remove_punctuation(data):
    '''
    Input:
        data: a string or array of strings
    Output:
        the text with every listed symbol (and newline) replaced by a space, runs of
        two spaces collapsed after each replacement, and commas removed outright.
        The result is whatever np.char.replace returns (a numpy array).
    '''
    # The backslash is written as `\\` so the literal holds no invalid escape sequence;
    # the resulting characters are the same as before (a backslash followed by `]`).
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~\n"
    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")
    # Commas are deleted rather than replaced by a space.
    data = np.char.replace(data, ',', '')
    return data

def remove_apostrophe(data):
    '''
    Input:
        data: a string or array of strings
    Output:
        the text with every apostrophe removed, as returned by np.char.replace
    '''
    apostrophe = "'"
    return np.char.replace(data, apostrophe, "")

def stemming(data):
    '''
    Input:
        data: text (converted with str() before tokenizing)
    Output:
        a string of the Porter-stemmed tokens, each preceded by a single space
    '''
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)

def lemmatizing(data):
    '''
    Input:
        data: text (converted with str() before tokenizing)
    Output:
        a string of the WordNet-lemmatized tokens, each preceded by a single space
    '''
    wordnet = WordNetLemmatizer()
    lemmas = [wordnet.lemmatize(token) for token in word_tokenize(str(data))]
    return "".join(" " + lemma for lemma in lemmas)
def convert_numbers(data):
    '''
    Input:
        data: text (converted with str() before tokenizing)
    Output:
        the tokens joined with a leading space each, where tokens that parse as
        integers are passed through num2words, and hyphens in the result are
        replaced by spaces (returned by np.char.replace as a numpy array).
    '''
    converted = []
    for token in word_tokenize(str(data)):
        try:
            token = num2words(int(token))
        except Exception:
            # Best effort: a non-numeric token raises ValueError and is kept as-is.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
            # NOTE(review): `num2words` is not imported in the notebook's visible import
            # cell; if it is not defined elsewhere, the NameError is caught here too and
            # no number is ever converted — confirm and add the import.
            pass
        converted.append(token)
    new_text = "".join(" " + token for token in converted)
    # num2words output such as "twenty-one" contains hyphens; split them into words.
    return np.char.replace(new_text, "-", " ")


def preprocess(data):
    '''
    Input:
        data: raw text
    Output:
        the text after running every cleaning step below, in order
    '''
    pipeline = (
        convert_lower_case,
        remove_punctuation,   # commas are removed by a separate replace inside this step
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        lemmatizing,
        remove_punctuation,
        convert_numbers,
        remove_punctuation,
        remove_stop_words,    # repeated: number words can contain stop words (101 -> one hundred and one)
    )
    for step in pipeline:
        data = step(data)
    return data

def precess_sentence(sentence, process=2):
    '''
    Input:
        sentence: a sentence
        process: if 1, the sentence is cleaned with preprocess() and then tokenized;
            for any other value the sentence is only split on whitespace
    Output:
        words: list of words that appear in the sentence
    '''
    if process != 1:
        # No cleaning: punctuation stays attached to the surrounding word.
        return list(sentence.split())

    clean_sentence = preprocess(sentence)
    return list(word_tokenize(clean_sentence))


In [8]:
# With process=2 the sentence is only split on whitespace, so punctuation stays attached to the words.
precess_sentence("hey, guyz let's go there!!!",2)

['hey,', 'guyz', "let's", 'go', 'there!!!']

# Develop Dictionary 

In [9]:
def count_words(result, texts, ys):
    '''
    Count how often each word occurs under each sentiment label.

    Input:
        result: dictionary to update; maps (word, label) pairs to frequencies
        texts: a list of sentences
        ys: the sentiment of each sentence (either 0 or 1), aligned with texts
    Output:
        result: the same dictionary, updated in place and returned
    '''
    for label, sentence in zip(ys, texts):
        # precess_sentence is called with its default mode (plain whitespace split).
        for token in precess_sentence(sentence):
            key = (token, label)
            result[key] = result.get(key, 0) + 1

    return result

In [10]:
# Toy example: five short sentences with 0/1 labels, showing the (word, label) -> count mapping.
result = {}
dummy_sentneces = ['i am, happy', 'i am tricked', 'i am sad', 'i am tired', 'i am tired']
ys = [1, 0, 1, 0, 0]
count_words(result, dummy_sentneces, ys)

{('i', 1): 2,
 ('am,', 1): 1,
 ('happy', 1): 1,
 ('i', 0): 3,
 ('am', 0): 3,
 ('tricked', 0): 1,
 ('am', 1): 1,
 ('sad', 1): 1,
 ('tired', 0): 2}

In [11]:
def get_frequency(X, Y):
    '''
    Input:
        X: list of sentences
        Y: sentiment label of each sentence
    Output:
        freqs: dictionary mapping each (word, label) pair to how often the word
        appears in sentences with that label
    '''
    # Always start from an empty dictionary so counts never leak between calls.
    empty_counts = {}
    return count_words(empty_counts, X, Y)
# freqs 


# Train Naive Bayes

In [12]:
def train_naive_bayes(train_x, train_y):
    '''
    Input:
        train_x: list of sentences
        train_y: a list of labels (values > 0 count as positive, the rest as negative)
    Output:
        logprior: log(#positive sentences) - log(#negative sentences)
        loglikelihood: dictionary mapping each vocabulary word to
            log(P(word | positive) / P(word | negative)) with add-1 smoothing
    '''
    freqs = get_frequency(train_x, train_y)

    # V: number of unique words in the vocabulary
    vocab = set([word for word, _ in freqs.keys()])
    V = len(vocab)

    # N_pos / N_neg: total word occurrences in positive / negative sentences
    N_pos = N_neg = 0
    for (_, label), count in freqs.items():
        if label > 0:
            N_pos += count
        else:
            N_neg += count

    # Number of positive / negative training sentences
    D_pos = sum(1 for label in train_y if label > 0)
    D_neg = sum(1 for label in train_y if label <= 0)
    logprior = np.log(D_pos) - np.log(D_neg)

    loglikelihood = {}
    for word in vocab:
        # add-1 (Laplace) smoothed probability of the word in each class
        p_w_pos = (freqs.get((word,1),0) + 1) / (N_pos + V)
        p_w_neg = (freqs.get((word,0),0) + 1) / (N_neg + V)
        loglikelihood[word] = np.log(p_w_pos/p_w_neg)

    return logprior, loglikelihood

# Naive Bayes Predict

In [13]:
def naive_bayes_predict(text, logprior, loglikelihood):
    '''
    Input:
        text: a string
        logprior: a number
        loglikelihood: a dictionary of words mapping to numbers
    Output:
        score: logprior plus the log likelihood of every word of the text that is
        present in the dictionary (unknown words contribute nothing)
    '''
    # Tokenize with the default mode and keep only words the model has seen.
    known_words = [token for token in precess_sentence(text) if token in loglikelihood]

    # Start from the prior, then accumulate the per-word evidence in text order.
    score = 0 + logprior
    for token in known_words:
        score += loglikelihood[token]

    return score



In [14]:
# test_sent = 'this is simply the best.'
# p = naive_bayes_predict(test_sent, logprior, loglikelihood)
# print('The output is', p)

# Validation Naive Bayes

In [15]:
def test_naive_bayes(test_x, test_y, logprior, loglikelihood):
    """
    Input:
        test_x: A list of senteces
        test_y: the corresponding labels for the list of sentences
        logprior: the logprior
        loglikelihood: a dictionary with the loglikelihoods for each word
    Output:
        accuracy: (# of sentences classified correctly)/(total # of sentences)
    """
    accuracy = 0  # return this properly
    FP = 0
    FN = 0
    TP = 0
    TN = 0
    y_hats = []
    i=0
    for text in test_x:
        # if the prediction is > 0
        if naive_bayes_predict(text, logprior, loglikelihood) > 0:
            # the predicted class is 1
            y_hat_i = 1
        else:
            # otherwise the predicted class is 0
            y_hat_i = 0

        # append the predicted class to the list y_hats
        y_hats.append(y_hat_i)
        
        # count true positive 
        if y_hat_i ==1 and test_y[i]==1:
            TP=TP+1
        # count true negative 
        if y_hat_i ==0 and test_y[i]==0:
            TN=TN+1
        # count false positive 
        if y_hat_i ==0 and test_y[i]==1:
            FP=FP+1
        # count false negative 
        if y_hat_i ==1 and test_y[i]==0:
            FN=FN+1
        
        i=i+1
        
    # error is the average of the absolute values of the differences between y_hats and test_y
    error = np.mean(np.absolute(y_hats-test_y))

    # Accuracy is 1 minus the error
    accuracy = 1-error
    # Precision 
    precision = TP/(TP+FP)
    # Recall
    recall = TP/(TP+FN)

    return accuracy, precision, recall, TP, TN, FP, FN

# Model selection based on validation accuracy

In [16]:
# Model selection: train one model per validation fold for every split ratio in `k`
# and keep the parameters of the model with the highest validation accuracy.
max_accuracy_score = 0
logprior=0
# A dictionary (not a list), matching what train_naive_bayes returns.
loglikelihood={}
# Metrics of the selected model, so the final report matches the kept parameters.
best_precision = best_recall = 0
for k_frac in k:
    # One fold per validation window that fits into the corpus for this ratio.
    for i in range(1,int(1/(1-k_frac))+1):
        print("Iteration:",i)
        train_x, train_y, test_x, test_y = train_validation_split(k_frac,i)
        lgprior, lglikelihood = train_naive_bayes(train_x, train_y)
        accuracy, precision, recall, TP, TN, FP, FN = test_naive_bayes(test_x, test_y, lgprior, lglikelihood)
        # `<=` means a later fold wins ties.
        if max_accuracy_score<=accuracy:
            max_accuracy_score = accuracy
            best_precision, best_recall = precision, recall
            logprior, loglikelihood = lgprior, lglikelihood
    print("--")

print("logprior:",logprior)
# Report the selected model's metrics rather than those of the last fold evaluated.
print("Naive Bayes accuracy = %0.4f, precision= = %0.4f, recall= = %0.4f" %
      (max_accuracy_score, best_precision, best_recall))

Iteration: 1
2359
808
962
0.7341269841269841 -0.1744523921446115
Iteration: 2
2359
808
961
0.6403508771929824 -0.17341235044919756
Iteration: 3
2359
808
962
0.6926229508196722 -0.1744523921446115
Iteration: 4
2359
807
961
0.6720977596741344 -0.17465074070034348
--
Iteration: 1
2359
862
1026
0.7230769230769231 -0.17416775506702198
Iteration: 2
2359
862
1026
0.6346666666666667 -0.17416775506702198
Iteration: 3
2359
861
1025
0.6997389033942558 -0.17435338714477844
Iteration: 4
2359
862
1026
0.6994818652849741 -0.17416775506702198
Iteration: 5
2359
862
1026
0.6838046272493574 -0.17416775506702198
--
Iteration: 1
2359
970
1154
0.8095238095238095 -0.17369337557061648
Iteration: 2
2359
969
1154
0.648936170212766 -0.17472483517727877
Iteration: 3
2359
969
1154
0.746268656716418 -0.17472483517727877
Iteration: 4
2359
970
1154
0.5955056179775281 -0.17369337557061648
Iteration: 5
2359
969
1154
0.7083333333333334 -0.17472483517727877
Iteration: 6
2359
969
1153
0.7142857142857143 -0.173857908378292

# Best-Performing Model on Test Data

In [17]:
 """
   
    Output:
       test_x_hat: test feature vector
       test_y_hat: test target vector
    """
def split_test_data(path="PS1.1A_test_data.txt"):
    """
    Read the tab-separated test file and build the test sentences and labels.

    Input:
        path: file to read; defaults to the test file used by the notebook.
    Output:
        test_x_hat: list of lower-cased sentences, positives first, then negatives
        test_y_hat: numpy array of labels aligned with test_x_hat (1 = positive, 0 = negative)
    The positive and negative sentence counts are printed.
    """
    positiveText = []
    negativeText = []
    # `with` guarantees the file handle is closed, even if parsing fails part-way.
    with open(path, "r") as handle:
        for line in handle:
            fields = line.split("\t")
            # Blank or truncated rows have no label column; skip them instead of
            # failing with IndexError.
            if len(fields) < 3:
                continue
            if fields[2]=="POSITIVE":
                positiveText.append(fields[1].lower())
            elif fields[2]=="NEGATIVE":
                negativeText.append(fields[1].lower())

    print(len(positiveText))

    print(len(negativeText))

    test_x_hat = positiveText + negativeText
    # Labels are built from the actual list lengths, so no class size is assumed.
    test_y_hat = np.append(np.ones(len(positiveText)), np.zeros(len(negativeText)))

    return test_x_hat, test_y_hat
# Load the test set once; the two numbers printed are the positive and negative sentence counts.
test_x_hat, test_y_hat = split_test_data()

1002
1013


In [18]:
# Evaluate the parameters kept by the model-selection cell (module-level logprior / loglikelihood) on the test file.
accuracy, precision, recall, TP, TN, FP, FN = test_naive_bayes(test_x_hat, test_y_hat, logprior, loglikelihood)
print("On Test Data Naive Bayes accuracy = %0.4f, precision= = %0.4f, recall= = %0.4f" %
      (accuracy, precision, recall))

On Test Data Naive Bayes accuracy = 0.7285, precision= = 0.6826, recall= = 0.7492
