In [1]:
import sys
import argparse
import os
import json
import numpy as np
import html
import re
import time
import spacy
import string
import csv
import pandas as pd

In [2]:
indir = '/Users/jasonzhou/Downloads/Hack_data'
abbv_dir_1 = '/Users/jasonzhou/Downloads/NLP_origin/A1/Wordlists/abbrev.english'
abbv_dir_2 = '/Users/jasonzhou/Downloads/NLP_origin/A1/Wordlists/pn_abbrev.english'
clitics_dir = '/Users/jasonzhou/Downloads/NLP_origin/A1/Wordlists/clitics'
stop_dir = '/Users/jasonzhou/Downloads/NLP_origin/A1/Wordlists/StopWords'

In [3]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [4]:
def preproc1( comment , steps=range(1,11)):
    ''' This function pre-processes a single comment

    Parameters:
        comment : string, the body of a comment
        steps   : list of ints, each entry in this list corresponds to a preprocessing step

    Returns:
        modComm : string, the modified comment
    '''
    # print(comment)
    modComm = ''
    if 1 in steps:
        #Remove all newline characters
        # comment = comment.replace("\n"," ")
        comment = ' '.join([line.strip() for line in comment.strip().splitlines()])
    if 2 in steps:
        #Replace HTML character codes (i.e., &...;) with their ASCII equivalent
        comment = html.unescape(comment)
    if 3 in steps:
        #Remove all URLs (i.e., tokens beginning with http or www)
        comment = re.sub(r'http\S+', '', comment, flags=re.MULTILINE)
        comment = re.sub(r'www\S+', '', comment, flags=re.MULTILINE)
        comment = re.sub(r'@\S+','',comment, flags=re.MULTILINE)
        # comment = re.sub(r'(?:(?:www|http|https):\/\/)?([-a-zA-Z0-9.]{2,256}\.[a-z]{2,4})\b(?:\/[-a-zA-Z0-9@:%_\+.~#?&//=]*)?',"",comment,flags=re.MULTILINE)
        # reference: https://stackoverflow.com/questions/38804425/remove-urls-from-a-text-file
    if 4 in steps:
        abbrev_1 = open(abbv_dir_1).read().split('\n')
        abbrev_2 = open(abbv_dir_2).read().split('\n')
        abbreviation = abbrev_1 + abbrev_2
        if '' in abbreviation:
            abbreviation.remove('')
        # punctuation = "!\"#$%&()*+,\\-./:;<=>?@[]^_{|}~"

        temp = ''
        i = 0
        while i < len(comment):
            isException = False
            if i == 0 or not(comment[i-1].isalpha()):
                for word in abbreviation:
                    if comment[i:].lower().startswith(word.lower()):
                        temp += comment[i:i+len(word)]
                        i += len(word)
                        isException = True
                        break
            if not(isException):
                if comment[i] in string.punctuation and comment[i] != "'":
                    temp += ' '
                    while i < len(comment) and (comment[i] in string.punctuation and comment[i] != "'"):
                        temp += comment[i]
                        i += 1
                    temp += ' '
                else:
                    temp += comment[i]
                    i += 1

        comment = temp.replace('  ', ' ')

    if 5 in steps:
        clitics = open(clitics_dir).read().split('\n')
        # clitics.append("'t")
        clitics.append("s'")
        for word in clitics:
            if word == "s'":
                comment = comment.replace(word,"s '")
            else:
                comment = comment.replace(word," "+word)
        # print(comment)

    if 6 in steps:
        # print(comment)
        stop_words = open(stop_dir).read().split('\n')
        stop_words = list(filter(('').__ne__, stop_words))
        comment = comment.split(" ")
        comment = list(filter(('').__ne__, comment))
        temp = ''
        # print(comment)
        for word in comment:
            if word.lower() not in stop_words:
                temp += word + ' '
        # print(temp)
        comment = temp

    if 7 in steps:
        utt = nlp(comment)
        temp = ''
        for token in utt:
            if token.lemma_.startswith('-'):
                temp += token.text
            else:
                temp += token.lemma_
            temp += ' '
        # print(temp)
        comment = temp

    if 8 in steps:
        comment = comment.lower()

    if 9 in steps:
        comment = comment.split(" ")
        comment = list(filter(('').__ne__, comment))
        temp = ''
        for word in comment:
            if word.isalpha():
                temp += word + ' '
        comment = temp

    modComm = comment
    return modComm

In [6]:
def buildVolcab(data):
    totalVolcab = []
    for tweet in data:
        volcab = {}
        sent = tweet.split(' ')
        for word in sent:
            if word not in volcab.keys():
                volcab[word] = 1
            else:
                volcab[word] += 1
        totalVolcab.append(volcab)
    return totalVolcab

In [7]:
def buildwordList(totalVolcab):
    wordList = []
    for volcab in totalVolcab:
        for word in volcab.keys():
            if word not in wordList:
                wordList.append(word)
    return wordList

In [8]:
def buildMatrix(wordList, totalVolcab):
    matrix = np.zeros((len(totalVolcab),len(wordList)))
    for i, volcab in enumerate(totalVolcab):
        for j, key_word in enumerate(wordList):
            if key_word in volcab.keys():
                matrix[i][j] = volcab[key_word]
    return matrix

In [8]:
#main
for subdir, dirs, files in os.walk(indir):
    for file in files:
        fullFile = os.path.join(subdir, file)
        print( "Processing " + fullFile)

        scores, sents = [], []
        if not file.startswith('.DS_Store'):
            df = pd.read_csv(fullFile,sep='delimiter',header=None)
            indices = np.arange(len(df))
            np.random.shuffle(indices)
            tweet = df.iloc[0,0].split(',')[5]
            j = 0
            for i in range(100000):
                j += 1
                if j >= 10000:
                    print(i)
                    j = 0
                index = indices[i]
                tweet = df.iloc[index,0].split(',')[5]
                tweet = tweet.lstrip('\"')
                tweet = tweet.rstrip('\"')

                sent = preproc1(tweet,range(1,10))
                sents.append(sent)

                score = df.iloc[index,0].split(',')[0]
                score = score.lstrip('\"')
                score = score.rstrip('\"')
                score = int(score) - 2
                scores.append(score)

    print(len(scores))
    print(len(sents))
    print(len(scores) == len(sents))

Processing /Users/jasonzhou/Downloads/Hack_data/.DS_Store
Processing /Users/jasonzhou/Downloads/Hack_data/training_1600000_processed_noemoticon.csv


  # Remove the CWD from sys.path while we load stuff.


9999
19999
29999
39999
49999
59999
69999
79999
89999
99999
100000
100000
True


In [9]:
totalVolcab = buildVolcab(sents)

In [10]:
wordList = buildwordList(totalVolcab)

In [11]:
matrix = buildMatrix(wordList, totalVolcab)

In [12]:
print(matrix.shape)
scoreMatrix = np.array(scores).reshape((len(scores),1))
print(scoreMatrix.shape)
fullMatrix = np.concatenate((matrix,scoreMatrix),axis=1)
print(fullMatrix.shape)

(100000, 41186)
(100000, 1)
(100000, 41187)


In [9]:
def getlookupDict(fullMatrix, wordList):
    lookupDict = {}
    for i, word in enumerate(wordList):
        if word not in lookupDict.keys():
            word_scores = (fullMatrix[:,i].T.dot(fullMatrix[:,-1]) / sum(fullMatrix[:,i]))
            lookupDict[word] = word_scores
    return lookupDict

In [14]:
lookupDict = getlookupDict(fullMatrix, wordList)

In [10]:
def getScore(sentence, lookupDict):
    sum_score = 0
    sent = preproc1(sentence,range(1,10))
    sent_list = sent.split(' ')
    for word in sent_list:
        if word in lookupDict.keys():
            sum_score += lookupDict[word]
            
    return sum_score / len(sent_list)

In [None]:
# print(fullMatrix[:,-1])

In [16]:
test_sentence = "I am so thrilled to see you and wish we made an awesome team at the hackathon"
print("score: ", getScore(test_sentence,lookupDict))

score:  0.4367535884618149


In [17]:
test_sentence = "If I had pulled my wife and daughter closer, they wouldn't fall into the crack"
print("score: ", getScore(test_sentence,lookupDict))

score:  -0.2560452650625948


In [18]:
test_sentence = "Well, the ground is shaking, vertically and horizontally"
print("score: ", getScore(test_sentence,lookupDict))

score:  0.22569599999999998


In [19]:
test_sentence = "Did you see any signs of where the crack would be before it happened"
print("score: ", getScore(test_sentence,lookupDict))

score:  -0.22440914922701424


In [20]:
test_sentence = "Not really. It was too chaotic for me to notice that"
print("score: ", getScore(test_sentence,lookupDict))

score:  -0.4549147663551402


In [21]:
test_sentence = "Do you think your family would expect that from you"
print("score: ", getScore(test_sentence,lookupDict))

score:  0.03991694659635183


In [22]:
test_sentence = "True. You know what, I am feeling much better now"
print("score: ", getScore(test_sentence,lookupDict))

score:  -0.22947730464359686


In [23]:
test_sentence = "At the time of the assult, I felt frightened, I felt angry, confused, roughly I don't know what was happening to me"
print("score: ", getScore(test_sentence,lookupDict))

score:  -0.4894487019807366


In [24]:
test_sentence = "I'm emotional, devastated and I'm distressed, because I can't find anywhere to stay"
print("score: ", getScore(test_sentence,lookupDict))

score:  -0.5873432192122001


In [27]:
fout = open('lookup_dict.json','w')
fout.write(json.dumps(lookupDict))
fout.close()

In [8]:
def getScore_display(sentence, lookupDict):  
    sum_score = 0
    sent = preproc1(sentence,range(1,10))
    sent_list = sent.split(' ')
    sent_list = list(filter(('').__ne__, sent_list))
    word_score = {}
    for word in sent_list:
        if word in lookupDict.keys():
            sum_score += lookupDict[word]
            if word not in word_score.keys():
                word_score[word] = lookupDict[word]
    #print(sorted(word_score.items()))
    count = 0
    for key, value in sorted(word_score.items()):
        if value <= -0.5:
            count += 1
            if count <= 2:
                print("%s: %.3f" %(key, value))
            else:
                break
    return sum_score / len(sent_list)

In [6]:
local_lookupDict = json.load(open('lookup_dict.json'))

In [80]:
test_sentence = "I am emotional, devastated and I am distressed, because I cannot find anywhere to stay"
print("score: ", getScore_display(test_sentence,lookupDict,local=True))

devastate: -2.000
emotional: -0.800
score:  -0.7205170291820508


In [81]:
test_sentence = "At the time of the assult, I felt frightened, I felt angry, confused, roughly I don't know what was happening to me"
print("score: ", getScore_display(test_sentence,lookupDict,local=True))

angry: -1.404
confused: -1.125
score:  -0.5342604021608035


In [88]:
test_sentence = "I am so thrilled to see you and wish we can make an awesome team at the hackathon"
print("score: ", getScore_display(test_sentence,lookupDict,local=True))

[('awesome', 1.1904761904761905), ('hackathon', 2.0), ('team', 0.4383561643835616), ('thrilled', 0.8), ('wish', -1.1528150134048258)]
wish: -1.153
score:  0.6552034682909852


In [89]:
test_sentence = "I am so thrilled to see you and we will make an awesome team at the hackathon"
print("score: ", getScore_display(test_sentence,lookupDict,local=True))

[('awesome', 1.1904761904761905), ('hackathon', 2.0), ('team', 0.4383561643835616), ('thrilled', 0.8)]
score:  1.107208088714938


In [22]:
test_sentence = "really bad"
print("score: ", getScore_display(test_sentence,local_lookupDict))

[('bad', -1.2114285714285715)]
bad: -1.211
score:  -1.2114285714285715


In [26]:
test_sentence = "Terrible."
print("score: ", getScore_display(test_sentence,local_lookupDict))

[('terrible', -1.4901960784313726)]
terrible: -1.490
score:  -1.4901960784313726


In [27]:
test_sentence = "The nightmares kept bothering me."
print("score: ", getScore_display(test_sentence,local_lookupDict))

[('bother', -1.044776119402985), ('nightmare', -1.368421052631579)]
bother: -1.045
nightmare: -1.368
score:  -1.2065985860172819


In [29]:
test_sentence = "My wife. If I had pulled her closer, she would not fall into the crack."
print("score: ", getScore_display(test_sentence,local_lookupDict))

[('close', -0.8494983277591973), ('crack', -0.058823529411764705), ('fall', -0.8636363636363636), ('pull', -0.49122807017543857), ('wife', 0.09345794392523364)]
close: -0.849
fall: -0.864
score:  -0.4339456694115061


In [30]:
test_sentence = "I am a terrible husband."
print("score: ", getScore_display(test_sentence,local_lookupDict))

[('husband', -0.32653061224489793), ('terrible', -1.4901960784313726)]
terrible: -1.490
score:  -0.9083633453381352
