## Setup

In [1]:
import stanza
import datetime as dt
import pandas as pd
import bisect
from gensim import models,corpora,utils
from collections import defaultdict

In [7]:
corenlp_dir = 'C:\\Users\\Neal_McBeal\\Documents\\cs410project\\corenlp'

# Set the CORENLP_HOME environment variable to point to the installation location
import os
os.environ["CORENLP_HOME"] = corenlp_dir

# Import client module
from stanza.server import CoreNLPClient

In [8]:
# threads = pd.read_csv('threads.csv', dtype = {'tweet_id': str})
mask = (threads["length"] > 2) &  threads["verify_alternance"] & threads["inbound_first"] & threads["verify_thread"] & threads["verify_time"]  
threads_ok = threads[mask].copy()
threads_ok.set_index('tweet_id', inplace = True)

In [9]:
threads_ok.head()

Unnamed: 0_level_0,author_id,company_name,tweet_l,author_l,inbound_l,time_l,length,verify_thread,verify_time,verify_alternance,inbound_first
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2,115712,sprintcare,8|6|5|4|3|1|2,115712|sprintcare|115712|sprintcare|115712|spr...,True|False|True|False|True|False|True,2017-10-31 21:45:10+00:00|2017-10-31 21:46:24+...,7,True,True,True,True
11,sprintcare,sprintcare,18|17|16|15|12|11,115713|sprintcare|115713|sprintcare|115713|spr...,True|False|True|False|True|False,2017-10-31 19:56:01+00:00|2017-10-31 19:59:13+...,6,True,True,True,True
27,Ask_Spectrum,Ask_Spectrum,29|28|24|21|22|25|26|27,115716|Ask_Spectrum|115716|Ask_Spectrum|115716...,True|False|True|False|True|False|True|False,2017-10-31 22:01:35+00:00|2017-10-31 22:05:37+...,8,True,True,True,True
23,115716,Ask_Spectrum,29|28|24|21|23,115716|Ask_Spectrum|115716|Ask_Spectrum|115716,True|False|True|False|True,2017-10-31 22:01:35+00:00|2017-10-31 22:05:37+...,5,True,True,True,True
37,VerizonSupport,VerizonSupport,36|34|35|37,115719|VerizonSupport|115719|VerizonSupport,True|False|True|False,2017-10-31 22:10:46+00:00|2017-10-31 22:13:33+...,4,True,True,True,True


In [2]:
full_df = pd.read_csv("emojiTranslatedCleanedNoUnderscore.csv", na_filter= False, parse_dates = ['created_at'],
                      dtype = {'tweet_id': str,'in_response_to_tweet_id': str, 'inbound':bool, 'response_tweet_id':str })
full_df.set_index("tweet_id", inplace = True)

In [3]:
full_df.head(15)

Unnamed: 0_level_0,Unnamed: 0,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,sprintcare,False,2017-10-31 22:10:47+00:00,I understand. I would like to assist you. We ...,2.0,3.0
2,1,115712,True,2017-10-31 22:11:45+00:00,@sprintcare and how do you propose we do that,,1.0
3,2,115712,True,2017-10-31 22:08:27+00:00,@sprintcare I have sent several private messag...,1.0,4.0
4,3,sprintcare,False,2017-10-31 21:54:49+00:00,Please send us a Private Message so that we c...,3.0,5.0
5,4,115712,True,2017-10-31 21:49:35+00:00,@sprintcare I did.,4.0,6.0
6,5,sprintcare,False,2017-10-31 21:46:24+00:00,"Can you please send us a private message, so ...",57.0,8.0
8,6,115712,True,2017-10-31 21:45:10+00:00,@sprintcare is the worst customer service,9610.0,
11,7,sprintcare,False,2017-10-31 22:10:35+00:00,This is saddening to hear. Please shoot us a ...,,12.0
12,8,115713,True,2017-10-31 22:04:47+00:00,@sprintcare You gonna magically change your co...,111314.0,15.0
15,9,sprintcare,False,2017-10-31 20:03:31+00:00,We understand your concerns and we'd like for...,12.0,16.0


## Sentiment Analysis

**Warning! This will take around 9 hours to run. Output has been written to con_sent.csv. Recommended to skip this section and go to "Observing Deltas" instead**

In [12]:
sentiment_pairs_df = pd.DataFrame(columns=['first_tweet_id','second_tweet_id','third_tweet_id','company','sentiment_change'])

In [32]:
senti_value_order = ["Very negative", "Negative", "Neutral", "Positive", "Very positive"]
def sentiment_score(sent_string):
    if len(sent_string) > 0:
        return senti_value_order.index(sent_string)
    return 2

def sentiment_average(sent_list):
    if len(sent_list) == 0:
        return 0
    total = 0
    for sent in sent_list:
        total += sentiment_score(sent)
    return total / len(sent_list)

# sentiment_pairs = []
company_col, tweetl_col, authorl_col, inboundl_col = \
    threads_ok.columns.get_indexer(['company_name','tweet_l','author_l','inbound_l'])
with CoreNLPClient(annotators=['sentiment'], 
                   memory='6G', endpoint='http://localhost:9002', be_quiet=True, timeout = 200000) as client:
    print("Processing", len(threads_ok) ,"number of valid threads...")
    for row in range(119232, len(threads_ok)):
        if row % 1000 == 0:
            print("Working on thread number: ", row, "time: ", dt.datetime.now().time())
        company, raw_tweetl, raw_authorl, raw_inboundl = \
            threads_ok.iloc[row,[company_col, tweetl_col, authorl_col, inboundl_col]]
        tweet_list = raw_tweetl.split('|')
        author_list = raw_authorl.split('|')
        inbound_list = raw_inboundl.split('|')
        
        # process first tweet standalone
        tweet_idx3 = None
        tweet_idx1 = tweet_list[0]
        tweet1_text = full_df.loc[tweet_idx1, 'text']
        doc1 = client.annotate(tweet1_text)
        sent_list = []
        if len(tweet1_text) > 0:
            for sentence in doc1.sentence:
                sent_list.append(sentence.sentiment)

        doc1_sentiment = 2 if len(sent_list) == 0 else sentiment_average(sent_list)
        
        # process all consecutive pairs of customer tweets in the thread
        for i in range(1,len(tweet_list)-1,2):
            if not tweet_idx3 == None:
                tweet_idx1 = tweet_idx3
                tweet1_text = tweet3_text
                doc1_sentiment = doc3_sentiment
            
            tweet_idx2, tweet_idx3 = tweet_list[i:i+2]
            tweet2_text = full_df.loc[tweet_idx2, 'text']
            tweet3_text = full_df.loc[tweet_idx3, 'text']
            
            doc3 = client.annotate(tweet3_text)
            
            sent_list = []
            if len(tweet3_text) > 0:
                for sentence in doc3.sentence:
                    sent_list.append(sentence.sentiment)
                
            doc3_sentiment = 2 if len(sent_list) == 0 else sentiment_average(sent_list)
            
            delta = doc3_sentiment - doc1_sentiment
            sentiment_pairs.append([tweet_idx1, tweet_idx2, tweet_idx3, company, delta])
            

2020-11-30 01:52:56 INFO: Writing properties to tmp file: corenlp_server-d355efdad98d4661.props
2020-11-30 01:52:56 INFO: Starting server with command: java -Xmx6G -cp C:\Users\Neal_McBeal\Documents\cs410project\corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9002 -timeout 200000 -threads 5 -maxCharLength 100000 -quiet True -serverProperties corenlp_server-d355efdad98d4661.props -annotators sentiment -preload -outputFormat serialized


Processing 122609 number of valid threads...
Working on thread number:  120000 time:  01:57:34.741243
Working on thread number:  121000 time:  02:00:16.186114
Working on thread number:  122000 time:  02:03:00.530691


In [36]:
sentiment_pairs_df = pd.DataFrame(data=sentiment_pairs, columns=['first_tweet_id','second_tweet_id','third_tweet_id','company','sentiment_change'])

In [37]:
sentiment_pairs_df.to_csv("con_sent.csv")

In [29]:
for sp in sentiment_pairs:
    print(sp)
    if sp[4] >= 2.5:
        print(full_df.loc[sp[0], 'text'])
        print(full_df.loc[sp[1], 'text'])
        print(full_df.loc[sp[2], 'text'])

['8', '6', '5', 'sprintcare', 1.0]
['5', '4', '3', 'sprintcare', -1.0]
['3', '1', '2', 'sprintcare', 1.0]
['18', '17', '16', 'sprintcare', 1.3333333333333333]
['16', '15', '12', 'sprintcare', -0.5]
['29', '28', '24', 'Ask_Spectrum', 1.0]
['24', '21', '22', 'Ask_Spectrum', -1.0]
['22', '25', '26', 'Ask_Spectrum', 1.0]
['29', '28', '24', 'Ask_Spectrum', 1.0]
['24', '21', '23', 'Ask_Spectrum', -1.0]
['36', '34', '35', 'VerizonSupport', 3.0]
somebody from @VerizonSupport please help meeeeee  weary face  weary face  weary face  weary face  I'm having the worst luck with your customer service
 Help has arrived! We are sorry to see that you are having trouble. How can we help?
@VerizonSupport I finally got someone that helped me, thanks!
['59', '58', '57', 'VerizonSupport', 0.0]
['57', '56', '55', 'VerizonSupport', 1.0]
['55', '54', '53', 'VerizonSupport', 0.0]
['53', '52', '51', 'VerizonSupport', -0.5]
['66', '64', '65', 'ChipotleTweets', 1.0]
['76', '75', '74', 'ChipotleTweets', 0.5]
['163'

## Observing Deltas

In [4]:
sentiment_pairs_df = pd.read_csv("con_sent.csv")
sentiment_pairs_df.set_index('Unnamed: 0', inplace = True)
sentiment_pairs_df.head()

Unnamed: 0_level_0,first_tweet_id,second_tweet_id,third_tweet_id,company,sentiment_change
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,8,6,5,sprintcare,1.0
1,5,4,3,sprintcare,-1.0
2,3,1,2,sprintcare,1.0
3,18,17,16,sprintcare,1.333333
4,16,15,12,sprintcare,-0.5


In [55]:
num_rows = sentiment_pairs_df.shape[0]
def lower_sent_for_top_p_responses(percent=.001):
    num_top = round(num_rows * percent)
    deltas = []
    
    for row in range(num_rows):  
        change = sentiment_pairs_df.iloc[row][4]
        bisect.insort(deltas, change)
        
        if len(deltas) > num_top:
            deltas.pop(0)
            
    return deltas        

In [56]:
top001 = lower_sent_for_top_p_responses()
top001

[2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.0,
 2.1666666666666665,
 2.166666666666667,
 2.2,
 2.2,
 2.2,
 2.2,
 2.25,
 2.25,
 2.25,
 2.333333333333333,
 2.333333333333333,
 2.333333333333333,
 2.333333333333333,
 2.333333333333333,
 2.333333333333333,
 2.333333333333333,
 2.333333333333333,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.5,
 2.666666666666667,
 2

In [7]:
num_rows = sentiment_pairs_df.shape[0]
def get_rows_above_delta2():
    row_ids = []
    
    for row in range(num_rows):
        row_data = sentiment_pairs_df.iloc[row]
        change = row_data[4]
        if change >= 2:
            row_ids.append(row_data)
        
    return row_ids

def get_rows_at_delta3():
    row_ids = []
    
    for row in range(num_rows):
        row_data = sentiment_pairs_df.iloc[row]
        change = row_data[4]
        if change == 3:
            row_ids.append(row_data)
        
    return row_ids

def get_rows_below_deltaneg2():
    row_ids = []
    
    for row in range(num_rows):
        row_data = sentiment_pairs_df.iloc[row]
        change = row_data[4]
        if change <= -2:
            row_ids.append(row_data)
        
    return row_ids

def get_rows_at_deltaneg3():
    row_ids = []
    
    for row in range(num_rows):
        row_data = sentiment_pairs_df.iloc[row]
        change = row_data[4]
        if change == -3:
            row_ids.append(row_data)
        
    return row_ids

In [8]:
good_rows = get_rows_above_delta2()
best_rows = get_rows_at_delta3()
bad_rows = get_rows_below_deltaneg2()
worst_rows = get_rows_at_deltaneg3()

In [9]:
separator = '----------------------------------------'
def print_best_examples(num_to_print=5):
    print(separator)
    for i in range(num_to_print):
        row = best_rows[i]
        first_text = full_df.loc[str(row[0])]['text']
        second_text = full_df.loc[str(row[1])]['text']
        third_text = full_df.loc[str(row[2])]['text']
        
        print("customer:", first_text)
        print("response:", second_text)
        print("customer:", third_text)
        print(separator)
        
def print_worst_examples(num_to_print=5):
    print(separator)
    for i in range(num_to_print):
        row = worst_rows[i]
        first_text = full_df.loc[str(row[0])]['text']
        second_text = full_df.loc[str(row[1])]['text']
        third_text = full_df.loc[str(row[2])]['text']
        
        print("customer:", first_text)
        print("response:", second_text)
        print("customer:", third_text)
        print(separator)

In [10]:
print_best_examples()
print_worst_examples()

----------------------------------------
customer: somebody from @VerizonSupport please help meeeeee  weary face  weary face  weary face  weary face  I'm having the worst luck with your customer service
response:  Help has arrived! We are sorry to see that you are having trouble. How can we help?
customer: @VerizonSupport I finally got someone that helped me, thanks!
----------------------------------------
customer: @AppleSupport Thanks, thing is I still have like 81 cents in credit and won't let me do that until I have zero credit
response:  Try contacting our iTunes Store team here for more help: 
customer: @AppleSupport Awesome, thanks
----------------------------------------
customer: @Uber_Support Thanks - our baby is &lt;12mos and we need our car seat to fly so I was wondering if we can bring our own car seat and install via belt buckle in any UberX to go to the airport
response:  Hi there! Yes, you're always welcome to bring your own car seat along for the ride.
customer: @Uber

In [11]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

In [12]:
stopwords = open('stopwords.txt',"r")
stoplist = stopwords.read().splitlines() 

def get_texts(rows):
    texts = []
    for row in rows:
        cs_tweet_id = row[1]
        cs_tweet_text = full_df.loc[str(cs_tweet_id)]['text']
        cs_tweet_text = [word for word in cs_tweet_text.lower().split() if word not in stoplist]
        texts.append(cs_tweet_text)
        
    return texts
    
goodTweetList = list(sent_to_words(get_texts(best_rows)))
badTweetList = list(sent_to_words(get_texts(worst_rows)))
gTweetList = list(sent_to_words(get_texts(good_rows)))
bTweetList = list(sent_to_words(get_texts(bad_rows)))

In [13]:
goodFrequency = defaultdict(int)
goodTotal = 0
for text in goodTweetList:
    for token in text:
        goodFrequency[token] += 1
        goodTotal += 1

badFrequency = defaultdict(int)
badTotal = 0
for text in badTweetList:
    for token in text:
        badFrequency[token] += 1
        badTotal += 1
        
gFrequency = defaultdict(int)
gTotal = 0
for text in gTweetList:
    for token in text:
        gFrequency[token] += 1
        gTotal += 1
        
bFrequency = defaultdict(int)
bTotal = 0
for text in bTweetList:
    for token in text:
        bFrequency[token] += 1
        bTotal += 1

In [14]:
def top_words(frequency, total, top):
    measures = []
    
    for word in frequency:
        count = frequency[word]
        if len(measures) < top or count > measures[0]:
            bisect.insort(measures, count)
            if len(measures) > top:
                measures.pop(0)
    ret = []
    for word in frequency:
        count = frequency[word]
        if count >= measures[0]:
            ret.append([word, count*100.0/total])
            
    return ret

In [15]:
top_words(goodFrequency, goodTotal, top=10)

[['sorry', 2.5641025641025643],
 ['hi', 2.7777777777777777],
 ['service', 1.9230769230769231],
 ['please', 1.9230769230769231],
 ['dm', 1.9230769230769231],
 ['lapse', 1.2820512820512822],
 ['general', 1.2820512820512822],
 ['nature', 1.2820512820512822],
 ['concern', 1.2820512820512822],
 ['clarissa', 1.2820512820512822]]

In [16]:
top_words(gFrequency, gTotal, top=11)

[['help', 1.3339657319025315],
 ['sorry', 1.5859370368174541],
 ['look', 0.7025552854686666],
 ['you', 1.093851900160076],
 ['dm', 1.1323886879705936],
 ['please', 2.410031422303907],
 ['like', 0.744056441572301],
 ['hi', 1.5385071441275864],
 ['here', 1.0108495879528072],
 ['let', 0.8804173830556709],
 ['know', 0.7974150708484022]]

In [17]:
top_words(badFrequency, badTotal, top=11)

[['please', 3.061224489795918],
 ['we', 1.530612244897959],
 ['that', 2.0408163265306123],
 ['purchase', 1.530612244897959],
 ['steam', 3.061224489795918],
 ['key', 1.530612244897959],
 ['let', 2.0408163265306123],
 ['know', 2.0408163265306123],
 ['concern', 1.530612244897959],
 ['could', 1.530612244897959],
 ['ok', 1.530612244897959],
 ['sent', 1.530612244897959],
 ['signals', 1.530612244897959],
 ['box', 1.530612244897959],
 ['helps', 1.530612244897959],
 ['allan', 1.530612244897959],
 ['best', 1.530612244897959]]

In [18]:
top_words(bFrequency, bTotal, top=11)

[['let', 0.840945338552994],
 ['know', 0.7636170315596152],
 ['help', 1.367744429945387],
 ['you', 1.2517519694553187],
 ['please', 2.237687883620898],
 ['dm', 0.9521047798559761],
 ['there', 0.8216132618046493],
 ['here', 0.9376057222947175],
 ['hi', 1.681890677105988],
 ['sorry', 1.6915567154801605],
 ['thank', 0.6862887245662366]]