# 1. Data collection - web scraper

In [1]:
import urllib.request
import re
from bs4 import BeautifulSoup
import nltk
from stanfordcorenlp import StanfordCoreNLP
import numpy as np

#web request
def get_web(keyword):
    '''
    input: quora keywords
    output：str, webpage
    '''
    response = urllib.request.urlopen('https://www.quora.com/'+keyword)
    response = response.read().decode('utf-8')
    return response

#write to txt
def write_to_txt(obj,keywords):
    '''
    input: file names, quora keywords
    output：txt
    '''
    result = ''
    for keyword in keywords:
        response = get_web(keyword)
        html = BeautifulSoup(response, 'html.parser')
        response = html.select('.ui_qtext_expanded')
        string = '\n'+'\n' + str(response)
        result += string

    with open(str(obj)+'.txt','a',encoding='utf-8') as f:
        f.write(result)
        
def write_to_text(name, obj):
    '''
    input: file names, quora keywords
    output：txt
    '''
    with open(str(name)+'.txt','a',encoding='utf-8') as f:
        f.write(obj)  
        
#read txt
def read_txt(obj):
    '''
    input：txt文件名
    output：str文档
    '''
    with open(str(obj)+'.txt','r',encoding='utf-8') as f:
        obj = f.read()
    return obj

#data cleaning
def clean_corpus(content):
    '''
    input：带有非法字符、标点的str评论文本str
    output：清洗干净的只保留-连字符的纯评论文本str
    '''
    content = re.sub('<(.*?)>','',content)
    content = re.sub('#(.*?);','',content)
    content = re.sub('(\n)','',content)
    #content = re.sub('(n&#039;)','_no',content)
    content = re.sub('(\\U0001f60b)','',content)
    content = re.sub('(\xa0)','',content)
    content = re.sub('(\u200b)','',content)
    content = re.sub('http://(.*?)/','',content)
    content = re.sub('[<>#&:;/"\(\)\—…\[\]|•_]', '',content)
    content = re.sub('(\'){2,}', '',content)
    content = re.sub('(http)','',content)
    content = re.sub('www\.(.*?)\.com','',content)
    content = re.sub('(www.)','',content)
    content = re.sub('(.com)','',content)
    content = re.sub('( - )','',content)
    content = re.sub('(--)','',content)    
    content = re.sub('( – )','',content)  
    content = re.sub('(\.){2,}','',content)
    content = re.sub('(\. , )','.',content)
    content = re.sub('(etc)','',content)
    content = re.sub('\.','. ',content)
    content = re.sub('\?','? ',content)
    content = re.sub(' {2,}',' ',content)  
    content = re.sub('alexa','Alexa',content)
    content = re.sub('ALEXA','Alexa',content )
    content = re.sub('siri','Siri',content )
    content = re.sub('SIRI','Siri',content )
    content = re.sub('cortana','Cortana',content )
    content = re.sub('CORTANA','Cortana',content )
    content = re.sub('google','Google',content )
    content = re.sub('GOOGLE','Google',content )
    return content

In [None]:
#quora keyword lists
alexa = ['What-are-the-best-Amazon-Alexa-Skills-to-enable',\
         'Is-Alexa-from-Amazon-useful',\
         'What-is-your-review-of-Amazon-Alexa-1',\
         'Do-you-use-Alexa-Is-it-useful'
           ]
siri = ['Why-is-Siri-important',\
        'What-does-Siri-do-well',\
        'Why-is-Siri-so-unintelligent',\
        'Is-Siri-an-example-of-artificial-intelligence',\
           ]

cortana = ['Is-Cortana-better-than-Siri',\
           'What-can-Cortana-do-that-Siri-cant',\
           'What-is-your-review-of-Cortana',\
           'Is-cortana-a-big-failure',\
           ]

google = ['Is-Google-smarter-than-Siri',\
          'How-smart-is-Google-Assistant',\
          'What-are-the-disadvantages-of-Google-Assistant',\
          'Do-you-think-tha-Google-now-assistant-have-surpassed-Siri'
           ]
mix = ['Which-is-a-better-assistant-Cortana-or-Siri-or-Google-Now',\
       'Which-is-better-Siri-or-Google-Assistant-1', \
       'Is-Amazon-Echo-and-or-Siri-and-other-voice-assistants-actually-useful-or-is-it-just-a-novelty-Are-usage-and-retention-of-these-products-growing',\
       'Who-is-more-useful-Alexa-or-Siri',\
       'Which-voice-helper-is-the-best-Alexa-Siri-or-Google-Assistant',\
       'Which-one-s-better-Google-Home-or-Alexa',\
       'Which-one-do-you-think-is-best-Alexa-Amazon-Siri-Apple-or-Cortana-Microsoft',\
       'Which-voice-helper-is-the-best-Alexa-Siri-or-Google-Assistant',\
       'Which-is-a-better-assistant-Cortana-or-Siri-or-Google-Now'
           ]

alexa = clean_corpus(alexa)
siri = clean_corpus(siri)
cortana = clean_corpus(cortana)
google = clean_corpus(google)
mix = clean_corpus(mix)

#for i in [alexa,siri,cortana,google,mix]:
    #write_to_text(str(i),i)
write_to_text('alexa',alexa)
write_to_text('siri',siri)
write_to_text('cortana',cortana)
write_to_text('google',google)
write_to_text('mix',mix)

# 2. Text classification

In [2]:
#Punkt sentence segmenter (Kiss & Strunk, 2006)
def segment(doc):
    assert isinstance(doc, str)
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(doc)
    return sents

#### 1. Sort sentences in Mix

In [None]:
import pprint
    
mix_alexa = []
for s in seg_mix:
    if 'alexa' in s.lower():
        mix_alexa.append(s)
    elif 'amazon' in s.lower():
        mix_alexa.append(s)
    elif ' echo ' in s.lower():
        mix_alexa.append(s)
        
mix_siri = []
for s in seg_mix:
    if 'siri' in s.lower():
        mix_siri.append(s)
    elif 'apple' in s.lower():
        mix_siri.append(s)
        
mix_cortana = []
for s in seg_mix:
    if 'cortana' in s.lower():
        mix_cortana.append(s)
    elif 'microsoft' in s.lower():
        mix_cortana.append(s)

mix_google = []
for s in seg_mix:
    if 'google' in s.lower():
        mix_google.append(s)

print(len(mix_alexa),len(mix_siri),len(mix_cortana),len(mix_google),len(seg_mix))

In [None]:
seg_alexa = segment(alexa)
seg_siri = segment(siri) 
seg_cortana = segment(cortana) 
seg_google = segment(google) 

In [None]:
print(len(seg_alexa),len(seg_siri),len(seg_cortana),len(seg_google))
# 891 898 233 169

#### 2. Move sentences in the four classified corpora to right corpus

In [None]:
for s in seg_alexa:
    if 'google' in s.lower():
        mix_google.append(s)
        seg_alexa.remove(s)
    elif 'google-assistant' in s.lower():
        mix_google.append(s)
        seg_alexa.remove(s)
    elif 'cortana' in s.lower():
        mix_cortana.append(s)
        seg_alexa.remove(s)
    elif 'microsoft' in s.lower():
        mix_cortana.append(s)
        seg_alexa.remove(s)
    elif 'siri' in s.lower():
        mix_siri.append(s)
        seg_alexa.remove(s)
    elif 'apple' in s.lower():
        mix_siri.append(s)
        seg_alexa.remove(s)
    else:
        continue
        
for s in seg_siri:
    if 'google' in s.lower():
        mix_google.append(s)
        seg_siri.remove(s)
    elif 'google-assistant' in s.lower():
        mix_google.append(s)
        seg_siri.remove(s)
    elif 'cortana' in s.lower():
        mix_cortana.append(s)
        seg_siri.remove(s)
    elif 'microsoft' in s.lower():
        mix_cortana.append(s)
        seg_siri.remove(s)
    elif 'alexa' in s.lower():
        mix_alexa.append(s)
        seg_siri.remove(s)
    elif 'amazon' in s.lower():
        mix_alexa.append(s)
        seg_siri.remove(s)
    elif 'echo' in s.lower():
        mix_alexa.append(s)
        seg_siri.remove(s)
    else:
        continue

for s in seg_cortana:
    if 'google' in s.lower():
        mix_google.append(s)
        seg_cortana.remove(s)
    elif 'google-assistant' in s.lower():
        mix_google.append(s)
        seg_cortana.remove(s)
    elif 'siri' in s.lower():
        mix_siri.append(s)
        seg_cortana.remove(s)
    elif 'apple' in s.lower():
        mix_siri.append(s)
        seg_cortana.remove(s)
    elif 'alexa' in s.lower():
        mix_alexa.append(s)
        seg_cortana.remove(s)
    elif 'amazon' in s.lower():
        mix_alexa.append(s)
        seg_cortana.remove(s)
    elif 'echo' in s.lower():
        mix_alexa.append(s)
        seg_cortana.remove(s)
    else:
        continue
        
for s in seg_google:
    if 'alexa' in s.lower():
        mix_alexa.append(s)
        seg_google.remove(s)
    elif 'amazon' in s.lower():
        mix_alexa.append(s)
        seg_google.remove(s)
    elif 'echo' in s.lower():
        mix_alexa.append(s)
        seg_google.remove(s)
    elif 'siri' in s.lower():
        mix_siri.append(s)
        seg_google.remove(s)
    elif 'apple' in s.lower():
        mix_siri.append(s)
        seg_google.remove(s)
    elif 'cortana' in s.lower():
        mix_cortana.append(s)
        seg_google.remove(s)
    elif 'microsoft' in s.lower():
        mix_cortana.append(s)
        seg_google.remove(s)
    else:
        continue

In [None]:
mix_alexa = seg_alexa + mix_alexa
mix_siri = seg_siri + mix_siri
mix_cortana = seg_cortana + mix_cortana
mix_google = seg_google + mix_google

#### 3. Move sentences in Mix to corresponding corpus

In [None]:
for s in mix_alexa:
    if 'google' in s.lower():
        mix_alexa.remove(s)
    elif 'google-assistant' in s.lower():
        mix_alexa.remove(s)
    elif 'cortana' in s.lower():
        mix_alexa.remove(s)
    elif 'microsoft' in s.lower():
        mix_alexa.remove(s)
    elif 'siri' in s.lower():
        mix_alexa.remove(s)
    elif 'apple' in s.lower():
        mix_alexa.remove(s)
    else:
        pass
        
for s in mix_siri:
    if 'google' in s.lower():
        mix_siri.remove(s)
    elif 'google-assistant' in s.lower():
        mix_siri.remove(s)
    elif 'cortana' in s.lower():
        mix_siri.remove(s)
    elif 'microsoft' in s.lower():
        mix_siri.remove(s)
    elif 'alexa' in s.lower():
        mix_siri.remove(s)
    elif 'amazon' in s.lower():
        mix_siri.remove(s)
    elif 'echo' in s.lower():
        mix_siri.remove(s)
    else:
        pass

for s in mix_cortana:
    if 'google' in s.lower():
        mix_cortana.remove(s)
    elif 'google-assistant' in s.lower():
        mix_cortana.remove(s)
    elif 'siri' in s.lower():
        mix_cortana.remove(s)
    elif 'apple' in s.lower():
        mix_cortana.remove(s)
    elif 'alexa' in s.lower():
        mix_cortana.remove(s)
    elif 'amazon' in s.lower():
        mix_cortana.remove(s)
    elif 'echo' in s.lower():
        mix_cortana.remove(s)
    else:
        pass
        
for s in mix_google:
    if 'cortana' in s.lower():
        mix_google.remove(s)
    elif 'microsoft' in s.lower():
        mix_google.remove(s)
    elif 'siri' in s.lower():
        mix_google.remove(s)
    elif 'apple' in s.lower():
        mix_google.remove(s)
    elif 'alexa' in s.lower():
        mix_google.remove(s)
    elif 'amazon' in s.lower():
        mix_google.remove(s)
    elif 'echo' in s.lower():
        mix_google.remove(s)
    else:
        pass

In [None]:
print(len(seg_alexa),len(mix_alexa))
print(len(seg_siri),len(mix_siri))
print(len(seg_cortana),len(mix_cortana))
print(len(seg_google),len(mix_google))

'''
885 1232
891 1086
218 258
152 464
'''

In [None]:
text_alexa = ' '.join(mix_alexa)
text_siri = ' '.join(mix_siri)
text_cortana = ' '.join(mix_cortana)
text_google = ' '.join(mix_google)

write_to_text('clean_alexa',text_alexa)
write_to_text('clean_siri',text_siri)
write_to_text('clean_cortana',text_cortana)
write_to_text('clean_google',text_google)

In [3]:
alexa = read_txt('alexa')
siri = read_txt('siri')
cortana = read_txt('cortana')
google = read_txt('google')
mix = read_txt('mix')

In [6]:
seg_alexa = segment(alexa)
seg_siri = segment(siri) 
seg_cortana = segment(cortana) 
seg_google = segment(google) 
seg_mix = segment(mix)
seg_text = seg_alexa + seg_siri + seg_cortana + seg_google +seg_mix
print(len(seg_alexa),len(seg_siri),len(seg_cortana),len(seg_google),len(seg_mix),len(seg_text))

907 939 340 210 2674 5070
