In [408]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import datetime
import string
from string import digits
import collections
import scipy.stats as scs
import cc_pipeline as P
import time
import random
import pickle
from pprint import pprint
from collections import Counter

#sentiment and language
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import vaderSentiment
from langdetect import detect
from gensim.models import Word2Vec
from gensim import corpora
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from spacy import displacy

#machine learning
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models.ldamodel import LdaModel
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2
import knee_locator

#plotting
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import pyLDAvis.sklearn
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import umap

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [355]:
def inline_text(show_raw):
    
    '''returns show text without timestamps'''
    
    temp = " ".join( ["\n".join( x.split("\n")[2:] ) for x in show_raw.split("\n\n")] )
    temp = temp.split('\n')
    temp = " ".join(temp)
    return temp

def clean_for_spacy(text_list):
    
    '''cleans all text and creates new column in dataframe'''
    
    doc_list = []
    for doc in text_list:
        doc_list.append(inline_text(doc))
    return doc_list

def clean_for_spacy_lower(text_list):
    
    '''cleans all text and creates new column in dataframe'''
    
    doc_list = []
    for doc in text_list:
        doc_list.append(inline_text(doc).lower())
    return doc_list

def sent_for_spacy(text_list):
    
    '''cleans all text and creates new column in dataframe'''
    
    doc_list = []
    for doc in text_list:
        cleaned = inline_text(doc)
        tok = sent_tokenize(cleaned)
        doc_list.append(tok)
    return doc_list

def lang_detect(doc_series):
    
    lang = []
    for x in doc_series:
        eng = 'en'
        span = 'es'

        try:
            if detect(x) == eng:
                lang.append(eng)
            else:
                lang.append(span)
        except:
            lang.append(None)
            
    return lang

def get_orgs(chunks):
    
    nlp = spacy.load('en_core_web_sm')
    orgs = []
    
    for chunk in chunks:
        document = nlp(chunk)
        labels = set([w.label_ for w in document.ents]) 

        for label in labels: 

            temp_entities = [e for e in document.ents if label==e.label_] 
            temp_entities = list(set(temp_entities)) 

            if label == 'ORG':
                orgs.append(str(temp_entities))
                
    orgs = " ".join(orgs)
    return orgs

def clean_text(doc):
    '''cleans and lemmatizes a string by removing punc, characters, digits, and len(words) < 3'''
    
    stop_words = stopwords.words('english')
    punct = ('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~♪¿’')
    remove_digits = str.maketrans('', '', digits)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = []
    
    doc = doc.split('\n')
    doc = ' '.join(doc)
    doc = doc.split('-')
    doc = ' '.join(doc)
    doc = doc.split('...')
    doc = ' '.join(doc)
    doc = word_tokenize(doc)

    a = [char for char in doc if char not in punct]
    b = [w for w in a if w not in stop_words] 
    c = [w for w in b if len(w) > 3]
    d = [x for x in c if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]

    e = ' '.join(d)
    f = e.lower()
    g = f.translate(remove_digits)
    cleaned = str(g)
    doc = word_tokenize(cleaned)
    
    for val in doc:
        doc_temp = wordnet_lemmatizer.lemmatize(val)
        lemmatized.append(doc_temp)
    doc = ' '.join(lemmatized)
    
    return doc

def clean_and_return(docs_list):
    
    docs = []
    for cc in docs_list:
        cleaned_temp = clean_text(cc)
        docs.append(cleaned_temp)
        
    return docs

def get_sentiment_score(doc, brands):
    
    analyser = SentimentIntensityAnalyzer()
    
    temp_doc = doc.split()
    temp = (brand for brand in brands if brand in temp_doc)
    all_brands = []
    scores = []
    if any(temp):
        for brand in temp:
            all_brands.append(brand)
            score = list(dict.items(analyser.polarity_scores(doc)))
            scores.append(score)
    return (all_brands, scores)

def get_sentiment_sentence(sent_tok, brands):
    
    analyser = SentimentIntensityAnalyzer()
    scores = []
    
    for brand in brands:
        for sent in sent_tok:
            if brand in sent:
                score = list(dict.items(analyser.polarity_scores(sent)))
                scores.append([brand, score])
          
    return scores

In [250]:
sent_df = pd.read_csv('data/cc_1000_text.csv', encoding='utf=8')

In [251]:
#english only (for testing)

doc_series = pd.Series(sent_df['text'].values)
language = lang_detect(doc_series)
sent_df['language'] = language
english = sent_df[sent_df['language'] == 'en']

In [375]:
# brand_text = clean_and_return(lines)
# removetable = str.maketrans('', '', "/.")
# brand_text = [s.translate(removetable) for s in brand_text]


In [253]:
temp_text = english['text'].values
sent_text = clean_for_spacy(temp_text)
sent_text_lower = clean_for_spacy_lower(temp_text)

In [310]:
sent_tok = sent_tokenize(sent_text_lower[1])
type(sent_tok)

list

In [374]:
# sent_text_lower[1]

In [293]:
score_text = clean_and_return(sent_text)

In [47]:
#corpus = " ".join(sent_text)

In [315]:
# #break into chunks for spaCy

# n = 100000
# chunks = [corpus[i:i+n] for i in range(0, len(corpus), n)]
# len(chunks)

In [314]:
# org_list = get_orgs(chunks)
# org_list

In [98]:
# brands = pd.read_csv('data/all_brands.csv', encoding='utf-8', header=None)
# brand_names = brands[0].values
# brand_names = list(brand_names)
# lines = (lines + str(brand_names))
# lines = lines.lower()

# #conver string of list into list
# import ast
# x = lines
# x = list(x.replace("'", '').replace('[', '').replace(']', '').split(', '))
# lines = x

# # with open ('data/brands.pkl', 'wb') as f:
# #     pickle.dump(lines, f)
    
# # with open ('data/brands.pkl', 'rb') as r:
# #     brands = pickle.load(r)

In [348]:
# y = short_brands
# y = list(y.replace("'", '').replace('[', '').replace(']', '').replace('\"','').replace('-', '').split(', '))
# short_brands = y
# with open ('data/short_brands_lower.pkl', 'wb') as f:
#     pickle.dump(short_brands, f)

In [269]:
#get sentiment if brand present in text



In [286]:
test = get_sentiment_score(score_text[1], short_brands)
test

(['santa', 'christmas'],
 [[('neg', 0.048), ('neu', 0.622), ('pos', 0.331), ('compound', 0.9998)],
  [('neg', 0.048), ('neu', 0.622), ('pos', 0.331), ('compound', 0.9998)]])

In [270]:
test = get_sentiment_score(score_text[38], short_brands)
test

(['netflix', 'santa', 'trump', 'obama', 'chocolate', 'army'],
 [[('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)]])

In [261]:
# 0=brand,1=scores || 0=neg,1=neu,2=pos || 0=neg/pos, 1=score

test[1][0][1]

0.16

In [407]:
test2 = get_sentiment_score(sent_tok[3], short_brands)
test2

[]

In [275]:
sent_array = np.array([sent_text_lower[21]])

In [404]:
g = get_sentiment_sentence(low[5], short_brands)
g

[['Allstate', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Allstate',
  [('neg', 0.167), ('neu', 0.833), ('pos', 0.0), ('compound', -0.1027)]],
 ['Medicare',
  [('neg', 0.0), ('neu', 0.543), ('pos', 0.457), ('compound', 0.6369)]],
 ['Medicare',
  [('neg', 0.0), ('neu', 0.787), ('pos', 0.213), ('compound', 0.6597)]],
 ['Medicare',
  [('neg', 0.101), ('neu', 0.504), ('pos', 0.396), ('compound', 0.6249)]]]

In [363]:
#use to create col of sent tokens in df

slic = temp_text[0:10]
low = sent_for_spacy(slic)


In [405]:
with open("brand_list.txt") as f:
    short_brands = f.read().replace('\n', '').lower()
    
y = short_brands
#use replace instead of str.maketrans for special characters
y = list(y.replace("'", '').replace('[', '').replace(']', '').replace('\"','').replace('-', '').split(', '))
short_brands = [x.capitalize() for x in y]
print(short_brands)

with open ('data/short_brands.pkl', 'wb') as f:
    pickle.dump(short_brands, f)

['Accenture', 'Adidas', 'Adobe', 'Agricultural bank of china', 'Alibaba', 'Amazon', 'American express', 'Apple', 'At&t', 'Baidu', 'Bank of america', 'Bank of china', 'Bmw', 'Budweiser', 'Chase', 'China construction bank', 'China life', 'China mobile', 'Cisco', 'Citi', 'Cocacola', 'Colgate', 'Commonwealth bank of australia', 'Costco', 'Deutsche telekom', 'Dhl', 'Disney', 'Ebay', 'Exxonmobil', 'Facebook', 'Fedex', 'Ford', 'Gillette', 'Google', 'Gucci', 'Hdfc bank', 'Hermès', 'Honda', 'Hp', 'Hsbc', 'Huawei', 'Ibm', 'Icbc', 'Ikea', 'Instagram', 'Intel', 'Jp morgan', 'Jd.com', 'Kfc', 'Loréal paris', 'Linkedin', 'Louis vuitton', 'Lowes', 'Marlboro', 'Mastercard', 'Mcdonalds', 'Mercedes benz', 'Microsoft', 'Moutai', 'Movistar', 'Netflix', 'Nike', 'Oracle', 'Pampers', 'Paypal', 'Pepsi', 'Salesforce', 'Samsung', 'Shell', 'Siemens', 'Spectrum', 'Starbucks', 'Subway', 'Tencent', 'The home depot', 'Toyota', 'Uber', 'Us bank', 'Verizon', 'Visa', 'Vodafone', 'Walmart', 'Wells fargo', 'Xfinity', 'You

In [409]:
with open ('data/sentiments.pkl', 'rb') as r:
     sentiments = pickle.load(r)

In [445]:
type(sentiments)

list

In [444]:
sentiments[0:30]

[['Farmers', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Farmers', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Allstate', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Allstate',
  [('neg', 0.167), ('neu', 0.833), ('pos', 0.0), ('compound', -0.1027)]],
 ['Netflix', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Xfinity', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Xfinity',
  [('neg', 0.0), ('neu', 0.882), ('pos', 0.118), ('compound', 0.4939)]],
 ['Nintendo',
  [('neg', 0.103), ('neu', 0.653), ('pos', 0.243), ('compound', 0.497)]],
 ['Nintendo', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Allstate', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Allstate',
  [('neg', 0.167), ('neu', 0.833), ('pos', 0.0), ('compound', -0.1027)]],
 ['Medicare',
  [('neg', 0.0), ('neu', 0.543), ('pos', 0.457), ('compound', 0.6369)]],
 ['Medicare',
  [('ne

In [440]:
#most polar brands
negative = (sentiments[0], sentiments[0][1][0][1])
negative

(['Farmers', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 0.0)

In [435]:
test = pd.Series(sentiments)


In [447]:
def max_value(sentiments):
    return max([sublist[-1] for sublist in sentiments])

print(max_value(sentiments))

[('neg', 0.831), ('neu', 0.169), ('pos', 0.0), ('compound', -0.5994)]


In [432]:
most_neg = max(test)
most_neg

['Zara', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]]

In [439]:
neg = max(flat_test, key=lambda x: x[1])
neg

TypeError: '>' not supported between instances of 'tuple' and 'str'

In [434]:
pos = max(test, key=lambda x: x[3])
pos

IndexError: list index out of range

In [462]:
df = pd.DataFrame(sentiments, )
df.head(30)

Unnamed: 0,0,1
0,Farmers,"[(neg, 0.0), (neu, 1.0), (pos, 0.0), (compound..."
1,Farmers,"[(neg, 0.0), (neu, 1.0), (pos, 0.0), (compound..."
2,Allstate,"[(neg, 0.0), (neu, 1.0), (pos, 0.0), (compound..."
3,Allstate,"[(neg, 0.167), (neu, 0.833), (pos, 0.0), (comp..."
4,Netflix,"[(neg, 0.0), (neu, 1.0), (pos, 0.0), (compound..."
5,Xfinity,"[(neg, 0.0), (neu, 1.0), (pos, 0.0), (compound..."
6,Xfinity,"[(neg, 0.0), (neu, 0.882), (pos, 0.118), (comp..."
7,Nintendo,"[(neg, 0.103), (neu, 0.653), (pos, 0.243), (co..."
8,Nintendo,"[(neg, 0.0), (neu, 1.0), (pos, 0.0), (compound..."
9,Allstate,"[(neg, 0.0), (neu, 1.0), (pos, 0.0), (compound..."


In [459]:
df[[,]] = pd.DataFrame(df[1].values.tolist(), index=df.index)
df

SyntaxError: invalid syntax (<ipython-input-459-f5dd7548d957>, line 1)

In [464]:
sentiment_array = [(sent[0], sent[1][0][1], sent[1][2][1]) for sent in sentiments]

In [466]:
sent_df = pd.DataFrame(sentiment_array, columns=['label', 'neg','pos'])

In [471]:
positive = sent_df.sort_values('pos', ascending=False)


Unnamed: 0,label,neg,pos
23714,Amazon,0.0,1.0
39181,Amazon,0.0,1.0
16487,Amazon,0.0,1.0
6792,Amazon,0.0,1.0
45957,Amazon,0.0,1.0
31958,Amazon,0.0,1.0
45948,Amazon,0.0,1.0
23710,Amazon,0.0,1.0
6795,Amazon,0.0,1.0
39185,Amazon,0.0,1.0


In [480]:

negative = sent_df.sort_values('neg', ascending=False)
negative

Unnamed: 0,label,neg,pos
74247,Lincoln,0.831,0.000
33392,Kia,0.808,0.000
32382,Kia,0.808,0.000
19973,Kia,0.808,0.000
35444,Kia,0.808,0.000
39907,Kia,0.808,0.000
17412,Kia,0.808,0.000
32883,Kia,0.808,0.000
16448,Kia,0.808,0.000
34818,Kia,0.808,0.000


In [481]:
most_negative = negative['label'].unique()
most_negative

array(['Lincoln', 'Kia', 'Twitter', 'Kanye', 'Apple', 'Kavanaugh', 'Ford',
       'Army', 'Intel', 'Fox', 'Social media', 'Christmas', 'Lowes',
       'Facebook', 'Shell', 'Oracle', 'Adobe', 'Medicare', 'Disney',
       'Spectrum', 'Miller', 'Starbucks', 'Nike', 'Trump', 'Toyota',
       'Navy', 'Santa', 'Uber', 'Discover', 'Amazon', 'Xfinity', 'Dodge',
       'Citi', 'Costco', 'Instagram', 'Chase', 'Allstate', 'Chocolate',
       'Enbrel', 'Netflix', 'Google', 'Mexico', 'Sprint', 'Tresiba',
       'Chevy', 'Microsoft', 'Alexa', 'Liberty mutual', 'Pampers',
       'Applebee', 'Subway', 'Tide', 'Honda', 'Chipotle', 'Marlboro',
       'Downy', 'Stanford', 'Pepsi', 'Samsung', 'Ikea', 'Adidas', 'Gucci',
       'Hyundai', 'Verizon', 'Obama', 'Walmart', 'Nintendo', 'Visa',
       'Nyquil', 'Prudential', 'Nasa', 'Fedex', 'Budweiser', 'Target',
       'Gillette', 'Pillsbury', 'Centrum', 'Vaseline', 'Cisco', 'Paypal',
       'Youtube', 'Geico', 'Colgate', 'Farmers', 'Zara', 'Iphone',
       'He

In [482]:
most_positive = positive['label'].unique()
most_positive

array(['Amazon', 'Lincoln', 'Chipotle', 'Facebook', 'Christmas',
       'Spectrum', 'Chase', 'Shell', 'Medicare', 'Ikea', 'Xfinity',
       'Mexico', 'Disney', 'Ford', 'Navy', 'Intel', 'Alexa', 'Centrum',
       'Dodge', 'Chevy', 'Citi', 'Santa', 'Trump', 'Google', 'Tide',
       'Downy', 'Stanford', 'Fox', 'Pampers', 'Apple', 'Discover',
       'Miller', 'Honda', 'Army', 'Chocolate', 'Kia', 'Adidas', 'Target',
       'Allstate', 'Nintendo', 'Kanye', 'Prudential', 'Vaseline',
       'Starbucks', 'Liberty mutual', 'Budweiser', 'Nasa', 'Hyundai',
       'Twitter', 'Zara', 'Kavanaugh', 'Uber', 'Tresiba', 'Instagram',
       'Costco', 'Walmart', 'Farmers', 'Toyota', 'Pillsbury', 'Gucci',
       'Sprint', 'Visa', 'Verizon', 'Iphone', 'Adobe', 'Samsung',
       'Netflix', 'Nike', 'Obama', 'Cisco', 'Marlboro', 'Geico', 'Subway',
       'Gillette', 'Youtube', 'Microsoft', 'Colgate', 'Applebee',
       'Social media', 'Fedex', 'Pepsi', 'Oracle', 'Lowes', 'Nyquil',
       'Paypal', 'Enbrel', 'Hu