In [281]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import datetime
import string
from string import digits
import collections
import scipy.stats as scs
import cc_pipeline as P
import time
import random
import pickle
from pprint import pprint
from collections import Counter

#sentiment and language
import nltk
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import vaderSentiment
from langdetect import detect
from gensim.models import Word2Vec
from gensim import corpora
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from spacy import displacy

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.decomposition import PCA, KernelPCA
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF, LatentDirichletAllocation
from gensim.models.ldamodel import LdaModel
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2
import knee_locator

#plotting
from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.models import HoverTool, BoxSelectTool
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image
import pyLDAvis.sklearn
import pyLDAvis.gensim as gensimvis
import pyLDAvis
import umap

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [355]:
def inline_text(show_raw):
    
    '''returns show text without timestamps'''
    
    temp = " ".join( ["\n".join( x.split("\n")[2:] ) for x in show_raw.split("\n\n")] )
    temp = temp.split('\n')
    temp = " ".join(temp)
    return temp

def clean_for_spacy(text_list):
    
    '''cleans all text and creates new column in dataframe'''
    
    doc_list = []
    for doc in text_list:
        doc_list.append(inline_text(doc))
    return doc_list

def clean_for_spacy_lower(text_list):
    
    '''cleans all text and creates new column in dataframe'''
    
    doc_list = []
    for doc in text_list:
        doc_list.append(inline_text(doc).lower())
    return doc_list

def sent_for_spacy(text_list):
    
    '''cleans all text and creates new column in dataframe'''
    
    doc_list = []
    for doc in text_list:
        cleaned = inline_text(doc)
        tok = sent_tokenize(cleaned)
        doc_list.append(tok)
    return doc_list

def lang_detect(doc_series):
    
    lang = []
    for x in doc_series:
        eng = 'en'
        span = 'es'

        try:
            if detect(x) == eng:
                lang.append(eng)
            else:
                lang.append(span)
        except:
            lang.append(None)
            
    return lang

def get_orgs(chunks):
    
    nlp = spacy.load('en_core_web_sm')
    orgs = []
    
    for chunk in chunks:
        document = nlp(chunk)
        labels = set([w.label_ for w in document.ents]) 

        for label in labels: 

            temp_entities = [e for e in document.ents if label==e.label_] 
            temp_entities = list(set(temp_entities)) 

            if label == 'ORG':
                orgs.append(str(temp_entities))
                
    orgs = " ".join(orgs)
    return orgs

def clean_text(doc):
    '''cleans and lemmatizes a string by removing punc, characters, digits, and len(words) < 3'''
    
    stop_words = stopwords.words('english')
    punct = ('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~♪¿’')
    remove_digits = str.maketrans('', '', digits)
    wordnet_lemmatizer = WordNetLemmatizer()
    lemmatized = []
    
    doc = doc.split('\n')
    doc = ' '.join(doc)
    doc = doc.split('-')
    doc = ' '.join(doc)
    doc = doc.split('...')
    doc = ' '.join(doc)
    doc = word_tokenize(doc)

    a = [char for char in doc if char not in punct]
    b = [w for w in a if w not in stop_words] 
    c = [w for w in b if len(w) > 3]
    d = [x for x in c if not (x.isdigit() or x[0] == '-' and x[1:].isdigit())]

    e = ' '.join(d)
    f = e.lower()
    g = f.translate(remove_digits)
    cleaned = str(g)
    doc = word_tokenize(cleaned)
    
    for val in doc:
        doc_temp = wordnet_lemmatizer.lemmatize(val)
        lemmatized.append(doc_temp)
    doc = ' '.join(lemmatized)
    
    return doc

def clean_and_return(docs_list):
    
    docs = []
    for cc in docs_list:
        cleaned_temp = clean_text(cc)
        docs.append(cleaned_temp)
        
    return docs

In [250]:
sent_df = pd.read_csv('data/cc_1000_text.csv', encoding='utf=8')

In [251]:
#english only (for testing)

doc_series = pd.Series(sent_df['text'].values)
language = lang_detect(doc_series)
sent_df['language'] = language
english = sent_df[sent_df['language'] == 'en']

In [375]:
# brand_text = clean_and_return(lines)
# removetable = str.maketrans('', '', "/.")
# brand_text = [s.translate(removetable) for s in brand_text]


In [253]:
temp_text = english['text'].values
sent_text = clean_for_spacy(temp_text)
sent_text_lower = clean_for_spacy_lower(temp_text)

In [310]:
sent_tok = sent_tokenize(sent_text_lower[1])
type(sent_tok)

list

In [374]:
# sent_text_lower[1]

In [293]:
score_text = clean_and_return(sent_text)

In [47]:
#corpus = " ".join(sent_text)

In [315]:
# #break into chunks for spaCy

# n = 100000
# chunks = [corpus[i:i+n] for i in range(0, len(corpus), n)]
# len(chunks)

In [314]:
# org_list = get_orgs(chunks)
# org_list

In [347]:
# with open("brand_list.txt") as f:
#     short_brands = f.read().replace('\n', '').lower()
# print(short_brands)

['accenture', 'adidas', 'adobe', 'agricultural bank of china', 'alibaba', 'amazon', 'american express', 'apple', 'at&t', 'baidu', 'bank of america', 'bank of china', 'bmw', 'budweiser', 'chase', 'china construction bank', 'china life', 'china mobile', 'cisco', 'citi', 'coca-cola', 'colgate', 'commonwealth bank of australia', 'costco', 'deutsche telekom', 'dhl', 'disney', 'ebay', 'exxonmobil', 'facebook', 'fedex', 'ford', 'gillette', 'google', 'gucci', 'hdfc bank', 'hermès', 'honda', 'hp', 'hsbc', 'huawei', 'ibm', 'icbc', 'ikea', 'instagram', 'intel', 'jp morgan', 'jd.com', 'kfc', "l'oréal paris", 'linkedin', 'louis vuitton', "lowe's", 'marlboro', 'mastercard', "mcdonald's", 'mercedes benz', 'microsoft', 'moutai', 'movistar', 'netflix', 'nike', 'oracle', 'pampers', 'paypal', 'pepsi', 'salesforce', 'samsung', 'shell', 'siemens', 'spectrum', 'starbucks', 'subway', 'tencent', 'the home depot', 'toyota', 'uber', 'us bank', 'verizon', 'visa', 'vodafone', 'walmart', 'wells fargo', 'xfinity', 

In [98]:
# brands = pd.read_csv('data/all_brands.csv', encoding='utf-8', header=None)
# brand_names = brands[0].values
# brand_names = list(brand_names)
# lines = (lines + str(brand_names))
# lines = lines.lower()

# #conver string of list into list
# import ast
# x = lines
# x = list(x.replace("'", '').replace('[', '').replace(']', '').split(', '))
# lines = x

# # with open ('data/brands.pkl', 'wb') as f:
# #     pickle.dump(lines, f)
    
# # with open ('data/brands.pkl', 'rb') as r:
# #     brands = pickle.load(r)

In [348]:
# y = short_brands
# y = list(y.replace("'", '').replace('[', '').replace(']', '').replace('\"','').replace('-', '').split(', '))
# short_brands = y
# with open ('data/short_brands_lower.pkl', 'wb') as f:
#     pickle.dump(short_brands, f)

In [269]:
#get sentiment if brand present in text

def get_sentiment_score(doc, brands):
    
    analyser = SentimentIntensityAnalyzer()
    
    temp_doc = doc.split()
    temp = (brand for brand in brands if brand in temp_doc)
    all_brands = []
    scores = []
    if any(temp):
        for brand in temp:
            all_brands.append(brand)
            score = list(dict.items(analyser.polarity_scores(doc)))
            scores.append(score)
    return (all_brands, scores)

In [286]:
test = get_sentiment_score(score_text[1], short_brands)
test

(['santa', 'christmas'],
 [[('neg', 0.048), ('neu', 0.622), ('pos', 0.331), ('compound', 0.9998)],
  [('neg', 0.048), ('neu', 0.622), ('pos', 0.331), ('compound', 0.9998)]])

In [270]:
test = get_sentiment_score(score_text[38], short_brands)
test

(['netflix', 'santa', 'trump', 'obama', 'chocolate', 'army'],
 [[('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)],
  [('neg', 0.085), ('neu', 0.673), ('pos', 0.241), ('compound', 0.9999)]])

In [261]:
# 0=brand,1=scores || 0=neg,1=neu,2=pos || 0=neg/pos, 1=score

test[1][0][1]

0.16

In [288]:
test2 = get_sentiment_score(sent_tok[1], short_brands)
test2

([], [])

In [275]:
sent_array = np.array([sent_text_lower[21]])

In [279]:
type(sent_array)

numpy.ndarray

In [401]:
# SENTENCE get sentiment if brand present in text

def get_sentiment_sentence(sent_tok, brands):
    
    analyser = SentimentIntensityAnalyzer()
    scores = []
    
    for brand in brands:
        for sent in sent_tok:
            if brand in sent:
                score = list(dict.items(analyser.polarity_scores(sent)))
                scores.append([brand, score])
          
    return scores

In [404]:
g = get_sentiment_sentence(low[5], short_brands)
g

[['Allstate', [('neg', 0.0), ('neu', 1.0), ('pos', 0.0), ('compound', 0.0)]],
 ['Allstate',
  [('neg', 0.167), ('neu', 0.833), ('pos', 0.0), ('compound', -0.1027)]],
 ['Medicare',
  [('neg', 0.0), ('neu', 0.543), ('pos', 0.457), ('compound', 0.6369)]],
 ['Medicare',
  [('neg', 0.0), ('neu', 0.787), ('pos', 0.213), ('compound', 0.6597)]],
 ['Medicare',
  [('neg', 0.101), ('neu', 0.504), ('pos', 0.396), ('compound', 0.6249)]]]

In [363]:
#use to create col of sent tokens in df

slic = temp_text[0:10]
low = sent_for_spacy(slic)


In [373]:
with open("brand_list.txt") as f:
    short_brands = f.read().replace('\n', '').lower()
print(short_brands)
y = short_brands
y = list(y.replace("'", '').replace('[', '').replace(']', '').replace('\"','').replace('-', '').split(', '))
short_brands = [x.capitalize() for x in y]
print(short_brands)
with open ('data/short_brands.pkl', 'wb') as f:
    pickle.dump(short_brands, f)

['accenture', 'adidas', 'adobe', 'agricultural bank of china', 'alibaba', 'amazon', 'american express', 'apple', 'at&t', 'baidu', 'bank of america', 'bank of china', 'bmw', 'budweiser', 'chase', 'china construction bank', 'china life', 'china mobile', 'cisco', 'citi', 'coca-cola', 'colgate', 'commonwealth bank of australia', 'costco', 'deutsche telekom', 'dhl', 'disney', 'ebay', 'exxonmobil', 'facebook', 'fedex', 'ford', 'gillette', 'google', 'gucci', 'hdfc bank', 'hermès', 'honda', 'hp', 'hsbc', 'huawei', 'ibm', 'icbc', 'ikea', 'instagram', 'intel', 'jp morgan', 'jd.com', 'kfc', "l'oréal paris", 'linkedin', 'louis vuitton', "lowe's", 'marlboro', 'mastercard', "mcdonald's", 'mercedes benz', 'microsoft', 'moutai', 'movistar', 'netflix', 'nike', 'oracle', 'pampers', 'paypal', 'pepsi', 'salesforce', 'samsung', 'shell', 'siemens', 'spectrum', 'starbucks', 'subway', 'tencent', 'the home depot', 'toyota', 'uber', 'us bank', 'verizon', 'visa', 'vodafone', 'walmart', 'wells fargo', 'xfinity', 