In [1]:
%matplotlib inline
import pickle
from pprint import pprint
import random
import warnings
import time

# numpy, pandas, matplotlib and regular expressions (data science essentials)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

# tqdm
from tqdm import tqdm

# spacy
import spacy
from spacy.lang.en import English
import en_core_web_sm

# gensim
import gensim
from gensim import corpora
from gensim.models import CoherenceModel

# nltk
import nltk
from nltk.corpus import wordnet as wn
from nltk.corpus import words
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
from nltk.stem import LancasterStemmer

# pyLDAvis
import pyLDAvis
import pyLDAvis.gensim

# styling
pd.set_option('display.max_columns',150)
plt.style.use('bmh')
from IPython.display import display

In [2]:
# Read the tweet dump and drop the 'Unnamed: 0' column in one chained step
# (presumably a stale index column from an earlier CSV export - confirm).
df = pd.read_csv("ceo_tweets_final.csv").drop(columns=['Unnamed: 0'])

In [3]:
# Convert the 'date' column to pandas datetimes so the .dt accessor can be used below.
df = df.assign(date=pd.to_datetime(df['date']))

In [4]:
# Keep tweets posted in 2017 or later. The explicit .copy() makes the result an
# independent frame, so the column assignments in later cells neither trigger
# SettingWithCopyWarning nor depend on pandas' view/copy heuristics.
df = df[df['date'].dt.year > 2016].copy()

In [5]:
# Keep only the calendar date of each timestamp (drops the time-of-day part).
df["date"] = df["date"].dt.date

In [6]:
# Silence all warnings from here on. Note that this is global: it also hides
# pandas and gensim warnings raised by later cells.
warnings.simplefilter('ignore')

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources this notebook relies on *before* first use:
#   punkt     -> word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer
# Previously only 'punkt' was downloaded, and only after stopwords had already
# been read, so a fresh environment failed with LookupError.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# NLTK stop words
stop_words = stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sahana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [108]:
## Stop-word list used by the tokenising cells below:
## NLTK's English stop words plus the Twitter artefacts 'rt' and 'amp'.
stop = stopwords.words('english') + ['rt','amp']

In [9]:
def get_mentions(tweet):
    '''
    Extract the distinct @-mentioned handles from a tweet.

    A mention is an "@" located at the start of the text or directly after a
    character that is not a letter, digit, "-", "_" or "." (so e-mail
    addresses such as "a@b.com" are skipped). The handle itself must start
    with a letter and be at least two characters long; the remaining
    characters may be letters, digits or underscores.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Handles without the leading "@", de-duplicated, in order of first
        appearance.
    '''
    # Raw string literal: the pattern contains a backslash-dot, which is an
    # invalid escape sequence in an ordinary string literal.
    pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_]+)"
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible between runs (list(set(...)) gave an arbitrary order).
    return list(dict.fromkeys(re.findall(pattern, tweet)))

In [10]:
def get_hashtags(tweet):
    '''
    Extract the distinct hashtags from a tweet.

    A hashtag is a "#" located at the start of the text or directly after a
    character that is not a letter, digit, "-", "_" or ".". The tag itself
    must start with a letter and be at least two characters long; the
    remaining characters may be letters, digits or underscores.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    list of str
        Tags without the leading "#", de-duplicated, in order of first
        appearance.
    '''
    # Raw string literal: the pattern contains a backslash-dot, which is an
    # invalid escape sequence in an ordinary string literal.
    pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))#([A-Za-z]+[A-Za-z0-9_]+)"
    # dict.fromkeys de-duplicates while keeping first-seen order, so the result
    # is reproducible between runs (list(set(...)) gave an arbitrary order).
    return list(dict.fromkeys(re.findall(pattern, tweet)))

In [8]:
def clean_tweet_split(tweet):
    '''
    Strip @mentions, #hashtags and links from a tweet and normalise whitespace.

    Each removed item is replaced by a space, after which runs of whitespace
    are collapsed to a single space and leading/trailing whitespace is
    dropped. Mentions and hashtags are recognised with the same patterns as
    get_mentions / get_hashtags; a link is any "scheme://" followed by
    non-whitespace characters.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet text without mentions, hashtags or links.
    '''
    # Raw string literals: these patterns contain backslash sequences that are
    # invalid escapes in ordinary string literals.
    mention_pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9_]+)"
    hashtag_pattern = r"(?<=^|(?<=[^a-zA-Z0-9-_\.]))#([A-Za-z]+[A-Za-z0-9_]+)"
    link_pattern = r"(\w+:\/\/\S+)"

    cleaned = tweet
    # Same order as before: mentions, then hashtags, then links, squeezing
    # whitespace after every pass.
    for pattern in (mention_pattern, hashtag_pattern, link_pattern):
        cleaned = ' '.join(re.sub(pattern, " ", cleaned).split())
    return cleaned

In [53]:
def remove_links(tweet):
    '''
    Remove links from a tweet and normalise whitespace.

    A link is any "scheme://" followed by non-whitespace characters. Each link
    is replaced by a space, then runs of whitespace are collapsed to a single
    space and leading/trailing whitespace is dropped. Mentions and hashtags
    are left in place.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The tweet text without links.
    '''
    # Raw string literal: the pattern contains backslash sequences that are
    # invalid escapes in an ordinary string literal.
    without_links = re.sub(r"(\w+:\/\/\S+)", " ", tweet)
    return ' '.join(without_links.split())

In [14]:
# One shared lemmatizer instance, created once and reused for every token.
lemma = nltk.wordnet.WordNetLemmatizer()


def lemmatize(text):
    """Return the WordNet lemma of a single token, using the lemmatizer's default settings."""
    lemmatized = lemma.lemmatize(text)
    return lemmatized

In [11]:
def bigrams(words, bi_min=15, tri_min=10):
    """
    Train a gensim phrase (bigram) detector on tokenised documents.

    Parameters
    ----------
    words : iterable of list of str
        Tokenised documents.
    bi_min : int
        Passed to gensim.models.Phrases as ``min_count``.
    tri_min : int
        Accepted but not used by this function.

    Returns
    -------
    The Phraser wrapper around the trained Phrases model.
    """
    phrase_model = gensim.models.Phrases(words, min_count=bi_min)
    return gensim.models.phrases.Phraser(phrase_model)

In [12]:
def get_corpus(df):
    """
    Build the bag-of-words corpus, dictionary and phrased documents for LDA.

    Reads ``df.tweet_tokens_lem``, merges detected bigrams, builds a gensim
    Dictionary, filters it with ``no_below=10`` and ``no_above=0.35``, and
    converts each phrased document to bag-of-words form.

    Returns
    -------
    tuple
        (bag-of-words corpus, Dictionary, list of phrased documents)
    """
    phraser = bigrams(df.tweet_tokens_lem)
    phrased_docs = [phraser[tokens] for tokens in df.tweet_tokens_lem]

    vocabulary = gensim.corpora.Dictionary(phrased_docs)
    vocabulary.filter_extremes(no_below=10, no_above=0.35)
    # Reassign ids so there are no gaps after filtering.
    vocabulary.compactify()

    bow_corpus = [vocabulary.doc2bow(doc) for doc in phrased_docs]
    return bow_corpus, vocabulary, phrased_docs

In [109]:
# Structured pieces extracted from the raw text.
df["mentions"] = df["tweet"].apply(get_mentions)
df["tags"] = df["tweet"].apply(get_hashtags)

# Text with mentions, hashtags and links stripped out.
df["tweet_clean"] = df["tweet"].apply(clean_tweet_split)

# Lower-case, replace punctuation with spaces, tokenise, then drop stop words.
df["tweet_tokens"] = df["tweet_clean"].apply(
    lambda text: [
        token
        for token in word_tokenize(re.sub(r'[^\w\s]',' ',text.lower()))
        if token not in stop
    ]
)

# Lemmatised version of the same tokens.
df["tweet_tokens_lem"] = df["tweet_tokens"].apply(
    lambda tokens: [lemmatize(token) for token in tokens]
)

In [110]:
## Tokens that keep mention/hashtag text: only links are stripped before
## tokenising. (Rare words are filtered later, per CEO, inside lda_analysis.)
df["tweet_new"] = df["tweet"].apply(
    lambda tweet: [
        token
        for token in word_tokenize(re.sub(r'[^\w\s]',' ',remove_links(tweet).lower()))
        if token not in stop
    ]
)

In [16]:
df.head()

Unnamed: 0,username,id,date,tweet,retweets,likes,mentions,tags,tweet_clean,tweet_tokens,tweet_tokens_lem
0,@tim_cook,1200060640469159939,2019-11-28,"On this #Thanksgiving, I am reflecting on the ...",546,4434,[],"[Thanksgiving, Dreamers]","On this , I am reflecting on the . As we enjoy...","[reflecting, enjoy, day, friends, family, feel...","[reflecting, enjoy, day, friend, family, feel,..."
1,@tim_cook,1200047686180835328,2019-11-28,Wishing everyone a #HappyThanksgiving filled w...,575,6097,[],[HappyThanksgiving],Wishing everyone a filled with joy &amp; happi...,"[wishing, everyone, filled, joy, amp, happines...","[wishing, everyone, filled, joy, amp, happines..."
2,@tim_cook,1199872990718169089,2019-11-28,Tomorrow the incredible @MNightShyamalan’s ser...,412,2607,"[MNightShyamalan, Servant]",[],Tomorrow the incredible ’s series premieres on...,"[tomorrow, incredible, series, premieres, appl...","[tomorrow, incredible, series, premiere, apple..."
3,@tim_cook,1199855397617704970,2019-11-28,Thanksgiving Day challenge! Close your rings w...,394,3932,[],[],Thanksgiving Day challenge! Close your rings w...,"[thanksgiving, day, challenge, close, rings, f...","[thanksgiving, day, challenge, close, ring, fa..."
4,@tim_cook,1199767313890922497,2019-11-27,As many of you travel to be with loved ones to...,1310,8436,[],[],As many of you travel to be with loved ones to...,"[many, travel, loved, ones, today, remember, p...","[many, travel, loved, one, today, remember, pr..."


## LDA Analysis

In [200]:
def lda_analysis(df, username, num_topics, workers=7, passes=50, chunksize=100,
                 random_state=11, model_path='lda_train.model'):
    """
    Fit an LDA topic model on one user's tweets and compute per-tweet topic weights.

    Steps: select the rows for ``username``; drop tokens that occur only once
    across that user's tweets; lemmatise; build the corpus with ``get_corpus``;
    train ``LdaMulticore``; save the model; print the ``c_v`` coherence; and
    collect a dense topic-probability vector for every tweet.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'username' and 'tweet_new' (list of tokens).
    username : str
        Value of the 'username' column to select.
    num_topics : int
        Number of LDA topics.
    workers, passes, chunksize, random_state :
        Forwarded to ``LdaMulticore``; defaults match the previously
        hard-coded values.
    model_path : str
        File the trained model is saved to. Every call with the same path
        overwrites the previous model.

    Returns
    -------
    tuple
        (per-user DataFrame with filtered 'tweet_new' and 'tweet_tokens_lem',
         ``lda_train.print_topics()``,
         list of per-tweet topic-probability lists of length ``num_topics``,
         ``num_topics``)

    Raises
    ------
    ValueError
        If no rows match ``username``.
    """
    import logging
    from itertools import chain

    # Explicit copy: columns are reassigned below, and writing into a
    # boolean-mask slice of ``df`` triggers chained-assignment warnings.
    df_ceo = df[df['username'] == username].copy()
    if df_ceo.empty:
        raise ValueError("no tweets found for username %r" % (username,))

    # Count tokens across all of this user's tweets. chain.from_iterable walks
    # the token lists lazily; Series.sum() on lists concatenates them
    # pairwise, which is quadratic in the number of tweets.
    freq_dist = nltk.FreqDist(chain.from_iterable(df_ceo['tweet_new']))

    # Keep tokens that appear more than once. A set gives O(1) membership
    # tests in the per-token filter below.
    relevant_words = {term for term, freq in freq_dist.items() if freq > 1}

    df_ceo["tweet_new"] = df_ceo["tweet_new"].apply(
        lambda tokens: [token for token in tokens if token in relevant_words])
    df_ceo["tweet_tokens_lem"] = df_ceo["tweet_new"].apply(
        lambda tokens: [lemmatize(token) for token in tokens])

    train_corpus, train_id2word, bigram_train = get_corpus(df_ceo)

    # basicConfig only takes effect the first time the root logger is
    # configured; repeated calls are no-ops.
    logging.basicConfig(filename='lda_model.log', format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        lda_train = gensim.models.ldamulticore.LdaMulticore(
                               corpus=train_corpus,
                               num_topics=num_topics,
                               id2word=train_id2word,
                               chunksize=chunksize,
                               workers=workers,
                               passes=passes,
                               eval_every = 1,
                               per_word_topics=True,
                               random_state=random_state)
        lda_train.save(model_path)

    coherence_model_lda = CoherenceModel(model=lda_train, texts=bigram_train, dictionary=train_id2word, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print("The coherence of the LDA model is",coherence_lda)

    train_vecs = []
    for bow in train_corpus:
        # Map topic id -> probability instead of indexing by position: gensim
        # may omit topics whose probability falls below its internal floor
        # even with minimum_probability=0.0, which would shift or shorten a
        # positional lookup.
        topic_probs = dict(lda_train.get_document_topics(bow, minimum_probability=0.0))
        train_vecs.append([topic_probs.get(topic_id, 0.0) for topic_id in range(num_topics)])

    return df_ceo, lda_train.print_topics(), train_vecs, num_topics

In [272]:
# def get_max_topics(values):
#     topics = []
#     if max(values) > 0.5:
#         topics.append(max(values))
#     elif len(list(set(values))) == 1:
#         topics = values
#     else:
#         topics = [num for num in values if num > 0.5/(num_topics-1)]   
    
#     return topics

## Tried the thresholded variant above; it gave results similar to the simpler function below.

def get_max_topics(values):
    """
    Pick the dominant topic weight(s) from one tweet's topic-weight list.

    If every entry in ``values`` is the same, ``values`` itself is returned
    (all topics count as dominant). Otherwise a one-element list holding the
    maximum is returned.
    """
    all_equal = len(set(values)) == 1
    if all_equal:
        return values
    return [max(values)]

def assign_topics(col1, col2):
    """Return 1 when the value ``col1`` is contained in the collection ``col2``, otherwise 0."""
    return 1 if col1 in col2 else 0

### Tim Cook

In [273]:
# Fit an 8-topic LDA model on the rows whose username is '@tim_cook'.
# Note: df_ceo, lda_results and train_vecs are reused (overwritten) by the
# Bill Gates section further down.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@tim_cook', 8)

The coherence of the LDA model is 0.3803925329617725


In [274]:
lda_results

[(0,
  '0.103*"year" + 0.076*"see" + 0.072*"developer" + 0.058*"world" + 0.054*"time" + 0.045*"week" + 0.038*"ago" + 0.036*"app" + 0.035*"powerful" + 0.034*"today"'),
 (1,
  '0.156*"heart" + 0.085*"community" + 0.079*"family" + 0.078*"one" + 0.046*"today" + 0.046*"victim" + 0.046*"affected" + 0.043*"pro" + 0.042*"violence" + 0.041*"ipad"'),
 (2,
  '0.180*"u" + 0.060*"make" + 0.059*"every" + 0.053*"celebrate" + 0.050*"life" + 0.049*"congratulation" + 0.044*"let" + 0.040*"day" + 0.037*"today" + 0.036*"people"'),
 (3,
  '0.170*"thank" + 0.148*"work" + 0.116*"proud" + 0.090*"team" + 0.072*"great" + 0.050*"friend" + 0.041*"visit" + 0.038*"back" + 0.037*"th" + 0.034*"help"'),
 (4,
  '0.103*"woman" + 0.080*"apple" + 0.075*"never" + 0.065*"story" + 0.064*"country" + 0.062*"right" + 0.058*"men" + 0.058*"enjoy" + 0.057*"like" + 0.054*"place"'),
 (5,
  '0.208*"thanks" + 0.123*"iphone" + 0.102*"new" + 0.076*"love" + 0.058*"student" + 0.047*"forward" + 0.043*"thing" + 0.041*"shotoniphone" + 0.033*"

In [275]:
# Hand-assigned labels for topics 0..7, in topic-id order.
topic_labels = ['Technology','Social','People','Appreciation','Women','Product','Store Launch','Emotion']

# One row per tweet, one column per topic.
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = topic_labels

In [276]:
# Both frames get a fresh 0..n-1 index so the rows line up side by side.
tweets_part = df_ceo.reset_index(drop=True)
weights_part = train_vec_df.reset_index(drop=True)
df_tim_cook = pd.concat([tweets_part, weights_part], axis=1)

In [278]:
topic_columns = ['Technology','Social','People','Appreciation','Women','Product','Store Launch','Emotion']

# Per-tweet list of all topic weights, and the dominant weight(s) picked from it.
df_tim_cook['all_topics'] = df_tim_cook[topic_columns].values.tolist()
df_tim_cook['max_topics'] = df_tim_cook['all_topics'].apply(get_max_topics)

In [279]:
# Replace each topic weight with a 0/1 flag: 1 when that weight is among the
# tweet's dominant weight(s) in 'max_topics', else 0.
for topic in ['Technology','Social','People','Appreciation','Women','Product','Store Launch','Emotion']:
    df_tim_cook[topic] = df_tim_cook.apply(
        lambda row: assign_topics(row[topic], row['max_topics']), axis=1
    )

In [280]:
# Column-wise totals of the topic columns. After the previous cell these hold
# 0/1 flags, so each total is the number of tweets flagged for that topic.
topic_columns = ['Technology','Social','People','Appreciation','Women','Product','Store Launch','Emotion']
average_topic_weights = df_tim_cook[topic_columns].sum(axis=0)

In [281]:
# Wrap the per-topic totals in a DataFrame; the pie-chart cell reads the
# labels from its index and the values from ceo_topics[0].
ceo_topics = pd.DataFrame(average_topic_weights)

In [256]:
import os

import chart_studio.plotly as py

# Credentials are read from the environment so the API key never lives in the
# notebook. Set PLOTLY_USERNAME and PLOTLY_API_KEY before starting the kernel;
# a missing variable raises KeyError here instead of failing later at upload.
# NOTE(review): the key that used to be hard-coded in this cell has been
# exposed and should be revoked/regenerated.
py.plotly.tools.set_credentials_file(
    username=os.environ['PLOTLY_USERNAME'],
    api_key=os.environ['PLOTLY_API_KEY'],
)

In [282]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *

colors = Spectral_8.hex_colors

# White 2px outline between slices, Spectral palette for the fills.
pie_marker = dict(colors=colors, line=dict(color='#FFF', width=2))

topics_pie = go.Pie(
    labels=ceo_topics.index,
    values=ceo_topics[0],
    marker=pie_marker,
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Tim Cook',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): the Bill Gates chart below uploads under the same filename -
# confirm whether the two hosted charts are meant to share one file.
py.iplot(fig, filename='basic_pie_chart')


In [310]:
# Expand each tweet's tag list into columns, then stack into one long Series
# with one row per (tweet, tag) occurrence.
hashtags = df_tim_cook['tags'].apply(pd.Series).stack()

hashtags_df = hashtags.to_frame(name='hashtags')

In [311]:
# One row per distinct hashtag: the tag in column 'index', its number of
# occurrences in column 'hashtags'. The axis and value names are set
# explicitly because the default names produced by
# value_counts().reset_index() changed in pandas 2.0; the next cell relies on
# the 'index' column.
# Note: this cell overwrites hashtags_df, so re-running it without first
# re-running the previous cell counts the counts.
hashtags_df = (
    hashtags_df['hashtags']
    .value_counts()
    .rename_axis('index')
    .reset_index(name='hashtags')
)

In [321]:
# Substrings (matched case-insensitively) that map a hashtag to a group label.
holiday_keywords = ["easter","day","diwali","july","month","year","thanksgiving","week"]
# NOTE(review): "potrait" looks like a typo for "portrait" - confirm against the data.
apple_keywords = ["apple","airpod","iphone","ipad","potrait"]


def _group_label(tag, keywords, label):
    """Return ``label`` if any keyword occurs in the lower-cased tag, else the tag unchanged."""
    lowered = tag.lower()
    return label if any(keyword in lowered for keyword in keywords) else tag


# Holiday grouping is applied first, then the Apple grouping on the result.
hashtags_df['index'] = hashtags_df['index'].apply(
    lambda tag: _group_label(tag, holiday_keywords, "Holiday Celebration"))
hashtags_df['index'] = hashtags_df['index'].apply(
    lambda tag: _group_label(tag, apple_keywords, "Apple"))

In [325]:
# Number of rows of hashtags_df per group label.
# NOTE(review): hashtags_df has one row per *distinct* hashtag at this point,
# so this counts distinct tags per group and ignores how often each tag was
# used - confirm that is the intended measure.
label_counts = hashtags_df["index"].value_counts()
hashtag_df = pd.DataFrame(label_counts).reset_index()
hashtag_df.columns = ["hashtags", "count"]

In [337]:
# Share of the 'Holiday Celebration' group in the total of the 'count' column.
holiday_rows = hashtag_df[hashtag_df["hashtags"] == 'Holiday Celebration']
holiday_share = ((holiday_rows["count"]/hashtag_df["count"].sum())*100).values[0]
print(holiday_share,"% of Tim Cook's hashtags are about general wishes on holidays")

38.613861386138616 % of Tim Cook's hashtags are about general wishes on holidays


In [338]:
# Share of the 'Apple' group in the total of the 'count' column.
apple_rows = hashtag_df[hashtag_df["hashtags"] == 'Apple']
apple_share = ((apple_rows["count"]/hashtag_df["count"].sum())*100).values[0]
print(apple_share,"% of Tim Cook's hashtags are about apple products")

11.881188118811881 % of Tim Cook's hashtags are about apple products


### Bill Gates

In [283]:
# Fit a 6-topic LDA model on the rows whose username is '@BillGates'.
# This overwrites df_ceo, lda_results, train_vecs and num_topics from the
# Tim Cook section.
df_ceo, lda_results, train_vecs, num_topics = lda_analysis(df, '@BillGates', 6)

The coherence of the LDA model is 0.39186093801872235


In [284]:
lda_results

[(0,
  '0.150*"one" + 0.072*"year" + 0.071*"book" + 0.044*"lot" + 0.027*"recently" + 0.027*"favorite" + 0.027*"read" + 0.023*"last" + 0.021*"never" + 0.020*"optimistic"'),
 (1,
  '0.061*"need" + 0.046*"vaccine" + 0.040*"alzheimer" + 0.039*"excited" + 0.030*"look" + 0.028*"government" + 0.028*"new" + 0.028*"ever" + 0.028*"disease" + 0.027*"world"'),
 (2,
  '0.058*"life" + 0.054*"great" + 0.045*"work" + 0.044*"melinda" + 0.037*"student" + 0.034*"best" + 0.034*"new" + 0.033*"could" + 0.033*"aid" + 0.031*"learn"'),
 (3,
  '0.049*"like" + 0.040*"melindagates" + 0.035*"warrenbuffett" + 0.035*"future" + 0.034*"time" + 0.032*"know" + 0.032*"story" + 0.032*"think" + 0.029*"new" + 0.028*"hope"'),
 (4,
  '0.092*"world" + 0.068*"progress" + 0.056*"health" + 0.036*"see" + 0.035*"u" + 0.033*"global" + 0.032*"making" + 0.031*"incredible" + 0.026*"always" + 0.022*"child"'),
 (5,
  '0.063*"people" + 0.047*"world" + 0.046*"energy" + 0.045*"today" + 0.044*"help" + 0.035*"get" + 0.033*"thing" + 0.027*"pov

In [285]:
# Hand-assigned labels for topics 0..5, in topic-id order.
topic_labels = ['Book Recommendations','Diseases/Vaccines','Education','Warren Buffet','General World Issues','Renewable Energy']

# One row per tweet, one column per topic.
train_vec_df = pd.DataFrame(train_vecs)
train_vec_df.columns = topic_labels

In [286]:
topic_columns = ['Book Recommendations','Diseases/Vaccines','Education','Warren Buffet','General World Issues','Renewable Energy']

# Tweets and their topic weights side by side, on a fresh 0..n-1 index.
df_bill_gates = pd.concat(
    [df_ceo.reset_index(drop=True), train_vec_df.reset_index(drop=True)],
    axis=1,
)

# Per-tweet list of all topic weights, and the dominant weight(s) picked from it.
df_bill_gates['all_topics'] = df_bill_gates[topic_columns].values.tolist()
df_bill_gates['max_topics'] = df_bill_gates['all_topics'].apply(get_max_topics)

# Replace each topic weight with a 0/1 flag: 1 when that weight is among the
# tweet's dominant weight(s), else 0.
for topic in topic_columns:
    df_bill_gates[topic] = df_bill_gates.apply(
        lambda row: assign_topics(row[topic], row['max_topics']), axis=1
    )

In [287]:
# Column-wise totals of the 0/1 topic flags, i.e. the number of tweets flagged
# for each topic, wrapped in a DataFrame for the pie chart (read as ceo_topics[0]).
topic_columns = ['Book Recommendations','Diseases/Vaccines','Education','Warren Buffet','General World Issues','Renewable Energy']
average_topic_weights = df_bill_gates[topic_columns].sum(axis=0)
ceo_topics = pd.DataFrame(average_topic_weights)

In [288]:
import plotly.graph_objs as go
from palettable.colorbrewer.diverging import *

colors = Spectral_8.hex_colors

# White 2px outline between slices, Spectral palette for the fills.
pie_marker = dict(colors=colors, line=dict(color='#FFF', width=2))

topics_pie = go.Pie(
    labels=ceo_topics.index,
    values=ceo_topics[0],
    marker=pie_marker,
    domain={'x': [0.0, .4], 'y': [0.0, 1]},
    showlegend=False,
    textinfo='label+percent',
)

layout = go.Layout(
    height=600,
    width=1000,
    autosize=False,
    title='Topic Distribution for Bill Gates',
)
fig = go.Figure(data=topics_pie, layout=layout)

# NOTE(review): the Tim Cook chart above uploads under the same filename -
# confirm whether the two hosted charts are meant to share one file.
py.iplot(fig, filename='basic_pie_chart')

In [340]:
# Expand each tweet's tag list into columns, then stack into one long Series
# with one row per (tweet, tag) occurrence.
hashtags = df_bill_gates['tags'].apply(pd.Series).stack()

hashtags_df = hashtags.to_frame(name='hashtags')

In [341]:
# Occurrences of each hashtag, most frequent first, shown as a table.
hashtags_df['hashtags'].value_counts().to_frame()

Unnamed: 0,hashtags
Goalkeepers18,8
MosquitoWeek,4
MomentofLift,4
VR,4
Alzheimers,3
dataviz,3
IWD2019,2
malaria,2
WorldMalariaDay,2
TBT,2
