# Twitter data mining

### References

https://marcobonzanini.com/2015/03/02/mining-twitter-data-with-python-part-1/  
https://towardsdatascience.com/mining-twitter-data-ba4e44e6aecc  
https://towardsdatascience.com/@rickykim78  
https://towardsdatascience.com/tweepy-for-beginners-24baf21f2c25  
https://amueller.github.io/word_cloud/auto_examples  


In [None]:
# import sys
# !{sys.executable} -m pip install tweepy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tweepy
import json 
import datetime
import time
import seaborn as sns
import wordcloud

### Login to Twitter API

In [None]:
from auth_ap import *

In [None]:
# Creating the authentication object.
# `from auth_ap import *` binds the credential names directly; it does not bind
# the module name `auth_ap`, so the values are referenced without a prefix.
# NOTE(review): assumes auth_ap exports consumer_key, consumer_secret,
# access_token and access_token_secret -- confirm against auth_ap.py.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# Setting your access token and secret
auth.set_access_token(access_token, access_token_secret)
# Creating the API object while passing in auth information
api = tweepy.API(auth)

### Explore API

In [None]:
# Fetch the authenticated account's home timeline and keep the result in public_tweets
public_tweets = api.home_timeline()
# Iterate over every status returned by the call.
# The loop variable `tweet` stays bound to the last status after the loop ends.
for tweet in public_tweets:
   # print the text attribute of the status object
   print (tweet.text)

What data are available from tweets?

In [None]:
# Inspect whichever status `tweet` is currently bound to
status = tweet
# Serialise the raw payload stored on the status object to a JSON string
json_str = json.dumps(status._json)

# deserialise the string back into a Python object
parsed = json.loads(json_str)

# Pretty-print the payload with sorted keys to see which fields are present
print(json.dumps(parsed, indent=4, sort_keys=True))

In [None]:
# Show the `place` attribute of the status currently bound to `tweet`
tweet.place

### Miner

In [None]:
class TweetMiner(object):
    """Pull tweets through tweepy and flatten each status into a plain dict.

    The miner authenticates once in ``__init__`` and then pages backwards in
    time through a user timeline (``mine_tweets_user``) or a keyword search
    (``mine_tweets_keyword``), requesting ``result_limit`` statuses per page.
    """
    import auth_ap
    # number of tweets per one pull; there is limit on this
    result_limit    =   20
    # not used by the methods below; kept so existing attribute access still works
    data            =   []
    api             =   False

    twitter_keys = {
        'consumer_key':        auth_ap.consumer_key,
        'consumer_secret':     auth_ap.consumer_secret,
        'access_token_key':    auth_ap.access_token,
        'access_token_secret': auth_ap.access_token_secret
    }


    def __init__(self, keys_dict=twitter_keys, api=api, result_limit = 20):
        """Authenticate against Twitter and remember the page size.

        Args:
            keys_dict: dict with 'consumer_key', 'consumer_secret',
                'access_token_key' and 'access_token_secret'.
            api: ignored; a fresh ``tweepy.API`` is always built. The
                parameter is kept for backward compatibility.
            result_limit: number of statuses requested per API call.
        """
        auth = tweepy.OAuthHandler(keys_dict['consumer_key'], keys_dict['consumer_secret'])
        auth.set_access_token(keys_dict['access_token_key'], keys_dict['access_token_secret'])

        self.api = tweepy.API(auth)
        self.twitter_keys = keys_dict
        self.result_limit = result_limit


    def tweets_to_dict(self, statuses, incl_retweets = True):
        """Convert a page of statuses into a list of flat dicts.

        Statuses must expose ``full_text`` (the methods of this class request
        them with ``tweet_mode='extended'``).

        Args:
            statuses: sequence of status objects.
            incl_retweets: when true, also record the retweeted and quoted
                text; the string 'None' marks a missing value.

        Returns:
            ``(data, last_id)`` where ``data`` is the list of dicts and
            ``last_id`` is the id of the last status, or ``None`` when
            ``statuses`` is empty.
        """
        data = []
        for item in statuses:

            mined = {
                'tweet_id':        item.id,
                'name':            item.user.name,
                'screen_name':     item.user.screen_name,            # username
                "followers_count": item.user.followers_count,
                "friends_count":   item.user.friends_count,
                'retweet_count':   item.retweet_count,
                'text':            item.full_text,
                'mined_at':        datetime.datetime.now(),
                'created_at':      item.created_at,
                'favourite_count': item.favorite_count,              # # of likes
                'hashtags':        item.entities['hashtags'],
                'status_count':    item.user.statuses_count,         # # of tweets
                'location':        item.place,
                'source_device':   item.source
            }
            if incl_retweets:
                # the attribute is absent on statuses that are not retweets
                try:
                    mined['retweet_text'] = item.retweeted_status.full_text
                except AttributeError:
                    mined['retweet_text'] = 'None'
                # read the quoted status from this item, not from a global
                try:
                    quoted = item.quoted_status
                    mined['quote_text'] = quoted.full_text
                    mined['quote_screen_name'] = quoted.user.screen_name
                except AttributeError:
                    mined['quote_text'] = 'None'
                    mined['quote_screen_name'] = 'None'
            data.append(mined)
        # an empty page has no last status to report
        last_id = statuses[-1].id if statuses else None
        return data, last_id

    def _mine_pages(self, fetch_page, incl_retweets, last_tweet_id, max_pages):
        """Call ``fetch_page`` up to ``max_pages`` times, walking back in time."""
        data_page = []
        for _page in range(max_pages):
            paging = {}
            if last_tweet_id:
                # get tweets older than last retrieved ones
                paging['max_id'] = last_tweet_id - 1
            statuses = fetch_page(**paging)
            if not statuses:
                # nothing older is available; keep the last id seen so far
                break
            data, last_tweet_id = self.tweets_to_dict(statuses, incl_retweets)
            data_page.extend(data)
        return data_page, last_tweet_id

    def mine_tweets_user(self, user="",
                         incl_retweets = True, last_tweet_id  =  False,
                         max_pages=17):
        """Mine up to ``max_pages`` pages of a user's timeline.

        Args:
            user: screen name whose timeline is read.
            incl_retweets: include retweets in the timeline and record their
                text in the output dicts.
            last_tweet_id: when truthy, only tweets older than this id are
                requested.
            max_pages: maximum number of API calls; the total number of
                tweets is at most ``max_pages * result_limit``.

        Returns:
            ``(tweets, last_tweet_id)``: list of dicts and the id of the
            oldest tweet retrieved (the incoming value if nothing was found).
        """
        def fetch_page(**paging):
            return self.api.user_timeline(screen_name = user,
                                          count = self.result_limit,
                                          tweet_mode = 'extended',
                                          # the timeline parameter is include_rts
                                          include_rts = incl_retweets,
                                          **paging)

        return self._mine_pages(fetch_page, incl_retweets, last_tweet_id, max_pages)

    def mine_tweets_keyword(self, query = "", language = 'en',
                         incl_retweets = True, last_tweet_id  =  False,
                         max_pages=17):
        """Mine up to ``max_pages`` pages of search results for ``query``.

        Args:
            query: search query string.
            language: language code passed as ``lang``.
            incl_retweets: when false, retweets are excluded with the
                ``-filter:retweets`` search operator and retweet/quote text
                is not recorded.
            last_tweet_id: when truthy, only tweets older than this id are
                requested.
            max_pages: maximum number of API calls.

        Returns:
            ``(tweets, last_tweet_id)``, as for ``mine_tweets_user``.
        """
        search_query = query
        if not incl_retweets:
            # search has no retweet flag; the query operator does the filtering
            search_query = (query + ' -filter:retweets').strip()

        def fetch_page(**paging):
            return self.api.search(q = search_query, lang = language,
                                   count = self.result_limit,
                                   tweet_mode = 'extended',
                                   **paging)

        return self._mine_pages(fetch_page, incl_retweets, last_tweet_id, max_pages)

In [None]:
# A falsy last_id means no max_id bound is sent, so mining starts from the newest tweet
last_id = False
# One status per request: 17 pages give at most 17 tweets
miner = TweetMiner(result_limit = 1)
mined_tweets, last_tweet_id = miner.mine_tweets_user(user='nytimes', max_pages = 17, 
                                                         last_tweet_id = last_id, incl_retweets=True)

In [None]:
# Five statuses per request
miner = TweetMiner(result_limit = 5)
last_id = False
# NOTE(review): the unconditional `break` at the end of the body stops the loop
# after its first pass, so only one batch is ever mined -- confirm this is intended.
for i in range(2):
    
    mined_tweets, last_tweet_id = miner.mine_tweets_user(user='nytimes', max_pages = 17, 
                                                         last_tweet_id = last_id, incl_retweets=True)
    # carry the oldest id forward so a further pass would continue from there
    last_id = last_tweet_id
#     mined_tweets_df = pd.DataFrame(mined_tweets)
    print (mined_tweets[0]['retweet_text'])
    print ("last id", last_id)
    break

In [None]:
# Show the first mined tweet dict
mined_tweets[0]

In [None]:
# Fetch three statuses from the nytimes timeline in extended mode
search_tweets = api.user_timeline('nytimes', count=3, tweet_mode='extended')
for tweet in search_tweets:
    # a retweet carries the original status under 'retweeted_status';
    # print that full text when present, otherwise the tweet's own full text
    if 'retweeted_status' not in tweet._json:
        print(tweet.full_text)
        continue
    print(tweet._json['retweeted_status']['full_text'])

### Make call every 15 min

In [None]:
import time

# Placeholder: replace with the screen names whose timelines should be mined
handle_list= ['list of handles you want the timelines of']

# screen name -> one-element list holding that user's mined tweet dicts
twitter_dict={}
counter=0

for name in handle_list:
    try:
        # mine_tweets_user returns (tweets, last_tweet_id); only the tweets are kept
        user_tweets, _last_seen_id = miner.mine_tweets_user(user=name, max_pages=17)
    except Exception as err:
        # best effort: report the handle and why it failed, then move on
        print(name, 'is invalid or locked')
        print(err)
        continue
    twitter_dict[name] = [user_tweets]
    counter = counter + 1
    if counter%40==0:
        time.sleep(900) #15 minute sleep time

# pd.concat raises on an empty sequence, so fall back to an empty frame
if twitter_dict:
    all_tweets = pd.concat([pd.DataFrame(twitter_dict[i][0]) for i in twitter_dict])
else:
    all_tweets = pd.DataFrame()

### Mine by keyword

In [None]:

# Start from the newest matching tweet (no max_id bound)
last_id = False
# Up to 34 search pages for the query 'AR/VR', using the page size of the current `miner`
mined_tweets, last_tweet_id = miner.mine_tweets_keyword(query='AR/VR', language = 'en', 
                               last_tweet_id = last_id, incl_retweets = True, max_pages = 34)

In [None]:
# Number of tweets returned by the last mining call
len(mined_tweets)

In [None]:
# Mine 'AR/VR' search results in batches, writing a pickle checkpoint and
# pausing after every fourth batch.
miner = TweetMiner(result_limit = 100)
counter = 1
ls = []
last_id = False
while counter < 15:
    print (counter)
    try:
        mined_tweets, last_tweet_id = miner.mine_tweets_keyword(query='AR/VR', language = 'en', 
                                       last_tweet_id = last_id, incl_retweets = True, max_pages = 34)
    except Exception as err:
        # report the reason as well, so failures other than rate limiting stay visible
        print ("Limit is reached")
        print (err)
        break
    # continue the next batch from the oldest tweet seen so far
    last_id = last_tweet_id
    ls.extend(mined_tweets)

    # checkpoint on the batch counter (not on a loop variable left over from another cell)
    if counter % 4 == 0:
        mined_tweets_df = pd.DataFrame(ls)
        mined_tweets_df.to_pickle("twitter{0}.pkl".format(counter))
        ls = []
        time.sleep(16*60) # 16 minute pause, just over the 15 minute window
    counter+=1

# save whatever was collected after the last checkpoint so it is not lost
if ls:
    mined_tweets_df = pd.DataFrame(ls)
    mined_tweets_df.to_pickle("twitter{0}.pkl".format(counter))

### Get replies to tweets

Try twarc Python package replies  
It might be interesting to look at tweets that got many replies  
What type of tweets get what type of replies?

In [None]:
# Preview the first rows of df_tweets
# NOTE(review): df_tweets is not assigned anywhere in the visible notebook -- confirm where it is built.
df_tweets.head()

In [None]:
# Creating the authentication object
# NOTE(review): these bare credential names presumably come from `from auth_ap import *` -- confirm.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
# Setting your access token and secret
auth.set_access_token(access_token, access_token_secret)
# Creating the API object while passing in auth information
api = tweepy.API(auth) 

In [None]:
# For a fixed window of rows, search for tweets addressed to the author and
# count those that reply to the specific tweet in that row.
# The upper bound is clamped so iloc cannot run past the end of the frame.
for idx in range(10000, min(10100, df_tweets.shape[0])):
    row = df_tweets.iloc[idx]
    user = row.screen_name
    tweet_id = row.tweet_id
    max_id = None

    # in_reply_to_status_id_str is a string, so compare it with the string
    # form of tweet_id; comparing it with a numeric id can never match
    target_id = str(tweet_id)

    replies=[]
    for tweet in tweepy.Cursor(api.search, q='to:' + user, since_id = tweet_id, max_id = max_id, timeout=999999).items(100):
        if getattr(tweet, 'in_reply_to_status_id_str', None) == target_id:
            replies.append(tweet)

    if replies:
        print (len(replies))


In [None]:
# Show the row at position 10000 of df_tweets
df_tweets.iloc[10000]

In [None]:
def get_replies(tweet):
    """Yield replies to ``tweet``, each followed recursively by its own replies.

    The statements use ``yield`` and call ``get_replies`` recursively, so they
    are wrapped in that generator function; at module level ``yield`` is a
    syntax error.

    NOTE(review): the body relies on names the visible file never defines:
    ``t`` (the client whose ``GetSearch`` is called), the ``twitter`` package
    and a ``tweet_url`` helper. They must exist before this is called.

    Args:
        tweet: status object exposing ``user.screen_name`` and ``id``.

    Yields:
        Reply objects whose ``in_reply_to_status_id`` equals the parent's id.
    """
    import logging
    import urllib.parse

    user = tweet.user.screen_name
    tweet_id = tweet.id
    max_id = None
    # search for tweets addressed to the author of the tweet
    q = urllib.parse.urlencode({"q": "to:%s" % user})

    while True:

        try:
            replies = t.GetSearch(raw_query=q, since_id=tweet_id, max_id=max_id, count=100)
        except twitter.error.TwitterError as e:
            # wait a minute and retry the same page
            logging.error("caught twitter api error: %s", e)
            time.sleep(60)
            continue
        for reply in replies:
            logging.info("examining: %s" % tweet_url(reply))
            if reply.in_reply_to_status_id == tweet_id:
                logging.info("found reply: %s" % tweet_url(reply))
                yield reply
                # recursive magic to also get the replies to this reply
                for reply_to_reply in get_replies(reply):
                    yield reply_to_reply
            max_id = reply.id
        # fewer results than requested means nothing older is left; without
        # this exit the loop would never terminate
        if len(replies) != 100:
            break

### Text processing

In [None]:
from nltk.tokenize import word_tokenize
import re
import operator 
from collections import Counter
from nltk.corpus import stopwords
import string

In [None]:
# Verbose-mode pattern for a simple emoticon: eyes, optional nose, mouth
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
# Alternatives are tried in this order, so more specific token types come first
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    # the character class held an HTML-escaped '&amp;'; the intended character is '&'
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
    
# One capturing group around all alternatives, so findall returns whole tokens
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Matches a string that is exactly one emoticon
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
 
def tokenize(s):
    """Split ``s`` into tokens (emoticons, tags, mentions, hashtags, URLs, numbers, words).

    Whitespace is dropped; every other character ends up in some token.
    """
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=True):
    """Tokenize ``s`` and optionally lowercase every token except emoticons.

    Args:
        s: text to tokenize.
        lowercase: when true, lowercase tokens; emoticons keep their case so
            that e.g. ':D' is not turned into ':d'.

    Returns:
        List of token strings.
    """
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens

In [None]:
# Preview the first rows of df_tweets before text processing
df_tweets.head()

#### Term frequency

In [None]:
# Individual punctuation characters, treated as stop tokens
punctuation = list(string.punctuation)
# Tokens to ignore: English stop words, punctuation, and terms that are
# uninformative for this AR/VR data set (query words, 'rt', 'via', ...).
# NOTE(review): `stop` is a list, so every `term not in stop` test is a linear scan.
stop = stopwords.words('english') + punctuation + ['rt', 'via', 'vr', 'ar', 
                '#vr', '#ar', "#virtualreality", "#augmentedreality", '’', '#mr', '#ai', '#ml', '#3d',
                '"', 'virtual', 'augmented', "cc", "amp" ]


In [None]:
# Frequency counters for plain terms, hashtags and @-mentions
count_terms = Counter()
count_hash = Counter()
count_users = Counter()
for retweet_text, own_text in zip(df_tweets['retweet_text'], df_tweets['text']):
    # prefer the retweeted text; the string "None" marks a tweet without one
    text = own_text if retweet_text == "None" else retweet_text
    # tokenize once and reuse the result for all three counters
    terms_all = [term for term in preprocess(text) if term not in stop]
    count_users.update(term for term in terms_all if term.startswith('@'))
    count_hash.update(term for term in terms_all if term.startswith('#'))
    count_terms.update(term for term in terms_all
                       if not term.startswith(('#', '@')))
# Print the 10 most frequent mentions, hashtags and terms
print(count_users.most_common(10))
print(count_hash.most_common(10))
print(count_terms.most_common(10))


#### Wordclouds

In [None]:
# Build a 300x300 mask that is 255 outside a circle of radius 130 centred at
# (150, 150) and 0 inside it
x, y = np.ogrid[:300, :300]

mask = (x - 150) ** 2 + (y - 150) ** 2 > 130 ** 2
mask = 255 * mask.astype(int)

_input = count_terms
# Only the `wordcloud` module is imported at this point, so the class is
# referenced through the module rather than by its bare name.
wc = wordcloud.WordCloud( mask = mask, contour_width=3, contour_color= 'steelblue',
                background_color ='white', max_font_size=50, 
                max_words=200, random_state=42, 
                min_font_size = 10).generate_from_frequencies(_input) 
  
# plot the WordCloud image                        
plt.figure(figsize = (8, 8), facecolor = None) 
plt.imshow(wc, interpolation='bilinear') 
plt.axis("off") 
plt.tight_layout(pad = 0) 

#### Wordcloud with mask

In [None]:
from PIL import Image
from wordcloud import WordCloud
# NOTE(review): absolute path on one specific Windows machine -- adjust before running elsewhere.
path_image = r"C:\Users\Chub_lab\Desktop\V0D-sLDA.jpeg.jpg"
# Load the image as an array to use as the word-cloud mask
mask = np.array(Image.open(path_image))

In [None]:
# Hashtag frequencies drive this cloud
_input = count_hash
# Collect the rendering options first, then build and fill the cloud
cloud_options = dict(
    mask=mask,
    contour_width=3,
    contour_color='steelblue',
    background_color='white',
    max_font_size=100,
    max_words=200,
    random_state=42,
    min_font_size=10,
)
wc = WordCloud(**cloud_options).generate_from_frequencies(_input)

# Render the cloud on a square figure without axes or padding
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wc, interpolation='bilinear')
plt.axis("off")
plt.tight_layout(pad=0)

#### Term co-occurrences

In [None]:
from nltk import bigrams 
# Counter of adjacent term pairs (bigrams) across tweets
count_terms_bigrams = Counter()

# NOTE(review): the range stops 1000 rows before the end of df_tweets, unlike
# the other loops over this frame; with fewer than 1000 rows nothing is
# processed. Confirm whether skipping the last rows is intended.
for idx in range(df_tweets.shape[0]-1000):
    # prefer the retweeted text; fall back to the tweet's own text
    text = df_tweets.iloc[idx].retweet_text 
    if text == "None":
        text = df_tweets.iloc[idx].text
    # Create a list with all the terms

    # keep plain terms only: no stop tokens, hashtags or mentions
    terms_only = [term for term in preprocess(text) 
              if term not in stop and
              not term.startswith(('#', '@'))] 
    terms_bigram = bigrams(terms_only)
    # Update the counter
    count_terms_bigrams.update(terms_bigram)
# Print the 10 most frequent bigrams
print (count_terms_bigrams.most_common(10))

In [None]:
from collections import defaultdict
# remember to include the other import from the previous post
 
# Nested counter: com[w1][w2] is how often w1 and w2 occur in the same tweet,
# with each pair stored once under its alphabetically smaller word.
com = defaultdict(lambda : defaultdict(int))
for idx in range(df_tweets.shape[0]):
    row = df_tweets.iloc[idx]
    # use the retweeted text when there is one, else the tweet's own text
    text = row.text if row.retweet_text == "None" else row.retweet_text

    # plain terms only: drop stop tokens, hashtags and mentions
    terms_only = [term for term in preprocess(text)
                  if not term.startswith(('#', '@')) and term not in stop]

    # visit every unordered pair of positions exactly once
    for i, first in enumerate(terms_only):
        for second in terms_only[i + 1:]:
            if first == second:
                continue
            w1, w2 = sorted([first, second])
            com[w1][w2] += 1

In [None]:
# Sort key: the count stored in the second position of each tuple
by_count = operator.itemgetter(1)

com_max = []
# Keep, for every term, its five most frequent co-occurring terms
for t1, neighbours in com.items():
    top_five = sorted(neighbours.items(), key=by_count, reverse=True)[:5]
    com_max.extend(((t1, t2), t2_count) for t2, t2_count in top_five)
# Rank the kept pairs by count and show the five strongest
terms_max = sorted(com_max, key=by_count, reverse=True)
print(terms_max[:5])

#### Term frequency over time

In [None]:
# Hashtags to track, each mapped to the creation times of tweets that mention it
target_ls = ['#stpiindia', '#fintech', "#blockchain", "#iot"]
my_dict = {trg: [] for trg in target_ls}

for idx in range(df_tweets.shape[0]):
    tweet = df_tweets.iloc[idx]
    # use the retweeted text when there is one, else the tweet's own text
    text = tweet.text if tweet.retweet_text == "None" else tweet.retweet_text

    # hashtags in this tweet, minus the stop list
    terms_only = [term for term in preprocess(text)
                  if term.startswith('#') and term not in stop]
    # track when each target hashtag is mentioned
    for trg in target_ls:
        if trg not in terms_only:
            continue
        my_dict[trg].append(tweet['created_at'])
 


In [None]:
# One frame per tracked hashtag: mention counts in 30-minute bins
ls = []
for key, stamps in my_dict.items():
    # a series of ones indexed by mention time, so summing a bin counts mentions
    mentions = pd.Series([1] * len(stamps), index=pd.DatetimeIndex(stamps))
    binned = mentions.resample('30Min').sum().fillna(0).reset_index()
    binned.columns = ['date', 'freq']
    binned.loc[:, 'target'] = key
    ls.append(binned)
# Stack the per-hashtag frames into one long frame
df_target_tc = pd.concat(ls)

In [None]:
# Show the location column of the tweets frame.
# The frame used throughout this notebook is df_tweets; `df_tweet` is never defined.
df_tweets.location

#### Plot tweet frequency over time

In [None]:
from matplotlib.dates import DateFormatter

In [None]:
# Larger fonts and lines for the figure
sns.set_context('poster')
f, ax = plt.subplots(figsize =(8, 6))
# One line per hashtag ('target'): binned mention count against time
ax = sns.lineplot(data = df_target_tc, x = 'date', y = 'freq', hue = 'target')
sns.despine()

# Show tick labels as month-day
date_form = DateFormatter("%m-%d")
ax.xaxis.set_major_formatter(date_form)
ax.set_xlabel('Date')
ax.set_ylabel('Tweet count')
# Anchor the legend just outside the right edge of the axes
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.xticks(rotation = 45)

In [None]:
# NOTE(review): df_freq is not assigned anywhere in the visible notebook -- confirm where it comes from.
df_freq

### Sentiment analysis

In [None]:
# nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Shared VADER analyzer instance, used by the sentiment cells in this notebook
sian = SentimentIntensityAnalyzer()

In [None]:
def dict_to_list(dict1):
    """Return the items of ``dict1`` as a list of ``[key, value]`` lists, in dict order."""
    return [[key, value] for key, value in dict1.items()]

In [None]:
# Preview the first rows of df_tweets before scoring sentiment
df_tweets.head()

In [None]:
# Show the tweet_id -> compound score mapping (it is assigned in a later cell of this file)
d_compound

In [None]:
# Score every tweet with VADER.
# d_compound maps tweet_id -> compound score; df_sentim holds one row per
# tweet with its text and all four scores, which the cells below rely on.
d_compound = {}
tweet_sentim = []
for idx in range(df_tweets.shape[0]):
    row = df_tweets.iloc[idx]
    # prefer the retweeted text; the string "None" marks a tweet without one
    text = row.retweet_text
    if text == "None":
        text = row.text
    tweet_id = row.tweet_id
    # score once and read the values by key rather than by position
    scores = sian.polarity_scores(text)
    d_compound[tweet_id] = scores['compound']
    tweet_sentim.append([text, scores['neg'], scores['neu'],
                         scores['pos'], scores['compound']])

df_sentim = pd.DataFrame(tweet_sentim,
                         columns=['text', 'neg', 'neu', 'pos', 'compound'])

from https://github.com/cjhutto/vaderSentiment  
    The compound score is computed by summing the valence scores of each word in the lexicon, adjusted according to the rules, and then normalized to be between -1 (most extreme negative) and +1 (most extreme positive). This is the most useful metric if you want a single unidimensional measure of sentiment for a given sentence. Calling it a 'normalized, weighted composite score' is accurate.

    It is also useful for researchers who would like to set standardized thresholds for classifying sentences as either positive, neutral, or negative. Typical threshold values (used in the literature cited on this page) are:

        positive sentiment: compound score >= 0.05
        neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
        negative sentiment: compound score <= -0.05



In [None]:
# Drop duplicate rows
df_sentim = df_sentim.drop_duplicates()
# Order from the highest to the lowest compound score
df_sentim = df_sentim.sort_values(['compound'], ascending=False)
df_sentim.head()

In [None]:
# Preview the first rows of df_sentim
df_sentim.head()

In [None]:
# Print up to five texts with a strongly negative compound score.
# The filter is computed once, and head(5) copes with fewer than five matches
# instead of raising an IndexError.
strongly_negative = df_sentim[df_sentim['compound'] < -0.5]
for text in strongly_negative['text'].head(5):
    print (text)

In [None]:
# Distribution of compound scores across tweets
sns.distplot(df_sentim['compound'])
sns.despine()
# Dashed lines at -0.05 and 0.05, the neutral-band thresholds quoted in the note above
plt.axvline(x = -0.05, linestyle = '--', color = 'k')
plt.axvline(x = 0.05, linestyle = '--', color = 'k')

In [None]:
# Attach each tweet's compound score by looking its tweet_id up in d_compound
df_tweets.loc[:, 'compound'] = df_tweets.tweet_id.map(d_compound)

In [None]:
# Number of tweets per source device
src_dev = df_tweets.groupby(['source_device']).tweet_id.count().reset_index()
# Devices of interest: those with more than 100 tweets
is_frequent = src_dev.tweet_id > 100
toi = src_dev.loc[is_frequent, 'source_device'].unique()


Does the source device correlate with sentiment polarity?

In [None]:
# Keep tweets from the devices of interest, ordered by device name
_input = df_tweets[df_tweets.source_device.isin(toi)].sort_values(by = ['source_device'])
# Horizontal violin plot: compound score distribution per source device
sns.catplot(data = _input, y = 'source_device', x = 'compound', kind = 'violin',
           height = 6, aspect = 1.4, orient = 'h')
# plt.xticks(rotation= 30)

In [None]:
# Preview df_tweets again
df_tweets.head()

#### Cluster users by bio

It is better to save tweets to .json and load what you need for the analysis 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

def get_parser():
    """Build the command-line parser for the follower-clustering script.

    Returns:
        An ``ArgumentParser`` with --filename, --k, --min-df, --max-df,
        --max-features, --no-idf (stored as ``user_idf``), --min-ngram and
        --max-ngram.
    """
    # imported here because argparse is not among the file's other imports
    from argparse import ArgumentParser

    parser = ArgumentParser("Clustering for followers")
    parser.add_argument('--filename')
    parser.add_argument('--k', type=int)
    parser.add_argument('--min-df', type=int, default=2)
    parser.add_argument('--max-df', type=float, default=0.8)
    parser.add_argument('--max-features', type=int, default=None)
    parser.add_argument('--no-idf', dest='user_idf', default=True, action='store_false')
    parser.add_argument('--min-ngram', type=int, default=1)
    parser.add_argument('--max-ngram', type=int, default=1)
    return parser


if __name__ == '__main__':
    import sys

    parser = get_parser()
    args = parser.parse_args()
    if args.min_ngram > args.max_ngram:
        print("Error: incorrect value for --min-ngram ({}): it cant be higher than "
              "--max-ngram ({})".format(args.min_ngram, args.max_ngram))
        sys.exit(1)

    # load data: one JSON profile per line, keeping only the description
    users = []
    with open(args.filename) as f:
        for line in f:
            profile = json.loads(line)
            users.append(profile['description'])

    # create vectorizer
    vectorizer = TfidfVectorizer(max_df=args.max_df,
                                 min_df=args.min_df,
                                 max_features=args.max_features,
                                 stop_words='english',
                                 ngram_range=(args.min_ngram, args.max_ngram),
                                 use_idf=args.user_idf)
    # fit data
    X = vectorizer.fit_transform(users)
    print("Data dimensions: {}".format(X.shape))

    # perform clustering
    km = KMeans(n_clusters=args.k)
    km.fit(X)

    # group the descriptions by the cluster label assigned to each of them
    clusters = defaultdict(list)
    for i, label in enumerate(km.labels_):
        clusters[label].append(users[i])

    # print 10 user descriptions of each cluster; labels are shown 1-based
    # (the offset is a constant, not the leftover loop index)
    for label, description in clusters.items():
        print("--------- Cluster {}".format(label+1))
        for desc in description[:10]:
            print(desc)