In [1]:
%%capture
# Install the notebook's third-party dependencies into the environment of the
# interpreter that runs this kernel.
# pip offers no supported in-process API (its internal `main` entry point has
# moved between releases), so it is invoked as `python -m pip` in a subprocess.
import subprocess
import sys

# nltk is imported by the next cell, so it is installed here as well.
packages = ['twython', 'nltk', 'pandas', 'psycopg2-binary']
# check=False keeps the original best-effort behaviour: a failed install does
# not raise here, it surfaces as an ImportError in the import cell instead.
pip_result = subprocess.run([sys.executable, '-m', 'pip', 'install'] + packages, check=False)

In [27]:
%%capture
# Twitter REST client.
from twython import Twython

# Tweet-aware tokenizer plus the POS tagger used for noun extraction.
from nltk.tokenize import casual_tokenize
import nltk
# Model data required by nltk.pos_tag (called in the noun-extraction cell).
nltk.download('averaged_perceptron_tagger')

# pandas renders query results; psycopg2 is the PostgreSQL driver.
import pandas as pd
import psycopg2

In [37]:
import sys
# Make the parent directory importable; presumably `auth` and `aws` are local,
# uncommitted credential modules living there — confirm.
sys.path.append('../')
from auth import consumer_key, consumer_secret, access_token, access_token_secret

# Authenticated Twitter client shared by every API call in the notebook.
twitter = Twython(consumer_key, consumer_secret, access_token, access_token_secret)
# Imported under a-prefixed aliases so they cannot collide with notebook
# variables such as `user`, which is assigned in a later cell.
from aws import host as ahost, port as aport, user as auser, password as apassword, database as adatabase

# One module-level connection and cursor are shared by all DB helpers below.
connection = psycopg2.connect(host = ahost, 
                              port = aport, 
                              user = auser, 
                              password = apassword, 
                              dbname = adatabase)
cursor = connection.cursor()

In [4]:
# NOTE(review): PRINT_LIMIT and FOLLOWER_SAMPLE_LIMIT are not referenced by any
# cell visible in this notebook — confirm whether they are still needed.
PRINT_LIMIT = 50
FOLLOWER_SAMPLE_LIMIT = 100 # number of followers to randomly sample
WORD_FREQ_LIMIT = 10 # return this number of topics that are most freq

# Values stored in the words table's word_type column.
REGULAR_NOUN_WORD_TYPE = 0
PROPER_NOUN_WORD_TYPE = 1

In [34]:
import logging

# Failures are deliberately non-fatal in this notebook; they are logged at
# DEBUG level so they can be surfaced with logging.basicConfig(level=logging.DEBUG).
_db_log = logging.getLogger(__name__)


def _rollback_quietly():
    """Roll back the shared connection after a failed statement; never raises."""
    try:
        connection.rollback()
    except Exception as rollback_error:
        _db_log.debug("ROLLBACK ERROR: %s", rollback_error)


def _limit_param(limit):
    """Convert a ``limit`` argument into a value that can be bound to ``LIMIT %s``.

    ``None`` and the strings ``'NULL'`` / ``'ALL'`` (the historical "no limit"
    sentinel of these helpers) map to ``None``, which is rendered as
    ``LIMIT NULL``. Anything else must be convertible to ``int``.

    Raises:
        ValueError: if ``limit`` is neither a sentinel nor an integer value.
    """
    if limit is None:
        return None
    if isinstance(limit, str) and limit.strip().upper() in ('NULL', 'ALL'):
        return None
    return int(limit)


def read_try(sql, params=None):
    """Run a SELECT on the shared connection and return the result as a DataFrame.

    Args:
        sql: query text; use ``%s`` placeholders for values.
        params: optional sequence of values bound to the placeholders by the
            driver, so callers never have to quote values themselves.

    Returns:
        The query result, or an empty column-less DataFrame when the query
        returns no rows or fails for any reason (best effort, never raises).
    """
    try:
        df = pd.read_sql(sql, con=connection, params=params)
        return pd.DataFrame() if df.empty else df
    except Exception as e:
        _db_log.debug("READ ERROR: %s", e)
        # Leave the connection usable for the next statement.
        _rollback_quietly()
        return pd.DataFrame()


def write_try(sql, params=None):
    """Execute one write statement on the shared cursor and commit it.

    Args:
        sql: statement text; use ``%s`` placeholders for values.
        params: optional sequence of values bound to the placeholders.

    Returns:
        True when the statement was executed and committed, False otherwise.
        On failure the transaction is rolled back; nothing is raised.
    """
    try:
        cursor.execute(sql, params)  # run a psql command
        connection.commit()
        return True
    except Exception as e:
        _db_log.debug("WRITE ERROR: %s", e)
        _rollback_quietly()
        return False


def read_all_users_from_db():
    """Return every row of the users table."""
    return read_try('SELECT * FROM users')


def read_all_followers_from_db():
    """Return every (follower, user) pair of the followers table."""
    return read_try('SELECT * FROM followers')


def read_all_tweets_from_db():
    """Return every row of the tweets table."""
    return read_try('SELECT * FROM tweets')


def read_all_words_from_db():
    """Return every per-tweet word count of the words table."""
    return read_try('SELECT * FROM words')


def read_all_hashtags_from_db():
    """Return every per-tweet hashtag count of the hashtags table."""
    return read_try('SELECT * FROM hashtags')


def read_user_from_db(user):
    """Return the users row whose user_handle equals ``user`` (empty if absent)."""
    return read_try('SELECT * FROM users WHERE user_handle = %s', (user,))


def read_user_followers_from_db(user, limit = 'NULL'):
    """Return the follower handles stored for ``user``.

    Args:
        user: handle to look up in followers.user_handle.
        limit: maximum number of rows; ``'NULL'`` (default) means no limit.
    """
    sql = 'SELECT follower_handle FROM followers WHERE user_handle = %s LIMIT %s'
    return read_try(sql, (user, _limit_param(limit)))


def read_tweet_from_db(tweet_id):
    """Return the stored tweet_date for ``tweet_id``; empty if the tweet is not indexed."""
    return read_try('SELECT tweet_date FROM tweets WHERE tweet_id = %s', (tweet_id,))


def read_user_words_from_db(user, limit = 'NULL', date = ''):
    """Return the summed word counts over all tweets of ``user``'s followers.

    Args:
        user: handle whose followers' tweets are aggregated.
        limit: number of most frequent words to return; ``'NULL'`` means all.
        date: currently unused; kept for interface compatibility.
    """
    sql = ('SELECT word, SUM(count) as sum_count FROM followers '
           'JOIN tweets ON tweets.follower_handle = followers.follower_handle '
           'JOIN words ON words.tweet_id = tweets.tweet_id '
           'WHERE followers.user_handle = %s '
           'GROUP BY word ORDER BY sum_count desc LIMIT %s')
    return read_try(sql, (user, _limit_param(limit)))


def read_user_proper_words_from_db(user, limit = 'NULL', date = ''):
    """Like ``read_user_words_from_db`` but restricted to proper nouns.

    A word qualifies when the highest word_type recorded for it among the
    matching rows equals PROPER_NOUN_WORD_TYPE.

    Args:
        user: handle whose followers' tweets are aggregated.
        limit: number of most frequent words to return; ``'NULL'`` means all.
        date: currently unused; kept for interface compatibility.
    """
    sql = ('SELECT word, SUM(count) as sum_count FROM followers '
           'JOIN tweets ON tweets.follower_handle = followers.follower_handle '
           'JOIN words ON words.tweet_id = tweets.tweet_id '
           'WHERE followers.user_handle = %s '
           'GROUP BY word HAVING MAX(words.word_type) = %s '
           'ORDER BY sum_count desc LIMIT %s')
    return read_try(sql, (user, PROPER_NOUN_WORD_TYPE, _limit_param(limit)))


def read_user_hashtags_from_db(user, limit = 'NULL', date = ''):
    """Return the summed hashtag counts over all tweets of ``user``'s followers.

    Args:
        user: handle whose followers' tweets are aggregated.
        limit: number of most frequent hashtags to return; ``'NULL'`` means all.
        date: currently unused; kept for interface compatibility.
    """
    sql = ('SELECT word, SUM(count) as sum_count FROM followers '
           'JOIN tweets ON tweets.follower_handle = followers.follower_handle '
           'JOIN hashtags ON hashtags.tweet_id = tweets.tweet_id '
           'WHERE followers.user_handle = %s '
           'GROUP BY word ORDER BY sum_count desc LIMIT %s')
    return read_try(sql, (user, _limit_param(limit)))


def write_user_to_db(user):
    """Insert ``user`` into the users table; a failed insert is ignored."""
    write_try('INSERT INTO users VALUES (%s);', (user,))


def write_user_followers_to_db(followers, user):
    """Insert one (follower, user) row per entry of ``followers``.

    Each row is committed on its own, so one failing insert does not prevent
    the remaining followers from being stored.
    """
    for follower in followers:
        write_try('INSERT INTO followers VALUES (%s, %s);', (follower, user))

In [6]:
# Twitter handle whose followers are fetched, stored and analysed in the cells below.
user = "ewarren"

In [7]:
# Fetch follower profiles for `user`; only the 'users' entry of the response is kept.
# NOTE(review): no count/cursor argument is passed, so presumably only the
# first page of followers is returned — confirm against the Twitter API docs.
followers = twitter.get_followers_list(screen_name = user)['users']
print(followers)

[{'id': 1198507703515172865, 'id_str': '1198507703515172865', 'name': 'maythym', 'screen_name': 'maythym1', 'location': '', 'description': 'Maythem', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 0, 'friends_count': 13, 'listed_count': 0, 'created_at': 'Sun Nov 24 07:45:34 +0000 2019', 'favourites_count': 0, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 0, 'lang': None, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'F5F8FA', 'profile_background_image_url': None, 'profile_background_image_url_https': None, 'profile_background_tile': False, 'profile_image_url': 'http://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png', 'profile_image_url_https': 'https://abs.twimg.com/sticky/default_profile_images/default_profile_normal.png', 'profile_link_color': '1DA1F2', 'profile_sidebar_border_color': 'C0DEED',

In [8]:
# Reduce every follower profile to its screen name.
followers_list = [profile['screen_name'] for profile in followers]
print(followers_list)

['maythym1', 'banks_mikado', 'patti_mccune', 'MarcantelRoland', 'hussein61091233', 'SarenSok', 'desolates0ul', 'ThornbyD', 'imaginaimage', 'kwiatek_dorota', 'jacobolto', 'ayoubchaabene', 'MrRease1911', 'Libertycrazy97', 'cassie17011071', 'xzhoush', 'AveryHannig', 'RylandLori', 'travo245', 'bluesrider']


In [9]:
# Add the user to the users table, then display the whole table to confirm.
# write_user_to_db ignores the result of the insert, so a failed insert is silent here.
write_user_to_db(user)
read_all_users_from_db()

Unnamed: 0,user_handle
0,test_user
1,test_user1
2,realDonaldTrump
3,AndrewYang
4,ewarren


In [10]:
# Add the followers to the followers table, then display the whole table to confirm.
# Inserts are attempted one row at a time and failures are ignored.
write_user_followers_to_db(followers_list, user)
read_all_followers_from_db()

Unnamed: 0,follower_handle,user_handle
0,test_follower1,test_user
1,test_follower2,test_user
2,test_follower1,test_user1
3,james_graziano,AndrewYang
4,rbrsq,AndrewYang
...,...,...
72,Libertycrazy97,ewarren
73,maythym1,ewarren
74,banks_mikado,ewarren
75,patti_mccune,ewarren


In [11]:
def get_tweets_from(follower_name, results=None):
    """Append the timeline of ``follower_name`` to ``results`` and return it.

    Args:
        follower_name: Twitter screen name whose timeline is requested.
        results: list to extend in place. When omitted, a fresh list is
            created for this call (a shared ``[]`` default would silently
            accumulate tweets across unrelated calls).

    Returns:
        The ``results`` list, extended with the fetched tweets. If the request
        fails for any reason the list is returned unchanged (best effort).
    """
    if results is None:
        results = []
    try:
        results.extend(twitter.get_user_timeline(screen_name=follower_name))
    except Exception:
        # Deliberately best effort: a follower whose timeline cannot be
        # fetched is skipped. `Exception` (rather than a bare except) lets
        # KeyboardInterrupt still stop a long-running loop.
        pass
    return results

In [12]:
def get_tweet_info_from_tweets(results):
    """Flatten raw tweet dicts into the fields the rest of the notebook uses.

    Args:
        results: iterable of tweet dicts, each providing 'text', 'id',
            'created_at', 'user' -> 'screen_name' and 'entities' with
            'hashtags', 'urls', 'user_mentions' and optionally 'media'.

    Returns:
        A list with one dict per tweet, holding the keys 'text', 'id', 'date',
        'user', 'hashtags', 'urls' and 'user_mentions'. Media URLs are
        appended to 'urls' after the regular URLs.
    """
    tweets = []
    for tweet in results:
        entities = tweet['entities']

        tweet_urls = [link['url'] for link in entities['urls']]
        # 'media' is only present on tweets with attachments; treat a missing
        # entry as "no media" instead of swallowing every exception.
        tweet_urls.extend(item['url'] for item in (entities.get('media') or []))

        tweets.append({
            'text': tweet['text'],
            'id': tweet['id'],
            'date': tweet['created_at'],
            'user': tweet['user']['screen_name'],
            'hashtags': [tag['text'] for tag in entities['hashtags']],
            'urls': tweet_urls,
            'user_mentions': [mention['screen_name'] for mention in entities['user_mentions']],
        })

    return tweets

In [13]:
# Collect the timelines of every follower into one flat list.
# get_tweets_from extends and returns the list it is given, so `results`
# keeps accumulating across iterations.
results = []
for f in followers_list:
    results = get_tweets_from(f, results)
print(results)

[{'created_at': 'Sat Nov 23 23:00:08 +0000 2019', 'id': 1198375711133884416, 'id_str': '1198375711133884416', 'text': '@adelebeaumtl Happy birthday to you miss', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'adelebeaumtl', 'name': 'Adèle Beaumont | Bday month 🕊', 'id': 1042382836022616067, 'id_str': '1042382836022616067', 'indices': [0, 13]}], 'urls': []}, 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>', 'in_reply_to_status_id': 1198257088448942080, 'in_reply_to_status_id_str': '1198257088448942080', 'in_reply_to_user_id': 1042382836022616067, 'in_reply_to_user_id_str': '1042382836022616067', 'in_reply_to_screen_name': 'adelebeaumtl', 'user': {'id': 1147431332311719936, 'id_str': '1147431332311719936', 'name': 'Mikado Banks', 'screen_name': 'banks_mikado', 'location': '', 'description': '', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 4

In [14]:
# Keep only the fields needed downstream (text, id, date, user, hashtags, urls, mentions).
tweets_data = get_tweet_info_from_tweets(results)
tweets_data

[{'text': '@adelebeaumtl Happy birthday to you miss',
  'id': 1198375711133884416,
  'date': 'Sat Nov 23 23:00:08 +0000 2019',
  'user': 'banks_mikado',
  'hashtags': [],
  'urls': [],
  'user_mentions': ['adelebeaumtl']},
 {'text': '@StonerGemini So beautiful',
  'id': 1198375272934002688,
  'date': 'Sat Nov 23 22:58:24 +0000 2019',
  'user': 'banks_mikado',
  'hashtags': [],
  'urls': [],
  'user_mentions': ['StonerGemini']},
 {'text': '@Pinkbee_ Happy birthday to you miss',
  'id': 1198372948207448065,
  'date': 'Sat Nov 23 22:49:10 +0000 2019',
  'user': 'banks_mikado',
  'hashtags': [],
  'urls': [],
  'user_mentions': ['Pinkbee_']},
 {'text': '@ManUtdWomen What win for them',
  'id': 1198178871788613632,
  'date': 'Sat Nov 23 09:57:58 +0000 2019',
  'user': 'banks_mikado',
  'hashtags': [],
  'urls': [],
  'user_mentions': ['ManUtdWomen']},
 {'text': '@Ladyshayla8 Beautiful',
  'id': 1197841311434448896,
  'date': 'Fri Nov 22 11:36:38 +0000 2019',
  'user': 'banks_mikado',
  'has

Part-of-speech (POS) tag each word using NLTK   
Resources:
- [Categorizing and Tagging Words](https://www.nltk.org/book/ch05.html)
- [NLTK tags](https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/)

Noun tags:
- NN noun, singular 'desk'
- NNS noun plural 'desks'
- NNP proper noun, singular 'Harrison'
- NNPS proper noun, plural 'Americans'

In [15]:
def isEnglish(s):
    """Return True when ``s`` is non-empty and made only of ASCII letters.

    Despite the name, this is a character-set check, not language detection:
    digits, punctuation, whitespace and any non-ASCII character give False.
    """
    try:
        # Encoding straight to ASCII also rejects lone surrogates, which a
        # utf-8 encode step would raise on instead of returning False.
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return s.isalpha()

In [16]:
# noun based word extraction
def word_hashtag_extraction_noun_based(tweet_dictionary):
    """Extract the nouns and hashtags of one tweet.

    Args:
        tweet_dictionary: dict with the keys 'text', 'hashtags', 'urls' and
            'user_mentions', as produced by get_tweet_info_from_tweets.

    Returns:
        A ``(words, hashtags)`` pair:
        * ``words`` maps each lower-cased noun to PROPER_NOUN_WORD_TYPE or
          REGULAR_NOUN_WORD_TYPE. A word tagged as a proper noun at least
          once in the tweet is recorded as proper, matching the
          ``MAX(word_type)`` rule used when the words are queried back.
        * ``hashtags`` is the set of lower-cased hashtags, each prefixed
          with '#'.
        Retweets (text starting with 'RT @') yield an empty dict and an
        empty set, so both paths return the same types.
    """
    text = tweet_dictionary['text']
    # skip over retweets
    if text.startswith('RT @'):
        return {}, set()
    tagged_tokens = nltk.pos_tag(casual_tokenize(text))

    hashtags = ['#' + s for s in tweet_dictionary['hashtags']]
    user_mentions = ['@' + s for s in tweet_dictionary['user_mentions']]
    # Tokens that must never be counted as words, even if tagged as nouns.
    ignore = set(hashtags) | set(tweet_dictionary['urls']) | set(user_mentions)

    all_noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    proper_noun_tags = ('NNP', 'NNPS')

    cleaned_text = {}
    for word, tag in tagged_tokens:
        if tag not in all_noun_tags or word in ignore or not isEnglish(word):
            continue
        key = word.lower()
        if tag in proper_noun_tags:
            cleaned_text[key] = PROPER_NOUN_WORD_TYPE
        else:
            # Do not downgrade a word already seen as a proper noun.
            cleaned_text.setdefault(key, REGULAR_NOUN_WORD_TYPE)

    return cleaned_text, {h.lower() for h in hashtags}

In [17]:
# Spot-check the extraction on one arbitrary tweet (index 2 of the fetched data).
test_tweet = tweets_data[2]
print(test_tweet)
words, tags = word_hashtag_extraction_noun_based(test_tweet)
# `words` maps noun -> word type; `tags` holds the tweet's hashtags.
print(words)
print(tags)

{'text': '@Pinkbee_ Happy birthday to you miss', 'id': 1198372948207448065, 'date': 'Sat Nov 23 22:49:10 +0000 2019', 'user': 'banks_mikado', 'hashtags': [], 'urls': [], 'user_mentions': ['Pinkbee_']}
{'birthday': 0}
set()


In [18]:
def get_freq_map(my_list):
    """Return a dict mapping each item of ``my_list`` to how often it occurs.

    Keys appear in first-seen order; an empty iterable gives an empty dict.
    """
    tally = {}
    for entry in my_list:
        tally[entry] = tally.get(entry, 0) + 1
    return tally

In [19]:
# NOTE(review): `words` is a dict and `tags` a set, so iterating them yields
# every entry exactly once — each count produced here is therefore 1, not the
# number of occurrences inside the tweet. Confirm whether that is intended.
word_count = get_freq_map(words)
print(word_count)
hashtag_count = get_freq_map(tags)
print(hashtag_count)

{'birthday': 1}
{}


In [20]:
# Show each word next to its count, one pair per line.
for w,c in word_count.items():
    print(w,c)

birthday 1


In [21]:
def _write_row(template, values):
    """Bind ``values`` into ``template`` with the driver's quoting and run it via write_try.

    Returns True when the row was written, False on any failure.
    """
    try:
        # mogrify returns the final statement with every value escaped by
        # psycopg2, so tweet-derived text can never alter the SQL itself.
        statement = cursor.mogrify(template, values)
    except Exception:
        return False
    return write_try(statement)


def write_tweets_words_hashtags_to_db(tweets_dictionary):
    """Store tweets together with their extracted words and hashtags.

    For each tweet a row is inserted into the tweets table. Only when that
    insert succeeds are the tweet's nouns written to the words table and its
    hashtags to the hashtags table, so a tweet whose insert fails (for
    example because it is already stored) is skipped entirely.

    Args:
        tweets_dictionary: iterable of dicts as produced by
            get_tweet_info_from_tweets ('id', 'user', 'date', 'text', ...).

    NOTE(review): the extraction returns a dict of words and a set of
    hashtags, so every count computed by get_freq_map here is 1 — confirm
    whether per-tweet occurrence counts were intended.
    """
    for tweet in tweets_dictionary:
        tweet_id = tweet['id']
        stored = _write_row('INSERT INTO tweets VALUES (%s, %s, %s);',
                            (tweet_id, tweet['user'], tweet['date']))
        # if writing the tweet failed, skip its word and hashtag counts too
        if not stored:
            continue

        words, tags = word_hashtag_extraction_noun_based(tweet)

        for word, count in get_freq_map(words).items():
            _write_row('INSERT INTO words VALUES (%s, %s, %s, %s);',
                       (tweet_id, word, count, words[word]))

        for tag, count in get_freq_map(tags).items():
            _write_row('INSERT INTO hashtags VALUES (%s, %s, %s);',
                       (tweet_id, tag, count))

In [22]:
# Persist every fetched tweet along with its extracted words and hashtags.
write_tweets_words_hashtags_to_db(tweets_data)

In [23]:
# Read back the three tables written above to verify what was stored.
# Each read returns an empty DataFrame if the query fails or finds no rows.
all_tweets = read_all_tweets_from_db()
print("All tweets in tweets DB")
print(all_tweets)

all_words = read_all_words_from_db()
print("All words in words DB")
print(all_words)

all_hashtags = read_all_hashtags_from_db()
print("All hashtags in hashtags DB")
print(all_hashtags)

All tweets in tweets DB
                tweet_id follower_handle          tweet_date
0                      0  test_follower1 1977-01-08 04:05:06
1                      1  test_follower1 1999-01-08 04:05:06
2    1198375711133884416    banks_mikado 2019-11-23 23:00:08
3    1198375272934002688    banks_mikado 2019-11-23 22:58:24
4    1198372948207448065    banks_mikado 2019-11-23 22:49:10
..                   ...             ...                 ...
155  1196826202759036930      bluesrider 2019-11-19 16:22:57
156  1196824103430180864      bluesrider 2019-11-19 16:14:36
157  1196736413305196545      bluesrider 2019-11-19 10:26:09
158  1195792542735880192      bluesrider 2019-11-16 19:55:33
159  1195769858455035905      bluesrider 2019-11-16 18:25:25

[160 rows x 3 columns]
All words in words DB
                tweet_id        word  count  word_type
0    1198375711133884416    birthday      1          0
1    1198372948207448065    birthday      1          0
2    1197841311434448896   beauti

In [29]:
# Top WORD_FREQ_LIMIT nouns, summed over all stored tweets of the user's followers.
print("List of words that followers of {} talk about".format(user))
read_user_words_from_db(user, limit = WORD_FREQ_LIMIT)

List of words that followers of ewarren talk about


Unnamed: 0,word,sum_count
0,birthday,5
1,sexy,4
2,law,4
3,desk,3
4,police,3
5,hii,3
6,wa,2
7,marefu,2
8,washington,2
9,nice,2


In [35]:
# Same ranking, restricted to words whose highest stored word_type is PROPER_NOUN_WORD_TYPE.
print("List of proper noun topics that followers of {} talk about".format(user))
read_user_proper_words_from_db(user, limit = WORD_FREQ_LIMIT)

List of proper noun topics that followers of ewarren talk about


Unnamed: 0,word,sum_count
0,birthday,5
1,sexy,4
2,desk,3
3,police,3
4,senior,2
5,director,2
6,washington,2
7,happy,2
8,doj,2
9,nice,2


In [36]:
# Top WORD_FREQ_LIMIT hashtags, summed over all stored tweets of the user's followers.
print("List of hashtags that followers of {} use".format(user))
read_user_hashtags_from_db(user, limit = WORD_FREQ_LIMIT)

List of hashtags that followers of ewarren use


Unnamed: 0,word,sum_count
0,#npr,19
1,#job,19
2,#washdc,17
3,#ucorrupt,8
4,#editor,4
5,#shanghai,3
6,#correspondent,2
7,#director,2
8,#producer,2
9,#engagement,1
