# Check basic statistics of manually selected topics
**Objective**: make sure manually selected topics have high quality.  
 - Characteristic keywords: easy to recognize associated news.
 - Amount of discussion: reasonable size of associated news and tweets.
 - Consistent in meaning: no drift/dispersion in content.
 - Evolution of event: reasonable time-span of associated news.
 
Last modified: 2017-10-18

# Roadmap
1. Manually compile a list of topics with keywords
2. Check number of associated news and tweets for each topic
3. Check news titles and sample tweets of each topic
4. Check time-span of each topic

# Steps

In [2]:
"""
Initialization
"""

'''
Standard modules
'''
import os
import pickle
import sqlite3
import time
from pprint import pprint

'''
Analysis modules
'''
import pandas as pd


'''
Custom modules
'''
import config
import utilities

'''
Misc
'''
# Identifier of this notebook (date-author-purpose); not referenced by the
# cells visible here -- presumably used elsewhere to label outputs, TODO confirm.
nb_name = '20171011-daheng-check_topics_basic_statistics'

## Manually compile a list of topics with keywords
Topic information (category, name, keywords_lst) is manually compiled into config.MANUALLY_SELECTED_TOPICS_LST

In [2]:
"""
Print out manually selected topics information
"""
for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
    print('({}/{}) {}'.format(topic_ind+1, len(config.MANUALLY_SELECTED_TOPICS_LST), topic))

(1/51) {'category': 'politics', 'name': 'Hillary_Clinton_email_controversy', 'keywords_lst': [('email', 'e-mail'), ('Hillary', 'Clinton')]}
(2/51) {'category': 'politics', 'name': 'Iran_nuclear_deal', 'keywords_lst': ['Iran', 'nuclear']}
(3/51) {'category': 'politics', 'name': 'ISIS_Jihadi_John_identity_reveal', 'keywords_lst': ['Jihadi John']}
(4/51) {'category': 'politics', 'name': 'Ukraine_cease_fire', 'keywords_lst': [('cease-fire', 'ceasefire'), ('Ukraine', 'Russia')]}
(5/51) {'category': 'politics', 'name': 'Egypt_free_Al_Jazeera_journalist', 'keywords_lst': [('Al Jazeera', 'Egypt'), ('Peter Greste', 'journalist')]}
(6/51) {'category': 'politics', 'name': 'Keystone_XL_Pipeline_bill', 'keywords_lst': ['Keystone XL']}
(7/51) {'category': 'politics', 'name': 'CIA_Torture_Report', 'keywords_lst': ['Torture Report']}
(8/51) {'category': 'politics', 'name': 'Obama_cybersecurity_plan', 'keywords_lst': ['Obama', 'cyber']}
(9/51) {'category': 'politics', 'name': 'DHS_funding_issue', 'keyw

## Check number of associated news and tweets for each topic

### Build pickle for news_id and tweets_id associated with each topic

In [3]:
%%time
"""
Register
    TOPICS_LST_PKL = os.path.join(DATA_DIR, 'topics.lst.pkl')
in config.
"""
if 0 == 1:
    supplement_topics_lst = []
    
    '''
    Load in pickle for news data over selected period.
    '''
    news_period_df = pd.read_pickle(config.NEWS_PERIOD_DF_PKL)
 
    for topic_ind, topic in enumerate(config.MANUALLY_SELECTED_TOPICS_LST):
        localtime = time.asctime(time.localtime(time.time()))
        print('({}/{}) processing topic: {} ... {}'.format(topic_ind+1,
                                                           len(config.MANUALLY_SELECTED_TOPICS_LST),
                                                           topic['name'],
                                                           localtime))
        '''
        Match out associated news titles.
        '''
        asso_news_native_ids_lst = []
        for ind, row in news_period_df.iterrows():
            if utilities.news_title_match(row['news_title'], topic['keywords_lst'], verbose=False):
                asso_news_native_ids_lst.append(row['news_native_id'])
        
        topic['news_native_ids_lst'] = asso_news_native_ids_lst
        
        '''
        Query associated tweets
        '''
        asso_tweets_ids_lst = []
        
        query_news_tweets = '''
        select tweet_id from tweets
        where news_native_id = :news_native_id
        order by tweet_id asc;'''
        
        with sqlite3.connect(config.NEWS_TWEETS_DB_FILE) as conn:
            cursor = conn.cursor()
            for news_native_id in topic['news_native_ids_lst']:
                cursor.execute(query_news_tweets, {'news_native_id': news_native_id})
                tweets_ids_lst = [item[0] for item in cursor.fetchall()]
                asso_tweets_ids_lst.extend(tweets_ids_lst)
                
        topic['tweets_ids_lst'] = asso_tweets_ids_lst
        
        supplement_topics_lst.append(topic)
    
    '''
    Make pickle
    '''
    with open(config.TOPICS_LST_PKL, 'wb') as f:
        pickle.dump(supplement_topics_lst, f)

(1/51) processing topic: Hillary_Clinton_email_controversy ... Wed Oct 18 17:01:55 2017
(2/51) processing topic: Iran_nuclear_deal ... Wed Oct 18 17:02:03 2017
(3/51) processing topic: ISIS_Jihadi_John_identity_reveal ... Wed Oct 18 17:02:14 2017
(4/51) processing topic: Ukraine_cease_fire ... Wed Oct 18 17:02:22 2017
(5/51) processing topic: Egypt_free_Al_Jazeera_journalist ... Wed Oct 18 17:02:30 2017
(6/51) processing topic: Keystone_XL_Pipeline_bill ... Wed Oct 18 17:02:38 2017
(7/51) processing topic: CIA_Torture_Report ... Wed Oct 18 17:02:45 2017
(8/51) processing topic: Obama_cybersecurity_plan ... Wed Oct 18 17:02:53 2017
(9/51) processing topic: DHS_funding_issue ... Wed Oct 18 17:03:01 2017
(10/51) processing topic: US_Cuba_relationship ... Wed Oct 18 17:03:08 2017
(11/51) processing topic: 2015_CPAC ... Wed Oct 18 17:03:17 2017
(12/51) processing topic: Iraq_free_ISIS_Tikrit ... Wed Oct 18 17:03:25 2017
(13/51) processing topic: Nigeria_Boko_Haram_terrorists ... Wed Oct 18 

### Recover pickle and print number of news and tweets for each topic

In [4]:
"""
Test recover topics lst pkl
"""
if 0 == 1:
    with open(config.TOPICS_LST_PKL, 'rb') as f:
        topics_lst = pickle.load(f)
    
    for topic_ind, topic in enumerate(topics_lst):
        print('{} Topic_name: {}; news_num: {}; tweets_num: {}'.format(topic_ind,
                                                                       topic['name'],
                                                                       len(topic['news_native_ids_lst']),
                                                                       len(topic['tweets_ids_lst'])))

0 Topic_name: Hillary_Clinton_email_controversy; news_num: 228; tweets_num: 860564
1 Topic_name: Iran_nuclear_deal; news_num: 406; tweets_num: 2412264
2 Topic_name: ISIS_Jihadi_John_identity_reveal; news_num: 101; tweets_num: 620121
3 Topic_name: Ukraine_cease_fire; news_num: 84; tweets_num: 603709
4 Topic_name: Egypt_free_Al_Jazeera_journalist; news_num: 50; tweets_num: 129120
5 Topic_name: Keystone_XL_Pipeline_bill; news_num: 55; tweets_num: 117692
6 Topic_name: CIA_Torture_Report; news_num: 41; tweets_num: 167362
7 Topic_name: Obama_cybersecurity_plan; news_num: 73; tweets_num: 495576
8 Topic_name: DHS_funding_issue; news_num: 45; tweets_num: 104911
9 Topic_name: US_Cuba_relationship; news_num: 235; tweets_num: 1213314
10 Topic_name: 2015_CPAC; news_num: 68; tweets_num: 289774
11 Topic_name: Iraq_free_ISIS_Tikrit; news_num: 94; tweets_num: 567544
12 Topic_name: Nigeria_Boko_Haram_terrorists; news_num: 243; tweets_num: 954810
13 Topic_name: Ferguson_unrest; news_num: 611; tweets_num:

## Check news titles and sample tweets of each topic

In [3]:
"""
Recover pkl
"""
if 1 == 1:
    with open(config.TOPICS_LST_PKL, 'rb') as f:
        topics_lst = pickle.load(f)

In [4]:
"""
Select topic
"""
if 1 == 1:
    target_topic_ind = 26
    topic = topics_lst[target_topic_ind]

In [9]:
'''
Print associated news titles

For the currently selected topic, print the collected time and title of
(at most) its first 100 associated news, looked up by news_native_id.
'''
if 1 == 1:
    print('TOPIC: {}; KEYWORDS: {}'.format(topic['name'], topic['keywords_lst']))
    
    # limit to first 100 news
    news_native_ids_lst = topic['news_native_ids_lst'][:100]
    
    query_news = '''
    select news_title, news_collected_time from news
    where news_native_id = :news_native_id
    order by news_native_id asc;'''

    # "with sqlite3.connect(...)" only manages the transaction and leaves the
    # connection open, so close it explicitly when done.
    conn = sqlite3.connect(config.NEWS_TWEETS_DB_FILE)
    try:
        # sqlite3.Row lets the columns be read by name below.
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        for news_native_id in news_native_ids_lst:
            cursor.execute(query_news, {'news_native_id': news_native_id})
            for row in cursor.fetchall():
                print('{}: {}'.format(row['news_collected_time'], row['news_title']))
    finally:
        conn.close()

TOPIC: Hillary_Clinton_email_controversy; KEYWORDS: [('email', 'e-mail'), ('Hillary', 'Clinton')]
2015-03-02: Hillary Clinton used private email account for State Department business
2015-03-02: Hillary Clinton used personal email as Secretary of State
2015-03-02: Hillary Clinton used private email account for State Department business
2015-03-03: Hillary Clinton's Personal Email Use May Have Violated Federal Requirements ...
2015-03-03: Report: Clinton only used personal e-mail at State
2015-03-03: Hillary Clinton's use of private email address while Secretary of State draws scrutiny
2015-03-03: Hillary Clinton Used Personal Email Account at State Dept., Possibly Breaking Rules
2015-03-03: Clinton had no official State Dept. email address
2015-03-03: Clinton aide: State Department e-mails preserved
2015-03-03: Hillary Clinton illegally used a private email account, reports NYTimes (+video)
2015-03-03: Hillary Clinton used private e-mail for government business at State Dept.
2015-03-0

In [5]:
'''
Print associated tweets

For the currently selected topic, print the collected time and text of
(at most) its first 150 associated tweets, looked up by tweet_id.
'''
if 1 == 1:
    print('TOPIC: {}; KEYWORDS: {}'.format(topic['name'], topic['keywords_lst']))
    
    # limit to first 150 tweets
    tweets_ids_lst = topic['tweets_ids_lst'][:150]
    
    query_tweets = '''
    select tweet_text, tweet_collected_time from tweets
    where tweet_id = :tweet_id
    order by tweet_native_id asc;'''

    # "with sqlite3.connect(...)" only manages the transaction and leaves the
    # connection open, so close it explicitly when done.
    conn = sqlite3.connect(config.NEWS_TWEETS_DB_FILE)
    try:
        # sqlite3.Row lets the columns be read by name below.
        conn.row_factory = sqlite3.Row
        cursor = conn.cursor()
        for tweet_id in tweets_ids_lst:
            cursor.execute(query_tweets, {'tweet_id': tweet_id})
            for row in cursor.fetchall():
                print('{}: {}'.format(row['tweet_collected_time'], row['tweet_text']))
    finally:
        conn.close()

TOPIC: Super_Bowl; KEYWORDS: ['Super Bowl']
2014-11-23: Katy Perry to headline NFL's 2015 Super Bowl halftime show: NFL http://t.co/es7YFYORsj
2014-11-23: Katy Perry to perform at #SuperBowlHalftimeShow  @katyperry http://t.co/fJWgC1ihZB
2014-11-23: "@pepsi: It's official! @katyperry will perform at the Pepsi Super Bowl XLIX #Halftime Show: http://t.co/w7tIYHMW6M http://t.co/uTpO5bjvPm"
2014-11-23: Katy Perry Hyped for Super Bowl Halftime Show, Watch Her Sneak Peek Promo Video http://t.co/BMLq88gMyj
2014-11-23: “@nfl: .@katyperry will headline @pepsi @superbowl #halftime show: http://t.co/LpHf78vPSz http://t.co/8nLt3tv7aw” 😍😁 yaaa
2014-11-23: Katy Perry ‘Testing Out Ideas’ For 2015 Super Bowl Halftime Show Performance: Rumors about… http://t.co/jIS7yM9JAT
2014-11-23: #NFL finally announces #KatyPerry as next #SuperBowl halftime performer: Katy Perry, as long rumored, will headline… http://t.co/CosnhXgri5
2014-11-23: http://t.co/cVWnDmsz82 NFL Announces Katy Perry Will Be Super Bowl Hal