# Finding popular political hashtags from existing data

We want to find the most common hashtags that appear in the tweets we've gathered so far and add the politically relevant ones to our filter.

In [0]:
import numpy as np
import pandas as pd
import build_hashtag_count_dict as build
import pickle
import json
import os

## Build a dict of total hashtag counts across all historical data

In [0]:
# Folder of pickled hashtag-count dicts that the next cell merges.
directory = 'hashtag2count_dicts'

# Sorted so the merge order (and therefore tie-breaking in later sorts) is
# reproducible -- os.listdir() returns entries in arbitrary order. Hidden
# entries and sub-directories (e.g. .ipynb_checkpoints, .DS_Store) are skipped
# because they are not pickle files and would make the load step fail.
file_names = sorted(
    entry for entry in os.listdir(directory)
    if not entry.startswith('.')
    and os.path.isfile(os.path.join(directory, entry))
)

In [0]:
# Merge every per-file dict into one running total per hashtag.
# NOTE(security): pickle.load can execute arbitrary code; only run this on
# files this project produced itself.
hashtag_counts_historical = {}
for file_name in file_names:
    # The with-block closes the file, so no explicit close() is needed.
    with open(os.path.join(directory, file_name), 'rb') as f:
        data = pickle.load(f)
    for hashtag, count in data.items():
        hashtag_counts_historical[hashtag] = hashtag_counts_historical.get(hashtag, 0) + count

In [0]:
# Preview only: the merged dict can be very large, so show its size and the
# first few entries instead of dumping every hashtag into the notebook.
print(len(hashtag_counts_historical), 'distinct hashtags')
dict(list(hashtag_counts_historical.items())[:20])

## Convert the dictionary to a list of tuples with (hashtag, count) format

In [0]:
def sort_dict_by_value_to_tuples(d, reverse=True):
    """Return the dict's ``(key, value)`` pairs ordered by value.

    Args:
        d: dict whose values are mutually comparable.
        reverse: when True (the default) the largest value comes first.

    Returns:
        list of ``(key, value)`` tuples; keys with equal values keep the
        order in which they appear in ``d``.
    """
    pairs = list(d.items())
    pairs.sort(key=lambda pair: pair[1], reverse=reverse)
    return pairs

In [0]:
# Rank hashtags by total count; the helper sorts in descending order by default.
hashtag_counts_historical_tuples = sort_dict_by_value_to_tuples(hashtag_counts_historical)
# Only the head of the ranking is displayed to keep the notebook output readable.
hashtag_counts_historical_tuples[:20]

## Put the top 1000 hashtags that we aren't already filtering for in a file

In [0]:
def get_hashtags_not_in_lst(top_hashtags_and_counts):
    """Return the hashtags that are missing from the political hashtag filter.

    Reads ``political_hashtags_2018.txt`` from the working directory (one
    hashtag per line, optionally prefixed with ``#``) and compares
    case-insensitively.

    Args:
        top_hashtags_and_counts: iterable of ``(hashtag, count)`` pairs; only
            the hashtag is used and the input order is preserved.

    Returns:
        list of hashtags (in their original casing) that are not in the
        filter file.
    """
    known_hashtags = set()  # set gives O(1) membership checks below
    with open('political_hashtags_2018.txt', 'r') as f:
        for line in f:
            tag = line.strip()
            if not tag:
                continue  # ignore blank lines
            # Drop only a real '#' prefix. Slicing with [1:-1] used to cut the
            # final character of a last line that has no trailing newline.
            if tag.startswith('#'):
                tag = tag[1:]
            known_hashtags.add(tag.lower())
    return [
        hashtag for hashtag, _ in top_hashtags_and_counts
        if hashtag.lower() not in known_hashtags
    ]

In [0]:
# Input is the (hashtag, count) list built above; the helper keeps that order
# and drops every hashtag already present in political_hashtags_2018.txt.
top_hashtags_not_in_lst = get_hashtags_not_in_lst(hashtag_counts_historical_tuples)

In [0]:
# Write the first 1000 not-yet-filtered hashtags, one per line.
# The with-block guarantees the file is flushed and closed even if a write
# fails; plain 'w' is enough because the file is never read back here.
with open('top_1000_hashtags_not_in_political_hashtags_historical.txt', 'w') as f:
    for hashtag in top_hashtags_not_in_lst[:1000]:
        f.write(hashtag + '\n')

# Work below here is only for single days

## Examine top hashtags

Figure out which of the day's top hashtags aren't on our list yet.

In [0]:
def get_top_hashtags_and_counts_for_file(file_name, top_n=1000):
    """Load a pickled dict keyed by count and return its highest-count entries.

    Args:
        file_name: path to a pickle file holding a dict whose keys are counts
            (values are presumably the hashtags seen that many times --
            confirm against the code that writes these files).
        top_n: how many of the largest counts to keep (default 1000).

    Returns:
        list of ``(count, data[count])`` tuples in ascending count order,
        i.e. the largest count is last. Empty when ``top_n`` is not positive.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only use it on
    # files this project produced itself.
    with open(file_name, 'rb') as f:
        data = pickle.load(f)
    if top_n <= 0:
        # Guard: a [-0:] slice would return everything instead of nothing.
        return []
    top_counts = sorted(data)[-top_n:]
    return [(count, data[count]) for count in top_counts]

In [0]:
# Bind the result: the filtering cell further down reads `top_hashtags_and_counts`,
# which was otherwise never defined and raised NameError on a fresh kernel.
top_hashtags_and_counts = get_top_hashtags_and_counts_for_file('counts2hashtag_2018-09-19.pkl')
# The list is in ascending count order, so the tail holds the highest counts.
top_hashtags_and_counts[-20:]

In [0]:
# Load the hashtags we already filter on, lower-cased and without the leading '#'.
hashtag_lst = []
with open('political_hashtags_2018.txt', 'r') as f:
    for line in f:
        tag = line.strip()
        if not tag:
            continue  # ignore blank lines
        # Drop only a real '#' prefix. Slicing with [1:-1] used to cut the
        # final character of a last line that has no trailing newline.
        if tag.startswith('#'):
            tag = tag[1:]
        hashtag_lst.append(tag.lower())

In [0]:
# Spot-check the parsed filter list (first entries only, to keep output short).
hashtag_lst[:20]

Get popular hashtags that don't show up in our list of hashtags.

In [0]:
# Collect the popular hashtags that our filter list does not contain yet.
# Expects `top_hashtags_and_counts` to hold the (count, hashtags) tuples
# returned by get_top_hashtags_and_counts_for_file.
known_hashtags = set(hashtag_lst)  # O(1) lookups instead of scanning the list per hashtag
hashtags_not_in_lst = [
    hashtag
    for _, hashtags in top_hashtags_and_counts
    for hashtag in hashtags
    if hashtag.lower() not in known_hashtags
]

In [0]:
# Same display as the earlier `hashtag_lst` cell; the filtering loop above
# only reads this list and does not modify it.
hashtag_lst

In [0]:
# Preview of the unfiltered hashtags in the order they were collected.
hashtags_not_in_lst[:20]

In [0]:
# Reverses the list in place so the entries collected last come first
# (presumably the highest counts, since the per-file helper returns counts in
# ascending order -- confirm).
# NOTE(review): this cell is not idempotent; running it a second time flips
# the order back.
hashtags_not_in_lst.reverse()

In [0]:
# Preview the head of the list after the reversal in the previous cell.
hashtags_not_in_lst[:20]

In [0]:
# Write the single-day candidates, one hashtag per line.
# Capped at 1000 entries to match the "top_1000" file name and the historical
# export; the with-block closes the file even if a write fails.
with open('top_1000_hashtags_not_in_political_hashtags_2018_09-19.txt', 'w') as f:
    for hashtag in hashtags_not_in_lst[:1000]:
        f.write(hashtag + '\n')