# Finding popular political hashtags from existing data

We want to find the most common hashtags that appear in the tweets we've gathered so far and add the politically relevant ones to our filter.

In [135]:
import numpy as np
import pandas as pd
import build_hashtag_count_dict as build
import pickle
import json
import os

## Create dict for top hashtags in history

In [142]:
# Gather the name of every entry in the directory that holds the pickled
# hashtag -> count dicts (each one is unpickled by the next cell).
directory = 'hashtag2count_dicts'
file_names = list(os.listdir(directory))

In [163]:
# Merge the hashtag -> count dict stored in each file into one running total.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this on files this project produced itself.
hashtag_counts_historical = {}
for file_name in file_names:
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open('{}/{}'.format(directory, file_name), 'rb') as f:
        data = pickle.load(f)
    for hashtag, count in data.items():
        if hashtag in hashtag_counts_historical:
            hashtag_counts_historical[hashtag] += count
        else:
            hashtag_counts_historical[hashtag] = count

In [164]:
# Display the merged hashtag -> total count mapping.
hashtag_counts_historical

{'Trump': 2338536,
 'BLUE': 1158532,
 '生きろ': 3296685,
 'ゼロ一獲千金ゲーム': 1187468,
 'NEWS': 4027583,
 'OnlyYouぼくらのROMEOandJULIET': 14117,
 'hacking': 1163,
 'RussianHacking': 4366,
 'TCMA': 138,
 'vote': 115480,
 'Cascade': 439173,
 '夜よ踊れ': 523038,
 'NEWSな2人': 30631,
 'NowPlaying': 679647,
 'nowplaying': 92408,
 'Music': 21667,
 'Konpa': 2235,
 'Zouk': 2157,
 'Haiti': 5561,
 'USA': 1086804,
 'FBR': 107656,
 'Resistance': 289343,
 'FBRParty': 163527,
 'BlueWave': 290221,
 'DemForce': 12730,
 'news': 1346150,
 'popular': 2592,
 'MAGA': 4055406,
 'tbs': 46568,
 'tbs_news': 45630,
 'japan': 51247,
 'テンションMAXっす': 33,
 '小山慶一郎': 384033,
 '手越祐也': 402506,
 '増田貴久': 367467,
 '加藤シゲアキ': 563138,
 'RepresentNEWSMix': 282,
 'NEWSニッポン': 262849,
 'StandUp': 194195,
 'BuildtheWall': 10376,
 'MAGA2018': 23093,
 'Veterans': 174868,
 'Veteran': 38610,
 'QANON': 40053,
 'SOMETHINGBIG': 57,
 'Truth': 198827,
 'TeamLead': 74,
 'salary': 1061,
 'ScrumMaster': 11,
 'Houston': 15433,
 'Texas': 303960,
 'Dallas': 15548,

## Convert the dictionary to a list of tuples with (hashtag, count) format

In [152]:
def sort_dict_by_value_to_tuples(d, reverse=True):
    """Return the items of ``d`` as ``(key, value)`` tuples ordered by value.

    Args:
        d: Dict whose values can be compared with each other.
        reverse: When True (the default) the largest value comes first;
            when False the smallest value comes first.

    Returns:
        A new list of ``(key, value)`` tuples. Keys with equal values keep
        the order in which they appear in ``d``; ``d`` itself is unchanged.
    """
    pairs = sorted(d.items(), key=lambda pair: pair[1], reverse=reverse)
    return pairs

In [171]:
# Order the merged counts from largest to smallest (reverse defaults to True)
# and display the resulting (hashtag, count) tuples.
hashtag_counts_historical_tuples = sort_dict_by_value_to_tuples(hashtag_counts_historical)
hashtag_counts_historical_tuples

[('MAGA', 4055406),
 ('NEWS', 4027583),
 ('生きろ', 3296685),
 ('Trump', 2338536),
 ('news', 1346150),
 ('ゼロ一獲千金ゲーム', 1187468),
 ('POTUS', 1179791),
 ('BLUE', 1158532),
 ('USA', 1086804),
 ('LVE', 960586),
 ('BackfireTrump', 857634),
 ('Strawberry', 828151),
 ('エンドレス・サマー', 795007),
 ('News', 697437),
 ('NowPlaying', 679647),
 ('加藤シゲアキ', 563138),
 ('夜よ踊れ', 523038),
 ('KAG', 498594),
 ('BringBacktheSummer', 477132),
 ('希望～Yell～', 461988),
 ('Cascade', 439173),
 ('手越祐也', 402506),
 ('QAnon', 400646),
 ('WalkAway', 390493),
 ('小山慶一郎', 384033),
 ('FBRparty', 371077),
 ('増田貴久', 367467),
 ('Obama', 344768),
 ('WWG1WGA', 343479),
 ('maga', 339447),
 ('GOP', 338408),
 ('truth', 322066),
 ('AmericaFirst', 321801),
 ('np', 319505),
 ('Texas', 303960),
 ('BlueWave', 290221),
 ('Resistance', 289343),
 ('Patriots', 283965),
 ('Vote', 272670),
 ('America', 267256),
 ('NEWSニッポン', 262849),
 ('trump', 245121),
 ('2A', 244130),
 ('Bring', 242875),
 ('希望〜Yell〜', 236318),
 ('BuildTheWall', 231414),
 ('TrumpTra

## Put the top 1000 hashtags that we aren't already filtering for in a file

In [174]:
def get_hashtags_not_in_lst(top_hashtags_and_counts, hashtag_file='political_hashtags_2018.txt'):
    """Return the hashtags that are not already listed in the filter file.

    Args:
        top_hashtags_and_counts: Iterable of ``(hashtag, count)`` pairs. Only
            the hashtag is used, and the input order is preserved.
        hashtag_file: Path to a text file with one filter entry per line.
            The first character of every line is dropped (presumably a
            leading '#' -- confirm against the file).

    Returns:
        List of hashtags, in their original casing, whose lower-cased form
        does not match any lower-cased entry read from ``hashtag_file``.
    """
    with open(hashtag_file, 'r') as f:
        # Strip only the newline before dropping the first character, so a
        # final line without a trailing newline keeps its last character.
        # A set makes each membership test below constant time.
        known_hashtags = {line.rstrip('\n')[1:].lower() for line in f}
    return [
        hashtag
        for hashtag, _ in top_hashtags_and_counts
        if hashtag.lower() not in known_hashtags
    ]

In [177]:
# Keep only the hashtags from the sorted historical counts that the filter
# file does not already contain.
top_hashtags_not_in_lst = get_hashtags_not_in_lst(hashtag_counts_historical_tuples)

In [180]:
# Write the first 1000 not-yet-filtered hashtags, one per line.
# The with-block closes the file even if one of the writes raises.
with open('top_1000_hashtags_not_in_political_hashtags_historical.txt', 'w+') as f:
    for hashtag in top_hashtags_not_in_lst[:1000]:
        f.write(hashtag + '\n')

# Work below here is only for single days

## Examine top hashtags

Figure out which hashtags are among the top hashtags but aren't on our list.

In [133]:
def get_top_hashtags_and_counts_for_file(file_name, top_n=1000):
    """Load a pickled dict keyed by count and return its largest keys.

    Args:
        file_name: Path to a pickle file holding a dict keyed by count.
            Presumably each value is the list of hashtags seen that many
            times -- confirm against the code that writes the file.
        top_n: Maximum number of distinct counts to return (default 1000).
            Values of 0 or less return an empty list.

    Returns:
        List of ``(count, data[count])`` tuples in ascending order of count,
        covering at most the ``top_n`` largest keys of the loaded dict.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only use this on files produced by this project.
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(file_name, 'rb') as f:
        data = pickle.load(f)
    if top_n <= 0:
        # A slice of [-0:] would return every key, so handle this explicitly.
        return []
    largest_counts = sorted(data)[-top_n:]
    return [(count, data[count]) for count in largest_counts]

In [134]:
# Keep the result in a variable: a later cell iterates over
# `top_hashtags_and_counts`, and no other visible cell assigns it, so a fresh
# kernel would otherwise raise NameError there.
top_hashtags_and_counts = get_top_hashtags_and_counts_for_file('counts2hashtag_2018-09-19.pkl')
top_hashtags_and_counts

[(1,
  ['TradeTarrifs',
   'Replay',
   'edlandscapes',
   'itswhatsontheoutsidethatcounts',
   'mudgee',
   'abattoir',
   'OvenStoveCleaners',
   'MuéveteEnMéxico',
   'militaryhalloffame',
   'bcbravehearts',
   'protector',
   '150Billion',
   'BankOfRussia',
   'Blacklist',
   'IMPEACHTRUMPNOW',
   'IncompetentPOTUS',
   'factoryfarming',
   'BlogPost',
   'NunesIsATraitor',
   'NunesIsALeaker',
   'VoteJanzforCongress',
   'TheDocumentsAreComing',
   'IGI',
   'steeldossier',
   'rEPUBLICAN',
   'ironicmemes',
   'UseYourHead',
   'guargopaldas',
   'goFast',
   'Walkawayfromdemocrats2018',
   'itsokwhenitsaconservative',
   'DIDDLERSDOWN',
   'MakeaDifference',
   'STEPSocal',
   'CrisisOfCharacter',
   'Beto4Veterans',
   'drugskill',
   'SomeDrugsArentCool',
   'NoSelfRespect',
   'chevelle',
   'liveatthenorva',
   'Creepy',
   'DemonratsAreSorosPaidPuppets',
   'OTC',
   'wusses',
   'betamales',
   'TearsDown',
   'TRAMPTRAIN',
   'TRUMPSTAIN',
   'CaitlynJenner',
   'VotoL

In [111]:
# Read the filter file into a list of lower-cased entries.
# The first character of every line is dropped (presumably a leading '#' --
# confirm against the file).
hashtag_lst = []
with open('political_hashtags_2018.txt', 'r') as f:
    for line in f:
        # Strip only the newline before dropping the first character, so a
        # final line without a trailing newline keeps its last character.
        hashtag_lst.append(line.rstrip('\n')[1:].lower())

In [112]:
# Display the lower-cased filter entries.
hashtag_lst

['makeamericagreatagain',
 'dumptrump',
 'nevertrump',
 'donaldtrumpthemovie',
 'trumpsacrifices',
 'donaldtrumpwantstobanghisdaughter',
 'trumpyourcat',
 'trumpsajoke',
 'makedonalddrumpfagain',
 'lovetrumpshate',
 'boycotttrump',
 'trumptrain',
 'whentrumpiselected',
 'antitrump',
 'trump666',
 'battrump',
 'saferthanatrumprally',
 'trumpsopoor',
 'famousmelaniatrumpquotes',
 'trumpolympics',
 'faketrumpintelligencebriefing',
 'weakdonald',
 'sociopathinchief',
 'trumpdebateexcuses',
 'dumbdonald',
 'traitortrump',
 'maga',
 'trumptrain',
 'mikepence',
 'teaparty',
 'donaldjtrump',
 'donaldtrump',
 'defenddonald',
 'trump',
 'secondamendment',
 '2ndamendment',
 '2a',
 'defendthesecond',
 'onenationundergod',
 'righttobeararms',
 'donttreadonme',
 'red',
 'rightwing',
 'right',
 'conservatives',
 'prolife',
 'progod',
 'progun',
 'gunrights',
 'donaldtrump',
 'latinosfortrump',
 'gaysfortrump',
 'votetrump',
 'freemilo',
 'alllivesmatter',
 'buildthewall',
 'hillaryclinton',
 'obama',

Get popular hashtags that don't show up in our list of hashtags.

In [113]:
# Collect every hashtag from the top counts whose lower-cased form is not
# already in the filter list; the input order is preserved.
known_hashtags = set(hashtag_lst)  # set lookup avoids rescanning the list per hashtag
hashtags_not_in_lst = [
    hashtag
    for _, hashtags in top_hashtags_and_counts
    for hashtag in hashtags
    if hashtag.lower() not in known_hashtags
]

In [114]:
# Display the filter list again (the previous cell only reads it).
hashtag_lst

['makeamericagreatagain',
 'dumptrump',
 'nevertrump',
 'donaldtrumpthemovie',
 'trumpsacrifices',
 'donaldtrumpwantstobanghisdaughter',
 'trumpyourcat',
 'trumpsajoke',
 'makedonalddrumpfagain',
 'lovetrumpshate',
 'boycotttrump',
 'trumptrain',
 'whentrumpiselected',
 'antitrump',
 'trump666',
 'battrump',
 'saferthanatrumprally',
 'trumpsopoor',
 'famousmelaniatrumpquotes',
 'trumpolympics',
 'faketrumpintelligencebriefing',
 'weakdonald',
 'sociopathinchief',
 'trumpdebateexcuses',
 'dumbdonald',
 'traitortrump',
 'maga',
 'trumptrain',
 'mikepence',
 'teaparty',
 'donaldjtrump',
 'donaldtrump',
 'defenddonald',
 'trump',
 'secondamendment',
 '2ndamendment',
 '2a',
 'defendthesecond',
 'onenationundergod',
 'righttobeararms',
 'donttreadonme',
 'red',
 'rightwing',
 'right',
 'conservatives',
 'prolife',
 'progod',
 'progun',
 'gunrights',
 'donaldtrump',
 'latinosfortrump',
 'gaysfortrump',
 'votetrump',
 'freemilo',
 'alllivesmatter',
 'buildthewall',
 'hillaryclinton',
 'obama',

In [115]:
# Display the hashtags that are missing from the filter list.
hashtags_not_in_lst

['TradeTarrifs',
 'Replay',
 'edlandscapes',
 'itswhatsontheoutsidethatcounts',
 'mudgee',
 'abattoir',
 'OvenStoveCleaners',
 'MuéveteEnMéxico',
 'militaryhalloffame',
 'bcbravehearts',
 'protector',
 '150Billion',
 'BankOfRussia',
 'Blacklist',
 'IMPEACHTRUMPNOW',
 'IncompetentPOTUS',
 'factoryfarming',
 'BlogPost',
 'NunesIsATraitor',
 'NunesIsALeaker',
 'VoteJanzforCongress',
 'TheDocumentsAreComing',
 'IGI',
 'steeldossier',
 'ironicmemes',
 'UseYourHead',
 'guargopaldas',
 'goFast',
 'Walkawayfromdemocrats2018',
 'itsokwhenitsaconservative',
 'DIDDLERSDOWN',
 'MakeaDifference',
 'STEPSocal',
 'CrisisOfCharacter',
 'Beto4Veterans',
 'drugskill',
 'SomeDrugsArentCool',
 'NoSelfRespect',
 'chevelle',
 'liveatthenorva',
 'Creepy',
 'DemonratsAreSorosPaidPuppets',
 'OTC',
 'wusses',
 'betamales',
 'TearsDown',
 'TRAMPTRAIN',
 'TRUMPSTAIN',
 'CaitlynJenner',
 'VotoLatino',
 'Nuthatch',
 'species',
 'AajKeSamachar',
 'unethical',
 'atlas',
 'スチーム',
 'valve',
 'Kiina',
 'Euroopan',
 'Lum

In [117]:
# Reverse the list in place.
# NOTE(review): this mutates the existing list, so running this cell a second
# time flips the order back again.
hashtags_not_in_lst.reverse()

In [118]:
# Display the list after the in-place reversal.
hashtags_not_in_lst

['生きろ',
 'BackfireTrump',
 'Kavanaugh',
 'QAnon',
 'WWG1WGA',
 'LVE',
 'KAG',
 'ゼロ一獲千金ゲーム',
 'Strawberry',
 'DrainTheSwamp',
 'WalkAway',
 'BLUE',
 'StrayKids',
 'エンドレス・サマー',
 'choreography',
 'FBRParty',
 'PatriotsFight',
 'MeTooHucksters',
 'FollowBackResistance',
 'QAlert',
 'WednesdayWisdom',
 'ConfirmKavanaugh',
 'Resist',
 'Pennsylvania',
 'OPMAYFLOWER',
 'GOT7',
 'FISAGate',
 'GoodTRUMPsEvil',
 'MAGAbots',
 'Lullaby',
 'PresentYou',
 'Virginia',
 'Senator',
 'DigitalSoldiers',
 'Hirono',
 'Sexist',
 'Mazie',
 'NowPlaying',
 'np',
 'Louisiana',
 'AAUSA',
 'job',
 'ElectoralCollege',
 'FakeNews',
 'Democrats',
 'Qanon',
 'SCOTUS',
 '45sindespidos',
 '希望～Yell～',
 'ChristineBlaseyFord',
 '加藤シゲアキ',
 'QArmy',
 '小山慶一郎',
 'BringBacktheSummer',
 'FBI',
 'PatriotsUnited',
 'japan',
 'tcot',
 'NewYork',
 'WeThePeople',
 'tbs',
 'tbs_news',
 'FBRparty',
 'Trump2020',
 'BrettKavanaugh',
 'cbc',
 '手越祐也',
 'CareerArc',
 'TheGreatAwakening',
 'MeToo',
 'hiring',
 'DeepState',
 'China',
 'Missou

In [119]:
# Write every hashtag in the list, one per line (no 1000-item slice is applied
# here). The with-block closes the file even if one of the writes raises.
with open('top_1000_hashtags_not_in_political_hashtags_2018_09-19.txt', 'w+') as f:
    for hashtag in hashtags_not_in_lst:
        f.write(hashtag + '\n')