In [1]:
## Importing libraries
import pickle
import pandas as pd
import numpy as np
import csv
import nltk
# from nltk.corpus import stopwords
from nltk.stem import *
from nltk.tokenize import MWETokenizer
import math
from nltk.util import ngrams
# nltk.download('stopwords')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [29]:
## Reading in the data
df1 = pd.read_pickle('testing_data_static_2022-04-10.pkl') # extract pkl file 1
df2 = pd.read_pickle('testing_data_update_2022-04-10_2023-04-10.pkl') # extract pkl file 2
testing_data = pd.concat([df1,df2],ignore_index=True)
testing_data = testing_data.sort_values(by='authoredAt').reset_index(drop=True)
testing_data.to_pickle('updated_testing_data.pkl')
sm_df = testing_data

In [24]:
## Sentiment Analysis (VADER)
# authoredAt column datetime manipulation for timeseries grouping
sm_df['authoredAt'] = pd.to_datetime(sm_df['authoredAt'])
sm_df['authoredAt'] = sm_df['authoredAt'].dt.date.astype('datetime64[ns]')
sm_df['weekAuthored'] = sm_df['authoredAt'].dt.isocalendar().week

platform_list = sm_df['platform'].unique()
analyzer = SentimentIntensityAnalyzer()

sm_df['negative'] = None
sm_df['positive'] = None
sm_df['compound'] = None
sm_df['sentiment'] = None

index = len(sm_df) - 1
while index >= 0:
    timeNotValid = False
    sentimentNotValid = False
    
    if pd.isnull(sm_df.at[index, 'weekAuthored']) or not isinstance(sm_df.at[index, 'authoredAt'], pd.Timestamp):
        # Check if 'weekAuthored' is null or 'authoredAt' is not of datetime type
        # If any of the conditions are true, update the values
        sm_df.at[index, 'authoredAt'] = pd.to_datetime(sm_df.at[index, 'authoredAt'], errors='coerce')
        sm_df.at[index, 'authoredAt'] = sm_df.at[index, 'authoredAt'].date().astype('datetime64[ns]')
        timeNotValid = True
    
    if (sm_df.at[index, 'negative'] is None) or (sm_df.at[index, 'positive'] is None) \
       or (sm_df.at[index, 'neutral'] is None) or (sm_df.at[index, 'compound'] is None):
        text = sm_df.at[index, 'content']
        sm_df.at[index, 'sentiment'] = analyzer.polarity_scores(text)
        sm_df.at[index, 'negative'] = sm_df.at[index, 'sentiment']['neg']
        sm_df.at[index, 'positive'] = sm_df.at[index, 'sentiment']['pos']
        sm_df.at[index, 'neutral'] = sm_df.at[index, 'sentiment']['neu']
        sm_df.at[index, 'compound'] = sm_df.at[index, 'sentiment']['compound']
        sentimentNotValid = True

    if not timeNotValid and not sentimentNotValid:
        break
        
    index -= 1
    
# One-Hot Encoding Account Labels
unique_values = set(val for sublist in sm_df['labels'] for val in sublist)
# print(unique_values)
for value in unique_values:
    sm_df[value] = sm_df['labels'].apply(lambda x: 1 if value in x else 0)

In [25]:
stopWords = {
    "'ll", "'tis", "'twas", "'ve", "10", "39", "a", "a's", "able", "ableabout", "about", "above", "abroad",
    "abst", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj",
    "adopted", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against",
    "ago", "ah", "ahead", "ai", "ain't", "aint", "al", "all", "allow", "allows", "almost", "alone", "along",
    "alongside", "already", "also", "although", "always", "am", "amid", "amidst", "among", "amongst", "amoungst",
    "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything",
    "anyway", "anyways", "anywhere", "ao", "apart", "apparently", "appear", "appreciate", "appropriate",
    "approximately", "aq", "ar", "are", "area", "areas", "aren", "aren't", "arent", "arise", "around", "arpa",
    "as", "aside", "ask", "asked", "asking", "asks", "associated", "at", "au", "auth", "available", "aw", "away",
    "awfully", "az", "b", "ba", "back", "backed", "backing", "backs", "backward", "backwards", "bb", "bd", "be",
    "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "began", "begin",
    "beginning", "beginnings", "begins", "behind", "being", "beings", "believe", "below", "beside", "besides",
    "best", "better", "between", "beyond", "bf", "bg", "bh", "bi", "big", "bill", "billion", "biol", "bj", "bm",
    "bn", "bo", "both", "bottom", "br", "brief", "briefly", "bs", "bt", "but", "buy", "bv", "bw", "by", "bz",
    "c", "c'mon", "c's", "ca", "call", "came", "can", "can't", "cannot", "cant", "caption", "case", "cases",
    "cause", "causes", "cc", "cd", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "ck", "cl", "clear",
    "clearly", "click", "cm", "cmon", "cn", "co", "co.", "com", "come", "comes", "computer", "con", "concerning",
    "consequently", "consider", "considering", "contain", "containing", "contains", "copy", "corresponding",
    "could", "could've", "couldn", "couldn't", "couldnt", "course", "cr", "cry", "cs", "cu", "currently", "cv",
    "cx", "cy", "cz", "d", "dare", "daren't", "darent", "date", "de", "dear", "definitely", "describe", "described",
    "despite", "detail", "did", "didn", "didn't", "didnt", "differ", "different", "differently", "directly", "dj",
    "dk", "dm", "do", "does", "doesn", "doesn't", "doesnt", "doing", "don", "don't", "done", "dont", "doubtful",
    "down", "downed", "downing", "downs", "downwards", "due", "during", "dz", "e", "each", "early", "ec", "ed",
    "edu", "ee", "effect", "eg", "eh", "eight", "eighty", "either", "eleven", "else", "elsewhere", "empty", "end",
    "ended", "ending", "ends", "enough", "entirely", "er", "es", "especially", "et", "et-al", "etc", "even",
    "evenly", "ever", "evermore", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly",
    "example", "except", "f", "face", "faces", "fact", "facts", "fairly", "far", "farther", "felt", "few", "fewer",
    "ff", "fi", "fifteen", "fifth", "fifty", "fify", "fill", "find", "finds", "fire", "first", "five", "fix", "fj",
    "fk", "fm", "fo", "followed", "following", "follows", "for", "forever", "former", "formerly", "forth", "forty",
    "forward", "found", "four", "fr", "free", "from", "front", "full", "fully", "further", "furthered",
    "furthering", "furthermore", "furthers", "fx", "g", "ga", "gave", "gb", "gd", "ge", "general", "generally",
    "get", "gets", "getting", "gf", "gg", "gh", "gi", "give", "given", "gives", "giving", "gl", "gm", "gmt", "gn",
    "go", "goes", "going", "gone", "good", "goods", "got", "gotten", "gov", "gp", "gq", "gr", "great", "greater",
    "greatest", "greetings", "group", "grouped", "grouping", "groups", "gs", "gt", "gu", "gw", "gy", "h", "had",
    "hadn", "hadn't", "hadnt", "half", "happens", "hardly", "has", "hasn", "hasn't", "hasnt", "have", "haven",
    "haven't", "havent", "having", "he", "he'd", "he'll", "he's", "hed", "hell", "hello", "help",     "hence", "her", "here", "here's", "hereafter", "hereby", "herein", "heres", "hereupon", "hers", "herself",
    "herse", "hes", "hi", "hid", "high", "higher", "highest", "him", "himself", "himse", "his", "hither", "hk",
    "hm", "hn", "home", "homepage", "hopefully", "how", "how'd", "how'll", "how's", "howbeit", "however", "hr",
    "ht", "htm", "html", "http", "hu", "hundred", "i", "i'd", "i'll", "i'm", "i've", "i.e.", "id", "ie", "if",
    "ignored", "ii", "il", "ill", "im", "immediate", "immediately", "importance", "important", "in", "inasmuch",
    "inc", "inc.", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "inside",
    "insofar", "instead", "int", "interest", "interested", "interesting", "interests", "into", "invention",
    "inward", "io", "iq", "ir", "is", "isn", "isn't", "isnt", "it", "it'd", "it'll", "it's", "itd", "itll",
    "its", "itself", "itse", "ive", "j", "je", "jm", "jo", "join", "jp", "just", "k", "ke", "keep", "keeps",
    "kept", "keys", "kg", "kh", "ki", "kind", "km", "kn", "knew", "know", "known", "knows", "kp", "kr", "kw",
    "ky", "kz", "l", "la", "large", "largely", "last", "lately", "later", "latest", "latter", "latterly", "lb",
    "lc", "least", "length", "less", "lest", "let", "let's", "lets", "li", "like", "liked", "likely", "likewise",
    "line", "little", "lk", "ll", "long", "longer", "longest", "look", "looking", "looks", "low", "lower", "lr",
    "ls", "lt", "ltd", "lu", "lv", "ly", "m", "ma", "made", "mainly", "make", "makes", "making", "man", "many",
    "may", "maybe", "mayn't", "maynt", "mc", "md", "me", "mean", "means", "meantime", "meanwhile", "member",
    "members", "men", "merely", "mg", "mh", "microsoft", "might", "might've", "mightn", "mightn't", "mightnt",
    "mil", "mill", "million", "mine", "minus", "miss", "mk", "ml", "mm", "mn", "mo", "more", "moreover", "most",
    "mostly", "move", "mp", "mq", "mr", "mrs", "ms", "msie", "mt", "mu", "much", "mug", "must", "must've",
    "mustn", "mustn't", "mustnt", "mv", "mw", "mx", "my", "myself", "myse", "mz", "n", "na", "name", "namely",
    "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needed", "needing",
    "needn't", "neednt", "needs", "neither", "net", "netscape", "never", "neverf", "neverless", "nevertheless",
    "new", "newer", "newest", "next", "nf", "ng", "ni", "nine", "ninety", "nl", "no", "no-one", "nobody", "non",
    "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "notwithstanding",
    "novel", "now", "nowhere", "np", "nr", "nu", "null", "number", "numbers", "nz", "o", "obtain", "obtained",
    "obviously", "of", "off", "often", "oh", "ok", "okay", "old", "older", "oldest", "om", "omitted", "on", "once",
    "one", "one's", "ones", "only", "onto", "open", "opened", "opening", "opens", "opposite", "or", "ord", "order",
    "ordered", "ordering", "orders", "org", "other", "others", "otherwise", "ought", "oughtn't", "oughtnt", "our",
    "ours", "ourselves", "out", "outside", "over", "overall", "owing", "own", "p", "pa", "page", "pages", "part",
    "parted", "particular", "particularly", "parting", "parts", "past", "pe", "per", "perhaps", "pf", "pg", "ph",
    "pk", "pl", "place", "placed", "places", "please", "plus", "pm", "pmid", "pn", "point", "pointed", "pointing",
    "points", "poorly", "possible", "possibly", "potentially", "pp", "pr", "predominantly", "present",
    "presented", "presenting", "presents", "presumably", "previously", "primarily", "probably", "problem",
    "problems", "promptly", "proud", "provided", "provides", "pt", "put", "puts", "pw", "py", "q", "qa", "que",
    "quickly", "quite", "qv", "r", "ran", "rather", "rd", "re", "readily", "really", "reasonably", "recent",
    "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "reserved",
    "respectively", "resulted", "resulting", "results", "right", "ring", "ro", "room", "rooms", "round", "ru",
    "run", "s", "sa", "said", "same", "saw", "say", "saying", "says", "sb", "sc", "sd", "se", "sec", "second", "secondly",
    "seconds", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "sees", "self", "selves",
    "sensible", "sent", "serious", "seriously", "seven", "seventy", "several", "sg", "sh", "shall", "shan't",
    "shant", "she", "she'd", "she'll", "she's", "shed", "shell", "shes", "should", "should've", "shouldn",
    "shouldn't", "shouldnt", "show", "showed", "showing", "shown", "showns", "shows", "si", "side", "sides",
    "significant", "significantly", "similar", "similarly", "since", "sincere", "site", "six", "sixty", "sj",
    "sk", "sl", "slightly", "sm", "small", "smaller", "smallest", "sn", "so", "some", "somebody", "someday",
    "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon",
    "sorry", "specifically", "specified", "specify", "specifying", "sr", "st", "state", "states", "still", "stop",
    "strongly", "su", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure",
    "sv", "sy", "system", "sz", "t", "t's", "take", "taken", "taking", "tc", "td", "tell", "ten", "tends", "test",
    "text", "tf", "tg", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "that's", "that've", "thatll",
    "thats", "thatve", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "there'd",
    "there'll", "there're", "there's", "there've", "thereafter", "thereby", "thered", "therefore", "therein",
    "therell", "thereof", "therere", "theres", "thereto", "thereupon", "thereve", "these", "they", "they'd",
    "they'll", "they're", "they've", "theyd", "theyll", "theyre", "theyve", "thick", "thin", "thing", "things",
    "think", "thinks", "third", "thirty", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh",
    "thought", "thoughts", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "til", "till",
    "tip", "tis", "tj", "tk", "tm", "tn", "to", "today", "together", "too", "took", "top", "toward", "towards",
    "tp", "tr", "tried", "tries", "trillion", "truly", "try", "trying", "ts", "tt", "turn", "turned", "turning",
    "turns", "tv", "tw", "twas", "twelve", "twenty", "twice", "two", "tz", "u", "ua", "ug", "uk", "um", "un",
    "under", "underneath", "undoing", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "up",
    "upon", "ups", "upwards", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually",
    "uucp", "uy", "uz", "v", "va", "value", "various", "vc", "ve", "versus", "very", "vg", "vi", "via", "viz",
    "vn", "vol", "vols", "vs", "vu", "w", "want", "wanted", "wanting", "wants", "was", "wasn", "wasn't", "wasnt",
    "way", "ways", "we", "we'd", "we'll", "we're", "we've", "web", "webpage", "website", "wed", "welcome", "well",
    "wells", "went", "were", "weren", "weren't", "werent", "weve", "wf", "what", "what'd", "what'll", "what's",
    "what've", "whatever", "whatll", "whats", "whatve", "when", "when'd", "when'll", "when's", "whence",
    "whenever", "where", "where'd", "where'll", "where's", "whereafter", "whereas", "whereby", "wherein", "wheres",
    "whereupon", "wherever", "whether", "which", "whichever", "while", "whilst", "whim", "whither", "who",
    "who'd", "who'll", "who's", "whod", "whoever", "whole", "wholl", "whom", "whomever", "whos", "whose", "why",
    "why'd", "why'll", "why's", "widely", "width", "will", "willing", "wish", "with", "within", "without", "won",
    "won't", "wonder", "wont", "words", "work", "worked", "working", "works", "world", "would", "would've",
    "wouldn", "wouldn't", "wouldnt", "ws", "www", "x", "y", "ye", "year", "years", "yes", "yet", "you", "you'd",
    "you'll", "you're", "you've", "youd", "youll", "young", "younger", "youngest", "your", "youre", "yours",
    "yourself", "yourselves", "youve", "yt", "yu", "z", "za", "zero", "zm", "zr"}

In [26]:
# Controller function to generate TF-IDF Matrix
def generate_matrix(sentences, documents):
    sentences = nltk.sent_tokenize(text) # NLTK function
    total_documents = documents

    freq_matrix = _create_frequency_matrix(sentences)
    tf_matrix = _create_tf_matrix(freq_matrix)
    documents_per_words = _create_documents_per_words(freq_matrix)
    idf_matrix = _create_idf_matrix(freq_matrix, documents_per_words, total_documents)
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)
    
    return tf_idf_matrix

# Create word frequency matrix for documents
def _create_frequency_matrix(sentences):
    frequency_matrix = {}
    ps = SnowballStemmer("english")

    for sent in sentences:
        freq_table = {}
        words = nltk.word_tokenize(sent)
        dictionary_tokenizer = MWETokenizer(words, separator=' ') 
        dictionary_based_token = dictionary_tokenizer.tokenize(words) 

        for word in words:
            word = word.lower()
            if word in stopWords:
                continue

            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1
                
        # Adding bigrams as phrases
        bigrams = list(nltk.bigrams(words))
        for bigram in bigrams:
            phrase = ' '.join(bigram)
            phrase_words = phrase.split(' ')
            if all(word not in stopWords for word in phrase_words):
                if phrase in freq_table:
                    freq_table[phrase] += 1
                else:
                    freq_table[phrase] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

# Create TF (text frequency) matrix for documents
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

# Find number of documents per words
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

# Create IDF (inverse document frequency) matrix for documents
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            if float(count_doc_per_words[word]) == 0 or total_documents == 0:
                idf_table[word] = 0.0
            else:
                idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

# TF-IDF = TF * IDF matrices
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [39]:
# sm_df['actualText'] = ''

for index, row in sm_df.iterrows():
    if (row['actualText'] == ''):
        if (row['translate']) == 'True' :
            sm_df.at[index, 'actualText'] = row['text_translated']
        else:
            sm_df.at[index, 'actualText'] = row['content']

# Use weekly-text dataframe to generate TF-IDF matrices
# weekly_text = sm_df.groupby([pd.Grouper(key='authoredAt', freq='W')])['actualText'].agg(
#     text_combined=' '.join,  # Aggregating text as before
#     count='count'  # Adding count aggregation for number of posts
# ).reset_index()

weekly_text = weekly_text.rename(columns={'authoredAt': 'weekAuthored'})
weekly_text = weekly_text.rename(columns={'text_combined': 'textProcessed'})

weekly_text['tfIdfMatrix'] = [{} for _ in range(len(weekly_text))]

text = weekly_text.at[0, 'textProcessed']
count = weekly_text.at[0,'count']

for index, row in weekly_text[::-1].iterrows():
    text = weekly_text.at[index, 'textProcessed']
    count = weekly_text.at[index,'count']
    
    if row['tfIdfMatrix'] == {}:
        matrix = generate_matrix(text, count)
        # print(matrix)
        
        for dictionary in matrix.values():
            weekly_text.at[index, 'tfIdfMatrix'] = dictionary
    else:
        break
        
weekly_text.to_pickle('weekly_tf_idf.pkl')

In [41]:
sm_df.to_pickle('updated_testing_data.pkl')
weekly_text.to_pickle('weekly_tf_idf.pkl')

In [62]:
weekly_df = pd.read_pickle('weekly_tf_idf.pkl')
weekly_df['values'] = [[] for _ in range(len(weekly_text))]
more_stopwords = [
    'covid',
    'coronavirus',
    'corona',
    'rona',
    'covid-19',
    'tested',
    'testing',
    'test',
    'tests',
    'symptoms',
    'positive',
    'negative',
    'para',
    'vaccine',
    'vaccines',
    'vaccinated',
    'virus'
    'tests',
    'people',
    'health',
    'pandemic',
    'virus',
    'sars-cov-2',
    'doctor',
    'covid19',
    'vaccination',
    'vaccinations',
    'rt @',
    '-- --',
    '',
    "I'm", 
    r'\u']
    
# filtered_list = list(set(sorted_dict).difference(stopwords))
    
for index, row in weekly_df.iterrows():
    # Remove one-character words or strange numbers..
    items = row['tfIdfMatrix'].items()
    matrix_modified = {}
    for key, value in items:  
        if not key.isdigit() and len(key) > 3:
            # Remove stopwords
            words = key.split()
            if all(word not in more_stopwords for word in words):
                matrix_modified[key] = value

    weekly_df.at[index, 'tfIdfMatrix'] = matrix_modified

    sorted_df = pd.DataFrame.from_dict(matrix_modified, orient='index', columns=['tfIdfValue'])
    sorted_df = sorted_df.reset_index()
    sorted_df = sorted_df.rename(columns={'index' : 'keyword'})
    sorted_df = sorted_df.sort_values(by='tfIdfValue', ascending=False)
    sorted_dict = sorted_df['keyword'].tolist()
    
    for item in more_stopwords:
        if item in sorted_dict:
            sorted_dict.remove(item)
    
    sorted_dict = [item for item in sorted_dict if item]
    
    sorted_dict = sorted_dict[:20] # keep only the first 20 values
    # print(sorted_dict)
    weekly_text.at[index, 'values'] = sorted_dict
    
weekly_text.to_pickle('weekly_tf_idf.pkl')

weekly_text.head(10)

Unnamed: 0,weekAuthored,textProcessed,count,tfIdfMatrix,values
0,2021-05-02 00:00:00+00:00,LATEST: At least five people test positive for...,39,"{'georgia': 0.19780092313970737, 'covid-19': 0...","[updates, Georgia COVID-19, COVID-19 Updates, ..."
1,2021-05-09 00:00:00+00:00,“His life should be a testament to us of just ...,247,"{'1st': 0.05504386730514959, 'dose': 0.0392001...","[mothersday2021 #, 1st dose, “ Hospitals, mass..."
2,2021-05-16 00:00:00+00:00,"Does anyone else experience symptoms of a uti,...",209,"{'godbless': 0.07733820953703514, 'start': 0.0...","[godbless, notification, mspears96 That, @ msp..."
3,2021-05-23 00:00:00+00:00,At last my uncle tested negative he was in the...,221,"{'#': 0.11032557167329178, 'believehavefaith':...","[believehavefaith, godisahealer, weightloss, t..."
4,2021-05-30 00:00:00+00:00,Pfizer-BioNTech and Moderna are underway with...,211,"{'#': 0.012449963103581173, 'college': 0.02942...","[college, MIRROR SOURCE, FoxNews https, : //ww..."
5,2021-06-06 00:00:00+00:00,Puerto Rico ended a nightly pandemic curfew af...,213,"{'https': 0.11004567061101883, ':': 0.08291315...","[//t.co/ly0vx50zka, : //t.co/ly0vx50ZKa, https..."
6,2021-06-13 00:00:00+00:00,I am asking this question for a friend. My fri...,197,"{'hear': 0.19934362304976116, 'comment': 0.199...","[I hear, comment ,, I posted, hear, comment, I..."
7,2021-06-20 00:00:00+00:00,Today was one of those days when everything hu...,245,"{'stories': 0.13425123616507406, 'background':...","[//abcn.ws/3iwfsmp, FDA policy, policy :, : //..."
8,2021-06-27 00:00:00+00:00,@JennyErikson Contact your insurance provider....,203,"{'experiencing': 0.487465999299652, 'doctor': ...",[experiencing]
9,2021-07-04 00:00:00+00:00,A doctor in Rhode Island faces thousands of do...,206,"{'gente': 0.01131475413383449, 'por': 0.009131...","[como, humana, escencia, pero, nuestra escenci..."


In [47]:
list(sm_df['author'].unique())

['Survivor Corps',
 'Hispanic Health Coalition of Georgia, Inc',
 'Stand for Health Freedom',
 'A Voice for Choice',
 'COVID-19 Novel Coronavirus FACTS',
 'Fulton County Board of Health',
 '¡MÉDICOS POR LA VERDAD!',
 'COVID-19 Long Haulers Support',
 'U.S. Department of Health and Human Services',
 "Skip Mason's Vanishing Black Atlanta History",
 'CDC',
 'MedLink Georgia',
 'News Medical',
 'WebMD',
 'Georgia Family Planning System',
 'The Prayer Wall',
 '41NBC / WMGT',
 'FOX 5 Atlanta',
 'MedCura Health',
 'ABC News',
 'Black Educators',
 'Doctor Mike',
 'Clark Atlanta University',
 'Southwest Georgia Public Health District',
 'Georgia Coalition for Vaccine Choice',
 'GNR Public Health',
 'Clila- Coalición De Líderes Latinos',
 'Georgia Department of Public Health',
 'KISS 104.1',
 'Phoebe Putney Health System',
 'Gayle King',
 'Verywell',
 'Vaccines save lives',
 'El Profe Perulero Radio',
 'VICE',
 'Prensa Atlanta',
 'Fair Count',
 'Breitbart',
 'Covid Wellness Clinic',
 'Medical Ne

In [63]:
words_df = pd.DataFrame(weekly_text['values'].tolist(), index=weekly_text['weekAuthored'])

words_df = words_df.reset_index()
words_df.to_pickle('keywords.pkl')

words_df = pd.read_pickle('keywords.pkl')