In [1]:
## Importing libraries
import pickle
import pandas as pd
import numpy as np
import csv
import nltk
from nltk.corpus import stopwords
from nltk.stem import *
import math
# nltk.download('stopwords')

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [2]:
## Reading in the data
# NOTE: pd.read_pickle unpickles arbitrary Python objects -- only load pickle files from a trusted source.
# Post-level frame; later cells read and write its 'authoredAt', 'content' and sentiment score columns.
sm_df = pd.read_pickle('testing_data_vader.pkl')
# Weekly aggregate frame that the TF-IDF cell further down reads and fills in.
weekly_text = pd.read_pickle('weekly_tf_idf.pkl')

In [156]:
## Sentiment Analysis (VADER)
# Back-fills timestamps and VADER scores, walking from the last row towards the first and
# stopping at the first row that is already complete. This makes re-runs incremental.

# authoredAt column datetime manipulation for timeseries grouping
sm_df['authoredAt'] = pd.to_datetime(sm_df['authoredAt'])
sm_df['authoredAt'] = sm_df['authoredAt'].dt.date.astype('datetime64[ns]')
sm_df['weekAuthored'] = sm_df['authoredAt'].dt.isocalendar().week

platform_list = sm_df['platform'].unique()
analyzer = SentimentIntensityAnalyzer()

# DataFrame score column -> key in the dict returned by analyzer.polarity_scores()
score_columns = {'negative': 'neg', 'positive': 'pos', 'neutral': 'neu', 'compound': 'compound'}

# Iterate over the actual index labels (newest row first) instead of assuming labels 0..n-1.
for index in sm_df.index[::-1]:
    timeNotValid = False
    sentimentNotValid = False

    authored_at = sm_df.at[index, 'authoredAt']
    if pd.isnull(sm_df.at[index, 'weekAuthored']) or not isinstance(authored_at, pd.Timestamp):
        # 'weekAuthored' is null or 'authoredAt' is not a Timestamp (e.g. NaT): try to repair the row.
        parsed = pd.to_datetime(authored_at, errors='coerce')
        if not pd.isnull(parsed):
            # Truncate to midnight and refresh the ISO week so both columns stay consistent.
            # (The previous `.date().astype(...)` raised: datetime.date has no astype.)
            sm_df.at[index, 'authoredAt'] = parsed.normalize()
            sm_df.at[index, 'weekAuthored'] = parsed.isocalendar()[1]
        timeNotValid = True

    # pd.isnull catches None as well as NaN/NA, which `is None` silently missed.
    if any(pd.isnull(sm_df.at[index, column]) for column in score_columns):
        scores = analyzer.polarity_scores(sm_df.at[index, 'content'])
        sm_df.at[index, 'sentiment'] = scores
        for column, score_key in score_columns.items():
            sm_df.at[index, column] = scores[score_key]
        sentimentNotValid = True

    # First fully valid row from the end: everything before it is treated as already processed.
    if not timeNotValid and not sentimentNotValid:
        break

In [157]:
# Controller function to generate TF-IDF Matrix
def generate_matrix(sentences, documents):
    """Build the TF-IDF matrix for one block of text.

    Args:
        sentences: Raw text; it is split into sentences with nltk.sent_tokenize.
        documents: Total number of documents, used as the numerator of the IDF ratio.

    Returns:
        dict mapping a sentence label to a ``{word: tf * idf}`` dict.
    """
    # Tokenize the argument itself. The previous version ignored it and read the
    # notebook-level global `text`, so results depended on hidden kernel state.
    sentence_list = nltk.sent_tokenize(sentences)  # NLTK function
    total_documents = documents

    freq_matrix = _create_frequency_matrix(sentence_list)
    tf_matrix = _create_tf_matrix(freq_matrix)
    documents_per_words = _create_documents_per_words(freq_matrix)
    idf_matrix = _create_idf_matrix(freq_matrix, documents_per_words, total_documents)
    tf_idf_matrix = _create_tf_idf_matrix(tf_matrix, idf_matrix)

    return tf_idf_matrix

# Create word frequency matrix for documents
def _create_frequency_matrix(sentences):
    """Count word occurrences per sentence.

    Each sentence is tokenized with nltk.word_tokenize; tokens are lower-cased and
    English stop words are skipped. No stemming or other filtering is applied.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        dict mapping a sentence label to a ``{word: count}`` dict. The label is the
        first 15 characters of the sentence; when that label is already taken, a
        `` (n)`` suffix is appended so sentences sharing a prefix no longer
        overwrite each other.
    """
    frequency_matrix = {}
    stopWords = set(stopwords.words("english"))

    for sent in sentences:
        freq_table = {}
        for word in nltk.word_tokenize(sent):
            word = word.lower()
            if word in stopWords:
                continue
            freq_table[word] = freq_table.get(word, 0) + 1

        # Keep the short-prefix label, but make it unique on collision.
        base_label = sent[:15]
        label = base_label
        duplicate = 1
        while label in frequency_matrix:
            duplicate += 1
            label = f"{base_label} ({duplicate})"

        frequency_matrix[label] = freq_table

    return frequency_matrix

# Create TF (term frequency) matrix for documents
def _create_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

# Count the number of documents that contain each word
def _create_documents_per_words(freq_matrix):
    word_per_doc_table = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in word_per_doc_table:
                word_per_doc_table[word] += 1
            else:
                word_per_doc_table[word] = 1

    return word_per_doc_table

# Create IDF (inverse document frequency) matrix for documents
def _create_idf_matrix(freq_matrix, count_doc_per_words, total_documents):
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            if float(count_doc_per_words[word]) == 0 or total_documents == 0:
                idf_table[word] = 0.0
            else:
                idf_table[word] = math.log10(total_documents / float(count_doc_per_words[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

# TF-IDF = TF * IDF matrices
def _create_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, f_table1), (sent2, f_table2) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(f_table1.items(),
                                                    f_table2.items()):  # here, keys are the same in both the table
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [158]:
# Use weekly-text dataframe to generate TF-IDF matrices
# Provenance: the commented-out snippet below shows how a weekly frame with
# these columns can be aggregated from sm_df.
# weekly_text = sm_df.groupby([pd.Grouper(key='authoredAt', freq='W')])['text_processed'].agg(
#     text_combined=' '.join,  # Aggregating text as before
#     count='count'  # Adding count aggregation for number of posts
# ).reset_index()

weekly_text = weekly_text.rename(columns={'authoredAt': 'weekAuthored',
                                          'text_combined': 'textProcessed'})

# A freshly aggregated frame has no result column yet; create it so the loop can run.
if 'tfIdfMatrix' not in weekly_text.columns:
    weekly_text['tfIdfMatrix'] = None

# Walk from the newest week backwards and stop at the first week that already has a
# matrix, so re-runs only process newly appended weeks. Iterating over the index
# labels avoids the former `.at[0, ...]` reads, which failed on an empty frame.
for index in weekly_text.index[::-1]:
    text = weekly_text.at[index, 'textProcessed']
    count = weekly_text.at[index, 'count']

    # A stored matrix is always a dict; None/NaN placeholders both count as missing.
    if isinstance(weekly_text.at[index, 'tfIdfMatrix'], dict):
        break

    # Weeks without posts have no text to analyse; leave their matrix empty.
    if not isinstance(text, str) or not text.strip():
        continue

    matrix = generate_matrix(text, count)

    # NOTE(review): only the table of the LAST sentence is stored (same as before).
    # If a week's text splits into several sentences, earlier tables are dropped --
    # confirm whether they should be merged instead.
    sentence_tables = list(matrix.values())
    if sentence_tables:
        weekly_text.at[index, 'tfIdfMatrix'] = sentence_tables[-1]

In [5]:
# Persist both frames to the same pickle files they were loaded from (overwrites them).
sm_df.to_pickle('testing_data_vader.pkl')
weekly_text.to_pickle('weekly_tf_idf.pkl')

# NOTE(review): tail(-10) returns every row EXCEPT the first 10, not the last 10 rows;
# use tail(10) if only the most recent weeks were meant -- confirm intent.
weekly_text.tail(-10)

Unnamed: 0,weekAuthored,textProcessed,count,tfIdfMatrix
10,2021-07-11,my husband is 29 and had covid in december 202...,202,"{'husband': 0.004953010820009457, '29': 0.0005..."
11,2021-07-18,antigen testing identifies pcr-positive childr...,269,"{'antigen': 0.005310289101510848, 'testing': 0..."
12,2021-07-25,people are starting to wonder if the olympics ...,424,"{'people': 0.05055729590987039, 'starting': 0...."
13,2021-08-01,almost a year ago today when i got the news t...,428,"{'almost': 0.0051753766241989805, 'year': 0.01..."
14,2021-08-08,i thought i'd post some encouraging words i ha...,444,"{'thought': 0.008863484833863924, ''d': 0.0009..."
...,...,...,...,...
98,2023-03-19,i had covid again recently but i think i’m neg...,7,"{'covid': 0.10450137053939734, 'recently': 0.0..."
99,2023-03-26,two doctors explain how vaccines work the vacc...,1,"{'two': 0.0, 'doctors': 0.0, 'explain': 0.0, '..."
100,2023-04-02,,0,
101,2023-04-09,,0,
