In [None]:
from dotenv import load_dotenv
import os
from s3_utils import load_topics_by_year_df

# Populate the process environment from a .env file (python-dotenv) before
# any configuration is read.
load_dotenv()

# DATASET_TYPE is mandatory: a missing variable raises KeyError right here.
DATASET_TYPE = os.environ["DATASET_TYPE"]

# Pick the output directory for the selected dataset. Unknown values are
# rejected immediately; otherwise SAVE_DIR would silently stay undefined and
# only surface later as a confusing NameError.
if DATASET_TYPE == "SAMPLE":
    SAVE_DIR = "outputs_sample"
elif DATASET_TYPE == "FULL":
    SAVE_DIR = "outputs_full"
else:
    raise ValueError(
        f"Unsupported DATASET_TYPE {DATASET_TYPE!r}; expected 'SAMPLE' or 'FULL'"
    )

In [None]:
import pandas as pd

def find_large_trending_topics(df, threshold_count=100, significant_change=0.5):
    """
    Finds topics with at least one yearly count >= threshold_count
    and a significant increase or decrease between 2013 and 2016.

    Parameters:
    - df: pandas DataFrame with columns ['year', 'topic_words', 'topic_id', 'count']
    - threshold_count: minimum yearly count to consider a topic "large"
    - significant_change: fraction change considered significant (e.g., 0.5 = 50%)

    Returns:
    - DataFrame of trending topics (topics labelled 'stable' are dropped) with columns:
      ['topic_words', 'topic_id', 'count_2013', 'count_2016', 'change',
       'percent_change', 'trend']
      where 'change' is count_2016 - count_2013, 'percent_change' is that
      change expressed in percent of count_2013, and 'trend' is either
      'increase' or 'decrease'.
    """

    # Keep only topics whose largest yearly count reaches the threshold.
    large_topics = df.groupby('topic_words').filter(
        lambda group: group['count'].max() >= threshold_count
    )

    # One row per topic, one column per year; years without data count as 0.
    pivot = large_topics.pivot_table(index=['topic_words', 'topic_id'],
                                     columns='year', values='count', fill_value=0)

    # Restrict to the two years being compared, creating them (as 0) if absent.
    pivot = pivot.reindex(columns=[2013, 2016], fill_value=0)

    # Absolute and relative change. A 2013 count of 0 is treated as 1 in the
    # denominator so the division is always defined.
    pivot['change'] = pivot[2016] - pivot[2013]
    pivot['percent_change'] = 100 * pivot['change'] / pivot[2013].replace(0, 1)

    # 'percent_change' is in percent while 'significant_change' is a fraction,
    # so bring the threshold to the same unit before comparing.
    threshold_percent = 100 * significant_change

    # Label every topic. 'increase' is assigned last so that it wins when both
    # conditions hold (only possible when the threshold is 0).
    pivot['trend'] = 'stable'
    pivot.loc[pivot['percent_change'] <= -threshold_percent, 'trend'] = 'decrease'
    pivot.loc[pivot['percent_change'] >= threshold_percent, 'trend'] = 'increase'

    # Keep only increasing or decreasing topics
    result = pivot[pivot['trend'].isin(['increase', 'decrease'])].reset_index()

    # Keep only relevant columns, with readable names for the year columns
    result = result[['topic_words', 'topic_id', 2013, 2016, 'change', 'percent_change', 'trend']]
    result = result.rename(columns={2013: 'count_2013', 2016: 'count_2016'})
    result.columns.name = ''

    return result

In [None]:
# Load the per-year topic counts and detect large topics whose volume changed
# significantly between the two comparison years.
df_topics_by_year = load_topics_by_year_df()
df_trends = find_large_trending_topics(df_topics_by_year)

# Display the growing topics, strongest relative growth first.
(
    df_trends[df_trends['trend'] == 'increase']
    .sort_values(by='percent_change', ascending=False)
    .reset_index(drop=True)
)