In [101]:
import pandas as pd
import plotly.express as px
import plotly
import glob
import plotly.io as pio
pio.renderers.default = "notebook_connected"

plotly.offline.init_notebook_mode(connected=True)
plotly.io.templates.default = "plotly_white"

# Data Loading and Aggregating

In [2]:
DATA_PATH = "/home/pranavgoel/trans-fer-entropy/obtaining_news_collections/data/"

In [3]:
def load_and_convert(path):
    """load one dataset from the path and convert date to timestamp"""
    df = pd.read_csv(path)
    return df.assign(publish_date=pd.to_datetime(df["publish_date"]))

In [4]:
all_csv_paths = glob.glob(DATA_PATH + "*dedup*")
all_csv_paths = [csv_path for csv_path in all_csv_paths if ("nytimes" not in csv_path)]
all_csv_paths

['/home/pranavgoel/trans-fer-entropy/obtaining_news_collections/data/newyork_article_texts_and_info_dedup.csv',
 '/home/pranavgoel/trans-fer-entropy/obtaining_news_collections/data/florida_article_texts_and_info_dedup.csv',
 '/home/pranavgoel/trans-fer-entropy/obtaining_news_collections/data/illinois_article_texts_and_info_dedup.csv',
 '/home/pranavgoel/trans-fer-entropy/obtaining_news_collections/data/texas_article_texts_and_info_dedup.csv',
 '/home/pranavgoel/trans-fer-entropy/obtaining_news_collections/data/california_article_texts_and_info_dedup.csv',
 '/home/pranavgoel/trans-fer-entropy/obtaining_news_collections/data/ohio_article_texts_and_info_dedup.csv']

In [5]:
def num_articles_per_media(df, state, temporal=None):
    """Take a state df and then group by its media name and count the number of articles per media.
    You can also supply the temporal grouping here. Please use the use this parameter to specify the frequency of temporal grouping.
    See eligible values: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
    """

    if temporal is not None:
        num_news_per_media = (
            df.groupby([pd.Grouper(key="publish_date", freq=temporal), "media_name"])
            .size()
            .reset_index(name="count")
        )
        num_news_per_media["state"] = state
    else:
        num_news_per_media = df.groupby("media_name").size().reset_index(name="count")
        num_news_per_media["state"] = state

    return num_news_per_media

In [102]:
num_articles_per_media_df = []
num_articles_per_media_temporal_df = []
for file_path in all_csv_paths:
    state = file_path.split("/")[-1].split("_")[0]
    state_df = load_and_convert(file_path)
    

    num_total_media = state_df["media_name"].nunique()
    print(f"Loading files from {state}: {num_total_media} unique meida found")

    # getting # articles per media
    num_articles_per_media_df.append(num_articles_per_media(state_df, state))
    num_articles_master = pd.concat(num_articles_per_media_df)

    # getting daily # articles per media
    num_articles_per_media_temporal_df.append(
        num_articles_per_media(state_df, state, temporal="D")
    )
    num_articles_temporal_master = pd.concat(num_articles_per_media_temporal_df)

Loading files from newyork: 129 unique meida found
Loading files from florida: 72 unique meida found
Loading files from illinois: 63 unique meida found
Loading files from texas: 98 unique meida found
Loading files from california: 340 unique meida found
Loading files from ohio: 52 unique meida found


# Overall Distribution 

## Number of Articles per Media 

In [13]:
px.violin(
    num_articles_master,
    box=True,
    points="all",
    hover_data=num_articles_master.columns,
    color="state",
    height=900,
    x="count",
    y="state",
)

## Normalized Count

In [20]:
num_articles_master["perc"] = (num_articles_master["count"]) / (
    num_articles_master.groupby(["state"])["count"].transform("sum")
)

In [21]:
px.violin(
    num_articles_master,
    box=True,
    points="all",
    hover_data=num_articles_master.columns,
    color="state",
    height=900,
    x="perc",
    y="state",
)

In [27]:
px.histogram(
    num_articles_master,
    #   box=True,
    #   points='all',
    hover_data=num_articles_master.columns,
    histnorm="probability density",
    color="state",
    height=900,
    x="count",
    facet_row="state",
)

In [32]:
px.histogram(
    num_articles_master,
    #   box=True,
    #   points='all',
    hover_data=num_articles_master.columns,
    marginal="rug",
    histnorm="percent",
    color="state",
    barmode="overlay",
    height=900,
    x="count",
)

## Who are the those media? (Querying media with >0.1 prevelance)

In [45]:
px.bar(
    num_articles_master.query("perc >= 0.1"),
    y="media_name",
    facet_row="state",
    height=700,
    color="perc",
    hover_data=num_articles_master.columns,
    x="perc",
).update_yaxes(matches=None)

# Temporal Changes

## Number of News Daily

In [90]:
px.line(num_articles_temporal_master,
        x='publish_date',
        y='count',
        color='media_name',
        height=800,
        facet_row='state').update_layout(showlegend=False).update_yaxes(matches=None)

## Normalized Daily Count

In [95]:
num_articles_temporal_master_norm=num_articles_temporal_master.groupby(["state", "publish_date"]).apply(
    lambda x: x.assign(perc=x['count']/x['count'].sum())
).reset_index(drop=True)

In [97]:
num_articles_temporal_master_norm.head()

Unnamed: 0,publish_date,media_name,count,state,perc
0,2023-04-01,AlterNet,6,california,0.133333
1,2023-04-01,BuzzFeed Politics,1,california,0.022222
2,2023-04-01,Crooks and Liars,1,california,0.022222
3,2023-04-01,Daily Kos,1,california,0.022222
4,2023-04-01,Echo - United States - California,1,california,0.022222


In [100]:
px.area(num_articles_temporal_master_norm,
       x='publish_date',
       y='perc',
       color='media_name',
       height=900,
       facet_row='state').update_layout(showlegend=False)