<a href="https://colab.research.google.com/github/asanth7/Minority-USMedia-Representation/blob/main/Independent_Dataset_Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Data Loading and Processing



In [None]:
!pip install goose3 mediacloud

In [2]:
import requests
import mediacloud.api
import pandas as pd
import numpy as np
from datetime import date, datetime
# from newspaper import Article
from goose3 import Goose

In [None]:
import pandas as pd
import numpy as np

In [None]:
# The article CSV must be in the current working directory; set `dataset_name`
# to the file's base name (without the ".csv" extension).

dataset_name = "mc_articles"

mc_articles_text = pd.read_csv(f"{dataset_name}.csv")
# Tag the frame with its dataset name; later cells read it back from `.attrs['name']`.
mc_articles_text.attrs.update(name=dataset_name)
print(mc_articles_text.shape[0])
mc_articles_text.head()

In [None]:
def preprocess_df(df):
  """Return a cleaned copy of a scraped-article frame.

  Steps:
    1. Check that every null value sits in the 'text' column (a null there
       marks an article whose scrape failed) and print how many there are.
    2. Drop rows containing nulls and renumber the index from 0.
    3. Drop the 'url' column when it is present.
    4. If a 'publish_date' column exists, convert it to `datetime.date` objects.

  The input frame is left untouched, so the cell calling this function can be
  re-run safely; `df.attrs` is carried over to the result.

  Args:
    df: DataFrame with at least a 'text' column.

  Returns:
    The cleaned DataFrame (a new object).

  Raises:
    AssertionError: if a column other than 'text' contains null values.
  """
  # All NaN/null values in 'text' column --> error in scraping
  null_count = df['text'].isnull().sum()
  total_nulls = df.isnull().sum().sum()
  assert null_count == total_nulls, (
      f"Expected nulls only in 'text' ({null_count}), but the frame has {total_nulls} in total"
  )
  print('Number of null values:', null_count)

  cleaned = df.dropna().reset_index(drop = True)
  # errors='ignore' keeps the function idempotent when 'url' was already removed.
  cleaned = cleaned.drop(columns = ['url'], errors = 'ignore')
  print(len(cleaned))
  if "publish_date" in cleaned.columns:
    cleaned = cleaned.assign(publish_date = pd.to_datetime(cleaned['publish_date']).dt.date)
  # attrs propagation is experimental in pandas, so copy it explicitly.
  cleaned.attrs = dict(df.attrs)
  # DataFrame.info() prints its report itself and returns None, so do not wrap it in print().
  cleaned.info(verbose=True)

  return cleaned

In [None]:
# Clean the loaded articles: rows with nulls are dropped, 'url' is removed and
# 'publish_date' (if present) becomes plain date objects. The name is rebound to the result.
mc_articles_text = preprocess_df(mc_articles_text)

In [None]:
import datetime
from datetime import date

def subset_articles(end_date, name, start_date=date(2022, 1, 1), df=None):
  """Return the articles published between `start_date` and `end_date` (both inclusive).

  Args:
    end_date: last publish date to keep (date, datetime or Timestamp).
    name: dataset name stored in the result's `.attrs['name']`.
    start_date: first publish date to keep; defaults to 2022-01-01.
    df: frame with a 'publish_date' column. When omitted, the notebook-level
      `mc_articles_text` is looked up at call time, so a reloaded or
      re-processed frame is used rather than whatever object existed when
      this function was defined.

  Returns:
    A new DataFrame without the 'publish_date' column, index renumbered from 0.
  """
  if df is None:
    df = mc_articles_text

  # Compare as normalized timestamps so the filter works whether the column holds
  # datetime.date objects or Timestamps (comparing a datetime64 column against a
  # datetime.date raises in current pandas).
  # NOTE(review): assumes timezone-naive dates -- confirm if tz-aware data is ever loaded.
  publish_days = pd.to_datetime(df['publish_date']).dt.normalize()
  in_window = (publish_days >= pd.Timestamp(start_date)) & (publish_days <= pd.Timestamp(end_date))

  subset_df = df.loc[in_window].drop(columns = ['publish_date']).reset_index(drop = True)
  subset_df.attrs['name'] = name
  return subset_df

In [None]:
# Can optionally subset data to specific years, if desired.

# mc_articles_2022 = subset_articles(end_date = date(2022, 12, 31), name = "mc_articles_2022")
# mc_articles_2023 = subset_articles(end_date = date(2023, 12, 31), start_date = date(2023, 1, 1), name = "mc_articles_2023")
# mc_articles_2024 = subset_articles(end_date = date.today(), start_date = date(2024, 1, 1), name = "mc_articles_2024")

In [None]:
from matplotlib import pyplot as plt

def plot_publishers_articles(df):
  """Plot and save publisher and daily article counts for `df`.

  Draws a bar chart of the 15 publishers with the most articles and, when the
  frame has a 'publish_date' column, a line chart of articles per day. Each
  figure is saved as a PNG in the working directory and then shown.

  Args:
    df: frame with a 'media_name' column and `df.attrs['name']` set;
      'publish_date' is optional. The frame is not modified.
  """
  # The file suffix is the second-to-last underscore-separated token of the dataset
  # name; fall back to the whole name when there is no underscore to split on.
  # NOTE(review): every "mc_articles_<year>" name maps to "articles", so plots for
  # different years overwrite each other -- confirm whether [-1] was intended.
  name_parts = df.attrs['name'].split("_")
  file_ext = name_parts[-2] if len(name_parts) >= 2 else name_parts[0]

  top_15_publishers = df['media_name'].value_counts().head(15)
  top_15_publishers.plot(kind='bar')
  plt.title("Top 15 Publishers by Article Count")
  plt.xlabel("Publisher")
  plt.ylabel("Number of Articles")
  plt.xticks(rotation=45, ha = "right")
  plt.tight_layout()
  plt.savefig(f"top_15_publishers_{file_ext}.png")
  plt.show()

  if 'publish_date' in df.columns:
    # Use a local Series: writing the converted dates back into df would turn the
    # caller's date objects into Timestamps and break later comparisons against
    # datetime.date values (e.g. in subset_articles).
    publish_days = pd.to_datetime(df['publish_date']).dt.date
    articles_daily = df.groupby(publish_days).size()
    articles_daily.plot(kind='line')
    plt.title("Number of Articles per Publish Date")
    plt.xlabel("Date")
    plt.ylabel("Article Count")
    plt.xticks(rotation=45, ha = "right")
    plt.tight_layout()
    plt.savefig(f"articles_daily_{file_ext}.png")
    plt.show()

In [None]:
# View bar chart of articles per publisher and graph of daily article frequency from data
# (the figures are also saved as PNG files in the working directory).

plot_publishers_articles(mc_articles_text)

### Non-Negative Matrix Factorization

In [None]:
from time import time

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import LatentDirichletAllocation, NMF, MiniBatchNMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [None]:
# Build the TF-IDF document-term matrix that the NMF model is fitted on.
# max_df / min_df drop terms found in more than 95% of the articles or in fewer than 2;
# max_features caps the vocabulary at 15000 terms; English stop words are removed.
tfidf_vectorizer = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = 15000, stop_words = 'english')
tfidf = tfidf_vectorizer.fit_transform(mc_articles_text['text'])

In [None]:
# Experiment with NMF hyperparameters

num_components = 10
max_iterations = 7500
l1_ratio = 0.5  # 0.5 best

nmf_model = NMF(
    n_components = num_components,
    random_state = 2024,
    init = "nndsvda",
    max_iter = max_iterations,
    #beta_loss = "kullback-leibler",
    solver = "mu",
    alpha_W = 0.00005, # 0.00005
    # Pass the variable declared above; a duplicated literal here meant that
    # editing `l1_ratio` had no effect on the model.
    l1_ratio = l1_ratio,
    verbose = 1
)

nmf_model.fit(tfidf)

In [None]:
# Slightly adapted from the sklearn documentation

def plot_top_words(model_obj, features, num_top_words, loss_used=None, type="NMF"):
  """Draw one horizontal bar chart per topic showing its highest-weighted terms.

  Args:
    model_obj: fitted topic model exposing `components_` (one row per topic).
    features: sequence mapping a column index of `components_` to its term.
    num_top_words: number of terms to show per topic.
    loss_used: name of the loss used for an NMF model; only used in the figure
      title. A hyphenated name gets its first two parts capitalized. May be
      None, in which case the title omits the loss.
    type: "NMF" selects the NMF title; any other value selects the LDA title.

  Returns:
    dict mapping topic index (0-based) to that topic's top terms, ordered from
    lowest to highest weight.
  """
  # Size the grid from the model: a fixed 2x5 grid overflows for models with
  # more than 10 topics. Ten topics still give the original 2x5, 30x15 figure.
  num_topics = len(model_obj.components_)
  num_cols = 5
  num_rows = max(1, -(-num_topics // num_cols))  # ceiling division
  fig, axes = plt.subplots(num_rows, num_cols, figsize = (30, 7.5 * num_rows), sharex = True, squeeze = False)
  axes = axes.flatten()

  topics_and_features = {}
  for topic_index, topic in enumerate(model_obj.components_):
    # argsort is ascending, so the last `num_top_words` indices are the heaviest terms.
    top_feature_indices = topic.argsort()[-num_top_words:]
    top_features = [features[x] for x in top_feature_indices]
    weights = topic[top_feature_indices]

    topic_bar_axis = axes[topic_index]
    topic_bar_axis.barh(top_features, weights, height=0.7)
    topics_and_features[topic_index] = top_features
    topic_bar_axis.set_title(f"Topic {topic_index + 1}", fontdict = {"fontsize": 30})
    topic_bar_axis.tick_params(axis = "both", which = "major", labelsize = 20)
    for i in ["top", "right", "left"]:
      topic_bar_axis.spines[i].set_visible(False)

  # Hide grid cells left over when the topic count is not a multiple of the column count.
  for unused_axis in axes[num_topics:]:
    unused_axis.set_visible(False)

  if (type == "NMF"):
    if loss_used is None:
      fig.suptitle("Topics in NMF Model", fontsize = 40)
    else:
      sep = loss_used.split("-")
      if (len(sep) > 1):
        loss_used = "-".join([sep[0].capitalize(), sep[1].capitalize()])
      fig.suptitle(f"Topics in NMF Model with {loss_used} Loss", fontsize = 40)
  else:
    fig.suptitle("Topics in LDA Model", fontsize = 40)
  plt.subplots_adjust(bottom = 0.05, wspace = 0.90, hspace = 0.3)
  #plt.savefig("NMF_top_words.png")
  plt.show()

  return topics_and_features

In [None]:
# Number of highest-weighted terms to draw for each topic.
num_top_words = 20
# Vocabulary of the fitted TF-IDF vectorizer, indexed like the columns of `tfidf`.
tfidf_features = tfidf_vectorizer.get_feature_names_out()
# The loss name is passed only so it can appear in the figure title.
topics_and_features = plot_top_words(nmf_model, tfidf_features, num_top_words, nmf_model.beta_loss.capitalize())

In [None]:
# Percentage of the total NMF component weight carried by each vocabulary term,
# summed over all topics.
nmf_components = nmf_model.components_
total_weight = nmf_components.sum()
feature_frequencies = 100 * nmf_components.sum(axis=0) / total_weight

# term -> percentage of total weight
feature_freq_dict = dict(zip(tfidf_features, feature_frequencies))

# (term, percentage) pairs, heaviest first
all_sorted_features = sorted(
    feature_freq_dict.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

print(all_sorted_features[:20])

In [None]:
# Tabulate each term's share of the total NMF weight, largest share first.
feature_freq_df = pd.DataFrame(feature_freq_dict.items(), columns = ['Feature', 'Relative Weight']).sort_values(['Relative Weight'], ascending = False)
#feature_freq_df['Num_Articles'] = round(len(mc_articles_text) * feature_freq_df['Relative Weight'] / 100)
# Renumber the rows from 0 so the index reflects the sorted order.
feature_freq_df.reset_index(drop = True, inplace = True)
#feature_freq_df = feature_freq_df[feature_freq_df['Num_Articles'] > 0]
feature_freq_df.head(15)

In [None]:
import itertools

# Sum every term's TF-IDF score over all articles (column sums of the document-term matrix).
feature_occurrences = np.sum(tfidf, axis = 0)
words_summed_tfidf = {tfidf_features[i]: feature_occurrences[0, i] for i in range(len(tfidf_features))}
# Re-key the dict in descending score order so plain iteration yields the top terms first.
sorted_words = sorted(words_summed_tfidf.items(), key = lambda x: x[1], reverse=True)
words_summed_tfidf_sorted = {word: tfidf_score for word, tfidf_score in sorted_words}
print(list(words_summed_tfidf_sorted)[:15])

# Express each term's summed score as a percentage of the corpus-wide total.
total_tfidf = feature_occurrences.sum()
word_perc_tfidf = {word: (tfidf_score / total_tfidf) * 100 for word, tfidf_score in words_summed_tfidf_sorted.items()}
# Show (word, percentage) pairs: listing the dict itself yields only its keys,
# which would just repeat the words printed above.
list(word_perc_tfidf.items())[:15]

In [None]:
# Raw term counts, built with the same vocabulary settings as the TF-IDF vectorizer;
# used below to count how many articles contain each term.
tf_vectorizer = CountVectorizer(max_df = 0.95, min_df = 2, max_features = 15000, stop_words = 'english')
tf = tf_vectorizer.fit_transform(mc_articles_text['text'])

In [None]:
# Number of articles each term appears in (its document frequency).
# Count the non-zero entries per column on the sparse matrix directly: tf.toarray()
# would allocate a dense (articles x vocabulary) array just to count them.
word_occurences = np.asarray((tf > 0).sum(axis = 0)).ravel()

# Pair the counts with the vocabulary of the vectorizer that produced `tf`, rather than
# relying on the TF-IDF vectorizer happening to share the same column order.
word_occurence_dict = dict(zip(tf_vectorizer.get_feature_names_out(), word_occurences))
sorted_word_occurences = sorted(word_occurence_dict.items(), key = lambda x: x[1], reverse=True)
word_occurence_dict = {word: count for word, count in sorted_word_occurences if count > 0}
list(word_occurence_dict)[:10]

In [None]:
# Attach per-term document counts and summed TF-IDF scores to the NMF weight table.
feature_freq_df['Num_Articles'] = feature_freq_df['Feature'].map(word_occurence_dict)
feature_freq_df['tf-idf_score'] = feature_freq_df['Feature'].map(words_summed_tfidf_sorted)
# Scale to percent first and round last: rounding the ratio and then multiplying by 100
# leaves floating-point noise (e.g. 12.340000000000002) in the displayed values.
feature_freq_df['Percent_Article_Occurence'] = (100 * feature_freq_df['Num_Articles'] / len(mc_articles_text)).round(2)
feature_freq_df.head(15)

In [None]:
# TODO: Look at frequencies in different time periods/intervals (e.g., in 2022, 2023, 2024)

### Latent Dirichlet Allocation (LDA)

In [None]:
!pip install pyLDAvis tmtoolkit plotly

In [None]:
from tmtoolkit.topicmod.evaluate import metric_coherence_gensim

In [None]:
import pyLDAvis.lda_model
pyLDAvis.enable_notebook(local=True)

def run_LDA_analysis(df, text_type='text', max_iterations=50, num_top_words=20, num_topics=15):
  """Fit an LDA topic model on one text column and report its coherence.

  The text is vectorized into term counts, an online-learning LDA model is
  fitted, per-topic u_mass coherence scores are printed (sorted ascending,
  topics numbered from 1), and the pyLDAvis visualization is displayed.

  Args:
    df: frame holding the documents; `df.attrs['name']` is printed as a label.
    text_type: name of the column containing the text to model.
    max_iterations: `max_iter` passed to LatentDirichletAllocation.
    num_top_words: currently unused; kept so existing calls keep working.
    num_topics: number of LDA topics (defaults to 15).

  Returns:
    Tuple `(lda_model, tf_vectorizer, tf, topic_dists_per_doc)`: the fitted
    model, the fitted CountVectorizer, the document-term count matrix, and the
    result of `lda_model.transform(tf)`.
  """
  # Imported locally so this function does not depend on a later cell having run.
  from IPython.display import display

  print(f"Running model on {df.attrs['name']}")

  tf_vectorizer = CountVectorizer(max_df = 0.95, min_df = 2, max_features = 10000, stop_words = 'english')
  tf = tf_vectorizer.fit_transform(df[text_type])

  lda_model = LatentDirichletAllocation(
      n_components = num_topics,
      max_iter = max_iterations,
      random_state = 2024,
      learning_method = "online",
      verbose = 1
  )

  lda_model.fit(tf)

  # Normalize each topic's row of component weights so it sums to 1.
  topic_word_distribution = lda_model.components_ / lda_model.components_.sum(axis=1)[:, np.newaxis]

  # Tokenize the raw text with the vectorizer's own analyzer for the coherence metric.
  analyzer_func = tf_vectorizer.build_analyzer()
  tokenized_articles = [analyzer_func(article) for article in df[text_type]]

  # Vocabulary terms ordered by their column index in `tf`.
  sorted_vocabulary = sorted(tf_vectorizer.vocabulary_.items(), key = lambda x: x[1])
  words = [pair[0] for pair in sorted_vocabulary]

  print("Calculating coherence score...")
  coherence_score = metric_coherence_gensim(measure="u_mass", topic_word_distrib=topic_word_distribution,
                                            dtm=tf, vocab=np.array(words),
                                            texts=tokenized_articles)
  # Key the scores by 1-based topic number and list them in ascending score order.
  dict_coherence = dict(enumerate(coherence_score, start = 1))
  sorted_coherence = sorted(dict_coherence.items(), key = lambda x: x[1])
  print(sorted_coherence)

  topic_dists_per_doc = lda_model.transform(tf)

  prepared_data = pyLDAvis.lda_model.prepare(lda_model, tf, tf_vectorizer)
  # The object returned by pyLDAvis.display only renders automatically as the last
  # expression of a cell; inside a function it has to be passed to display().
  display(pyLDAvis.display(prepared_data))

  return lda_model, tf_vectorizer, tf, topic_dists_per_doc

In [None]:
# run_LDA_analysis prints the dataset name stored in attrs, so label the frame first.
mc_articles_text.attrs['name'] = "mc_articles_text"
# Fit LDA on the full article text; keep the model, the vectorizer, the count matrix
# and the per-article topic distributions for the cells below.
all_years_LDA, all_years_vectorizer, tf_all, topic_dists_all = run_LDA_analysis(mc_articles_text, text_type="text")

In [None]:
from IPython.display import display, HTML

# Plot interactive LDA intertopic distance visualization

def plot_save_LDA(model, tf_file, vect, title):
  """Save the pyLDAvis visualization of a fitted LDA model and show it inline.

  Args:
    model: fitted LDA model.
    tf_file: document-term matrix the model was fitted on.
    vect: vectorizer that produced `tf_file`.
    title: path of the HTML file to write.

  Returns:
    The result of `display(...)`.
  """
  prepared_data = pyLDAvis.lda_model.prepare(model, tf_file, vect)
  pyLDAvis.save_html(prepared_data, title)

  # HTML(title) would treat the file *name* as markup and render it as plain text;
  # filename= makes IPython load the page that was just saved.
  return display(HTML(filename = title))

In [None]:
# Write the interactive visualization of the all-articles LDA model to an HTML file
# in the working directory.
plot_save_LDA(all_years_LDA, tf_all, all_years_vectorizer, "LDA_15topics_all_years_titles.html")

In [None]:
# Example number-topic mapping to match LDA numerical topics with semantic meaning

# nums_to_topics_all = {
#     2: "Film",
#     (6, 11): "American/Asian Culture + Heritage/Identity",
#     (4, 13): "Proliferation of Violence/Hate + Minority Insecurity",
#     (1, 12): "Foreign Policy, Govt, Politics",
#     (8, 15): "Affirmative Action and Education",
#     9: "Social Culture/Events",
#     10: "Health, Data and Science",
#     7: "Economy, Businesses, and Tech"
# }

In [None]:
def clean_rename_df(df, nums_to_topics):
  """Relabel numbered topic columns of `df` with readable topic names.

  Each key of `nums_to_topics` is either one topic number or an iterable of
  topic numbers; each value is the label to use.
    * int key       -> column "Topic_<n>" is renamed to the label, in place.
    * any other key -> a new column named after the label is added, holding
      the element-wise sum of the listed "Topic_<n>" columns; the source
      columns stay in the frame.

  Returns the same `df` object that was passed in.
  """
  for topic_key, label in nums_to_topics.items():
    if not isinstance(topic_key, int):
      # Merged topic: add up the member columns, starting from 0.
      combined = 0
      for number in topic_key:
        combined = combined + df[f"Topic_{number}"]
      df[label] = combined
      continue

    old_name = f"Topic_{topic_key}"
    print(f"Renaming {old_name} to {label!s}")
    df.rename(columns = {old_name: label}, inplace = True)

  return df

In [None]:
def topic_dist_by_publisher(df, publisher, nums_to_topics):
  """Draw and save a pie chart of one publisher's topic distribution.

  Args:
    df: per-publisher frame with a 'media_name' column, a 'Counts' column and
      one column per label in `nums_to_topics.values()`.
    publisher: value of 'media_name' selecting the row to plot.
    nums_to_topics: mapping whose values are the topic labels to show.

  The remainder `1 - sum(labelled shares)` is drawn as an "Other" slice. The
  figure is saved as "<publisher>_topic_dist_<period>.png" and then shown.

  Raises:
    IndexError: if `publisher` has no row in `df`.
  """
  labels = list(nums_to_topics.values())
  publisher_rows = df.loc[df['media_name'] == publisher][labels + ["Counts"]]
  if publisher_rows.empty:
    raise IndexError(f"No row for publisher {publisher!r} in the grouped frame")

  row_values = list(publisher_rows.values[0])
  article_count = row_values[-1]
  sizes = row_values[:-1]
  # Clamp at 0: float error can push the remainder slightly negative when the
  # labelled topics already sum to 1, and pie() rejects negative wedge sizes.
  sizes.append(max(0.0, 1 - sum(sizes)))
  labels.append("Other")

  # Dataset names ending in two digits ("..._2022", "..._24") denote a year. Any other
  # name (e.g. "mc_articles_text") used to produce a bogus year such as "20xt", so
  # fall back to the dataset name itself as the period label.
  dataset_name = df.attrs.get('name', '')
  suffix = dataset_name[-2:]
  if len(suffix) == 2 and suffix.isdigit():
    year = "20" + suffix
  else:
    year = dataset_name or "all"

  fig1, ax1 = plt.subplots(figsize=(8, 5))
  ax1.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=90)
  plt.title(f"{publisher} Topic Distribution {year} (n = {round(article_count)})")
  # NOTE(review): a publisher name containing a path separator would break this
  # file name -- confirm whether names need sanitizing.
  plt.savefig(f"{publisher}_topic_dist_{year}.png", bbox_inches="tight")
  # Show the figure explicitly, matching the other plotting helpers in this notebook.
  plt.show()

In [None]:
def analyze_publishers(model, df, topic_distribution, id2topic, publisher_list=None):
  """Plot per-publisher average topic distributions as pie charts.

  Args:
    model: fitted topic model; `model.components_.shape[0]` gives the topic count.
    df: article frame with a 'media_name' column, one row per row of
      `topic_distribution`. NOTE: "Topic_<n>" columns are added to this frame
      in place.
    topic_distribution: 2-D array of per-article topic weights (articles x topics).
    id2topic: mapping from topic number (or tuple of numbers) to a topic label,
      as consumed by `clean_rename_df`.
    publisher_list: publishers to plot. Defaults to the three publishers with
      the most articles in `df`.
  """
  print("Topic distribution shape: " + str(topic_distribution.shape))

  if publisher_list is None:
    publisher_list = list(df['media_name'].value_counts().index)[:3]

  topic_cols = [f"Topic_{i}" for i in range (1, model.components_.shape[0] + 1)]
  for idx, name in enumerate(topic_cols):
    df[name] = topic_distribution[:, idx]

  # Average topic weights per publisher, plus the number of articles behind each average.
  publisher_groups = df.groupby(['media_name'])[topic_cols].mean()
  publisher_groups.insert(0, 'media_name', publisher_groups.index)
  # Aligned on the publisher index, so the original order of value_counts does not matter.
  publisher_groups['Counts'] = df['media_name'].value_counts(sort = False)
  publisher_groups = publisher_groups.sort_values(by = "Counts", ascending = False)
  publisher_groups.reset_index(drop = True, inplace = True)
  # topic_dist_by_publisher reads attrs['name']; attrs propagation through groupby is
  # experimental in pandas, so carry the metadata over explicitly.
  publisher_groups.attrs.update(df.attrs)

  publisher_groups = clean_rename_df(publisher_groups, id2topic)

  for publisher in publisher_list:
    topic_dist_by_publisher(publisher_groups, publisher, id2topic)

In [None]:
# Example function calls to visualize pie-chart topic distributions by publisher

# analyze_publishers(all_years_LDA, mc_articles_text, topic_dists_all, nums_to_topics_all, publisher_list=list(mc_articles_text['media_name'].value_counts().index)[:3])
# analyze_publishers(model22, mc_articles_2022, dists22, nums_topics_22)
# analyze_publishers(model23, mc_articles_2023, dists23, nums_topics_23)
# analyze_publishers(model24, mc_articles_2024, dists24, nums_topics_24, publisher_list=list(mc_articles_2024['media_name'].value_counts().index)[:3])
# analyze_publishers(model_black, mc_articles_black_24, dists_black, num_topics_black_24, publisher_list=list(mc_articles_black_24['media_name'].value_counts().index)[:3])
# analyze_publishers(model_hl, mc_articles_hl_24, dists_hl, num_topics_hl_24)