In [2]:
from compute_tf_idf import get_tf_idf_scores_scipy
import pandas as pd

In [4]:
# Load the annotated articles; the file is tab-separated.
all_data = pd.read_csv(
    "../data/articles/final_annotated_articles.tsv",
    sep="\t",
)

In [26]:
def get_combined_titles_descs(df):
    """
    Build one text document per row by joining its title and description.

    Missing values in either column are treated as empty strings, and the
    two parts are joined with a single space.

    Args:
        df: DataFrame with 'title' and 'description' columns.

    Returns:
        list[str] with one "<title> <description>" entry per row of df.
    """
    titles = df['title'].fillna("").astype(str)
    descriptions = df['description'].fillna("").astype(str)
    documents = titles.str.cat(descriptions, sep=" ")
    return documents.tolist()
    
def sort_tf_idf(word_scores):
    """
    Order a term -> score mapping from highest to lowest score.

    Args:
        word_scores: dict mapping term -> score.

    Returns:
        list[(term, score)] sorted by score descending; terms with equal
        scores keep the order they had in word_scores.
    """
    ranked = list(word_scores.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

all_titles_descs = get_combined_titles_descs(all_data)

# Rank every scored term across the whole corpus, highest score first.
overall_top_words = sort_tf_idf(get_tf_idf_scores_scipy(all_titles_descs))
# Show only the head of the ranking: the full list holds every scored term
# and would flood the cell output. `overall_top_words` itself stays complete.
print(overall_top_words[:20])



In [28]:
def tf_idf_by_sentiment(df, top_k=20):
    """
    Compute the top TF-IDF terms separately for each sentiment label.

    Rows are grouped on the "sentiment" column and each group's combined
    title/description texts are scored on their own.

    Returns:
        dict: sentiment_value -> list[(term, score)], sorted by score
        descending and cut to the first top_k entries.
    """
    return {
        label: sort_tf_idf(
            get_tf_idf_scores_scipy(get_combined_titles_descs(rows))
        )[:top_k]
        for label, rows in df.groupby("sentiment")
    }

# Top 20 terms per sentiment label; the returned dict is this cell's output.
tf_idf_by_sentiment(df=all_data, top_k=20)

{'Negative': [('carney', np.float64(8.579404842301246)),
  ('mark', np.float64(5.79440792559065)),
  ('minister', np.float64(5.768020030004651)),
  ('prime', np.float64(5.686478651970398)),
  ('canada', np.float64(5.613098473843107)),
  ('trump', np.float64(5.139519935882419)),
  ('government', np.float64(3.589294100091361)),
  ('budget', np.float64(3.5074661108463485)),
  ('says', np.float64(3.267008453901207)),
  ('trade', np.float64(2.9971188150516634)),
  ('talks', np.float64(2.82162713872561)),
  ('pm', np.float64(2.7950199824226383)),
  ('federal', np.float64(2.679092076138803)),
  ('projects', np.float64(2.510645266875255)),
  ('new', np.float64(2.369296118214888)),
  ('pipeline', np.float64(2.257258725346222)),
  ('eby', np.float64(2.22981129230537)),
  ('president', np.float64(2.2176212929840626)),
  ('ottawa', np.float64(2.1493419532526756)),
  ('policy', np.float64(2.126283633179033))],
 'Neutral': [('carney', np.float64(19.479482091321124)),
  ('canada', np.float64(14.76600

In [29]:
def tf_idf_by_source(df, top_k=20):
    """
    Compute the top TF-IDF terms separately for each news source.

    Rows are grouped on the "source" column and each group's combined
    title/description texts are scored on their own.

    Returns:
        dict: source -> list[(term, score)], sorted by score descending
        and cut to the first top_k entries.
    """
    top_terms_per_source = {}
    for outlet, outlet_rows in df.groupby("source"):
        ranked = sort_tf_idf(
            get_tf_idf_scores_scipy(get_combined_titles_descs(outlet_rows))
        )
        top_terms_per_source[outlet] = ranked[:top_k]
    return top_terms_per_source

# Top terms per source with the default top_k; the returned dict is this cell's output.
tf_idf_by_source(df=all_data)

{'cbc.ca': [('carney', np.float64(11.499351131784662)),
  ('minister', np.float64(7.93660604736766)),
  ('prime', np.float64(7.5769085464127475)),
  ('mark', np.float64(7.534597623980839)),
  ('government', np.float64(5.4835976289971535)),
  ('canada', np.float64(5.251607786811516)),
  ('projects', np.float64(5.045640276139908)),
  ('trump', np.float64(4.888468257184666)),
  ('says', np.float64(4.621341463836952)),
  ('new', np.float64(4.371687459046735)),
  ('budget', np.float64(4.314681004634196)),
  ('president', np.float64(3.9330289014509265)),
  ('trade', np.float64(3.5545505296619067)),
  ('federal', np.float64(3.372983835791544)),
  ('week', np.float64(2.9834062131820422)),
  ('major', np.float64(2.9082715308101066)),
  ('nation', np.float64(2.57248884274671)),
  ('tariffs', np.float64(2.5530470491901136)),
  ('summit', np.float64(2.55182310509176)),
  ('premier', np.float64(2.5505674834327383))],
 'citynews.ca': [('carney', np.float64(2.5339999323561027)),
  ('budget', np.float

In [30]:
def tf_idf_by_topic(df, top_k=20):
    """
    Compute the top TF-IDF terms separately for each open-coding topic.

    Rows are grouped on the "open_coding_topic" column and each group's
    combined title/description texts are scored on their own.

    Returns:
        dict: open_coding_topic -> list[(term, score)], sorted by score
        descending and cut to the first top_k entries.
    """
    grouped = df.groupby("open_coding_topic")
    return dict(
        (
            topic_label,
            sort_tf_idf(
                get_tf_idf_scores_scipy(get_combined_titles_descs(topic_rows))
            )[:top_k],
        )
        for topic_label, topic_rows in grouped
    )

# Top terms per topic with the default top_k; the returned dict is this cell's output.
tf_idf_by_topic(df=all_data)

{'Election': [('canada', np.float64(2.7381595870610975)),
  ('trump', np.float64(2.4687768416842593)),
  ('election', np.float64(2.3539031376210082)),
  ('carney', np.float64(2.2457486827661732)),
  ('mark', np.float64(2.0321711504799005)),
  ('minister', np.float64(1.63229092320026)),
  ('prime', np.float64(1.63229092320026)),
  ('president', np.float64(1.5619701415138423)),
  ('poilievre', np.float64(1.4021674810164921)),
  ('liberal', np.float64(1.3036388441115108)),
  ('party', np.float64(1.2424669194681797)),
  ('pierre', np.float64(1.2201846371264709)),
  ('anti', np.float64(1.1955512164816098)),
  ('tariffs', np.float64(1.150665307633171)),
  ('leader', np.float64(1.14009757021838)),
  ('threats', np.float64(1.1108533669048057)),
  ('canadian', np.float64(1.0934039223464793)),
  ('lead', np.float64(0.9658525419269974)),
  ('political', np.float64(0.9283963405312813)),
  ('new', np.float64(0.9129889443816125))],
 'International': [('carney', np.float64(17.457200974130824)),
  ('c

In [34]:
def tf_idf_for_topic_and_sentiment(df, topic, top_k=20):
    """
    Compute the top TF-IDF terms per sentiment within a single topic.

    Only rows whose "open_coding_topic" equals `topic` are kept; those rows
    are then grouped on the "sentiment" column and each group is scored on
    its own.

    Returns:
        dict: sentiment_value -> list[(term, score)], sorted by score
        descending and cut to the first top_k entries. Keys are the values
        found in the "sentiment" column for that topic (e.g. "Negative",
        "Neutral").
    """
    in_topic = df["open_coding_topic"] == topic
    per_sentiment = df[in_topic].groupby("sentiment")

    return {
        label: sort_tf_idf(
            get_tf_idf_scores_scipy(get_combined_titles_descs(rows))
        )[:top_k]
        for label, rows in per_sentiment
    }
# Per-sentiment top terms restricted to the "National" topic.
topic = "National"
National_words_by_sentiment = tf_idf_for_topic_and_sentiment(
    df=all_data,
    topic=topic,
)
print(National_words_by_sentiment)

{'Negative': [('carney', np.float64(4.2276095141487495)), ('budget', np.float64(3.088199556267111)), ('minister', np.float64(3.029668449330432)), ('canada', np.float64(2.928257333622755)), ('prime', np.float64(2.922557264260019)), ('mark', np.float64(2.8975303996803805)), ('government', np.float64(2.801938185818867)), ('federal', np.float64(2.19651025199126)), ('projects', np.float64(2.1544021411321013)), ('eby', np.float64(2.1058617037086447)), ('pipeline', np.float64(2.0652765875538326)), ('new', np.float64(1.80838097300822)), ('public', np.float64(1.7394397819120793)), ('major', np.float64(1.6135563659542993)), ('policy', np.float64(1.5516859422154237)), ('says', np.float64(1.5383476329419146)), ('premier', np.float64(1.4315803059368952)), ('foreign', np.float64(1.4274266291708062)), ('feminist', np.float64(1.2977898733738578)), ('claim', np.float64(1.2403829156302937))], 'Neutral': [('carney', np.float64(8.157404830983078)), ('projects', np.float64(6.115375257554473)), ('minister',

In [43]:
def _normalize_scores(scores_dict):
    """
    Rescale a term -> score mapping so that the largest score becomes 1.

    Args:
        scores_dict: dict mapping term -> score (may be empty or None).

    Returns:
        dict with the same keys and each score divided by the maximum
        score. An empty/None input yields {}; if the maximum is 0, every
        term maps to 0.0 so no division by zero occurs.
    """
    if not scores_dict:
        return {}

    peak = max(scores_dict.values())
    if peak == 0:
        return dict.fromkeys(scores_dict, 0.0)

    rescaled = {}
    for term, score in scores_dict.items():
        rescaled[term] = score / peak
    return rescaled

def divisive_words_for_topic(df, topic, top_k=20):
    """
    Returns the words that are salient in BOTH positive and negative articles
    for a given topic, sorted by a 'divisiveness' score.

    Each side's TF-IDF scores are rescaled by that side's maximum score
    (via _normalize_scores). A term's divisiveness is the smaller of its two
    rescaled scores, so a term only ranks high when it scores high on both
    sides. Terms that appear on only one side are dropped.

    Args:
        df: DataFrame with "open_coding_topic", "sentiment", "title" and
            "description" columns.
        topic: value of "open_coding_topic" to analyse.
        top_k: maximum number of (term, score) pairs to return.

    Returns:
        list[(term, score)] sorted by score descending, at most top_k long.
        Empty when the topic has no "Positive" or no "Negative" articles.
    """
    topic_df = df[df["open_coding_topic"] == topic]
    pos_df = topic_df[topic_df["sentiment"] == "Positive"]
    neg_df = topic_df[topic_df["sentiment"] == "Negative"]

    # With no articles on one side there can be no shared terms. Return early
    # rather than hand an empty document list to get_tf_idf_scores_scipy,
    # whose behaviour on empty input is defined outside this notebook.
    if pos_df.empty or neg_df.empty:
        return []

    pos_scores = get_tf_idf_scores_scipy(get_combined_titles_descs(pos_df))
    neg_scores = get_tf_idf_scores_scipy(get_combined_titles_descs(neg_df))

    # Put both sides on a 0..1 scale so they are comparable regardless of
    # how many articles each side has.
    pos_norm = _normalize_scores(pos_scores)
    neg_norm = _normalize_scores(neg_scores)

    common_terms = set(pos_norm.keys()) & set(neg_norm.keys())

    divisive_scores = {
        term: min(pos_norm[term], neg_norm[term])  # high in both
        for term in common_terms
    }

    return sort_tf_idf(divisive_scores)[:top_k]

# Top 30 terms that score high in both Positive and Negative "National" articles.
National_divisive_words = divisive_words_for_topic(
    df=all_data,
    topic="National",
    top_k=30,
)
print(National_divisive_words)

[('carney', np.float64(1.0)), ('prime', np.float64(0.6500054215261272)), ('minister', np.float64(0.6500054215261272)), ('mark', np.float64(0.6464656726381828)), ('government', np.float64(0.5468558982955354)), ('projects', np.float64(0.5096029171856713)), ('budget', np.float64(0.4587955396311343)), ('canada', np.float64(0.4432604620787548)), ('new', np.float64(0.42775496813412456)), ('major', np.float64(0.38167109818305844)), ('says', np.float64(0.36388120231857995)), ('federal', np.float64(0.36127867901257893)), ('pipeline', np.float64(0.359555235100017)), ('premier', np.float64(0.33862642733340315)), ('pm', np.float64(0.28752145616880614)), ('alberta', np.float64(0.2690460176830479)), ('ottawa', np.float64(0.2641916249929423)), ('nation', np.float64(0.24796339083269867)), ('national', np.float64(0.22198306002978782)), ('talks', np.float64(0.21671585867799661)), ('canadian', np.float64(0.20685046263620158)), ('build', np.float64(0.19239248880394197)), ('list', np.float64(0.182873437510