NOTE: Topics are interpreted in `../topic_modeling/interpret_topic_modeling_output_prior_data_apr_jun_2023_after_national_outlet_filtering.ipynb`

In [26]:
import os
import random

import little_mallet_wrapper as lmw
import numpy as np
import pandas as pd
from sklearn import metrics

In [2]:
# Seed passed as random_state when the annotation sample is drawn from combined_df.
RANDOM_SEED = 42

# Number of rows drawn into the annotation sample.
SAMPLE_SIZE = 100

# Directory containing the MALLET output files (topic keys and topic distributions)
# that are loaded further down.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
MALLET_OUTPUT_PATH = '/home/pranavgoel/mallet_output/trans_fer_entropy_apr_jun_2023_post_filtering_national_outlets_in_state_media/'

In [3]:
# Load the combined texts CSV; every later sample in this notebook is drawn from this frame.
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
combined_df = pd.read_csv('/home/pranavgoel/trans-fer-entropy/topic_modeling/prior_data_apr_jun_2023/all_texts_combined_post_filtering_national_outlets_in_state_media.csv')

In [4]:
# Remove the 'Unnamed: 0' column that came in with the CSV (presumably a saved index).
# errors='ignore' makes this cell idempotent: re-running it after the column has
# already been dropped no longer raises KeyError.
combined_df = combined_df.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Number the rows 0..len-1 in file order. doc_id is later used as the lookup key
# into the per-document topic assignments.
# NOTE(review): this assumes the CSV row order matches the document order in the
# MALLET output — confirm against the topic-modeling pipeline.
combined_df['doc_id'] = range(len(combined_df))

In [6]:
# Sanity check of the loaded table: column names, non-null counts and dtypes.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12451 entries, 0 to 12450
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   media_name    12451 non-null  object
 1   publish_date  12451 non-null  object
 2   title         12451 non-null  object
 3   url           12451 non-null  object
 4   subtitle      10122 non-null  object
 5   text          12451 non-null  object
 6   sent_count    12451 non-null  int64 
 7   domain        11507 non-null  object
 8   media_group   12451 non-null  object
 9   doc_id        12451 non-null  int64 
dtypes: int64(2), object(8)
memory usage: 972.9+ KB


In [7]:
# Draw a reproducible random sample of SAMPLE_SIZE rows and append an all-None
# 'Relevance_Label' column (presumably a placeholder for manual labels).
sample_df = (
    combined_df
    .sample(n=SAMPLE_SIZE, random_state=RANDOM_SEED)
    .assign(Relevance_Label=None)
)

In [8]:
# Confirm the sample size and that the new Relevance_Label column is still empty.
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 396 to 1491
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   media_name       100 non-null    object
 1   publish_date     100 non-null    object
 2   title            100 non-null    object
 3   url              100 non-null    object
 4   subtitle         76 non-null     object
 5   text             100 non-null    object
 6   sent_count       100 non-null    int64 
 7   domain           89 non-null     object
 8   media_group      100 non-null    object
 9   doc_id           100 non-null    int64 
 10  Relevance_Label  0 non-null      object
dtypes: int64(2), object(9)
memory usage: 9.4+ KB


In [9]:
# Display the sampled rows; the index labels are the original row positions in combined_df.
sample_df

Unnamed: 0,media_name,publish_date,title,url,subtitle,text,sent_count,domain,media_group,doc_id,Relevance_Label
396,cleveland.com,2023-06-05 16:31:07,Archie Comics is ready to introduce its first ...,https://www.cleveland.com/reckon/2023/06/archi...,,People are making change and breaking down bar...,26,cleveland.com,ohio,396,
247,wkbn.com,2023-05-10 13:14:05,Transgender athlete ban bill moves forward at ...,https://www.wkbn.com/sports/transgender-athlet...,Lawmakers at the Ohio Statehouse voted on Wedn...,Watch a previous NBC4 report on House Bill 6 i...,28,wkbn.com,ohio,247,
9258,kesq.com,2023-04-29 04:03:27.000000,The US has a rich drag history. Here’s why the...,https://kesq.com/news/2023/04/29/the-us-has-a-...,,"Scottie Andrew, CNN\n\nTo many, the stereotypi...",80,kesq.com,california,9258,
1430,Fox News,2023-06-20 06:00:21,UK teacher calls 8th-grader 'despicable' for s...,https://www.foxnews.com/media/teacher-calls-8t...,A U.K. teacher at Rye College in East Sussex c...,A U.K. teacher got into a heated argument with...,32,,nytimes_foxnews,1430,
10324,nbcbayarea.com,2023-05-23 18:46:49.000000,Target Makes Changes to LGBTQ Merchandise for ...,https://www.nbcbayarea.com/news/national-inter...,Target is removing certain items from its stor...,Target is removing certain items from its stor...,15,nbcbayarea.com,california,10324,
...,...,...,...,...,...,...,...,...,...,...,...
7540,New York Sun,2023-06-19 20:07:27.000000,Plaintiffs Lean on Maine’s Anti-Catholic Histo...,https://www.nysun.com/article/plaintiffs-lean-...,Lawsuit says Maine intentionally amended the s...,Lawsuit says Maine intentionally amended the s...,41,nysun.com,newyork,7540,
4099,KTSM,2023-06-03 18:18:50,Look up: This weekend Venus will be at its hig...,https://www.ktsm.com/news/look-up-this-weekend...,Saturday's Strawberry Moon is just a precursor...,(KSWB) — Saturday’s Strawberry Moon is just a ...,12,ktsm.com,texas,4099,
8144,nbcbayarea.com,2023-04-05 18:16:23.000000,Kid Rock Fires Gun at Cases of Bud Light Amid ...,https://www.nbcbayarea.com/news/national-inter...,The singer apparently did not approve of Anheu...,Kid Rock is making it clear he's not a fan of ...,24,nbcbayarea.com,california,8144,
10500,capoliticalreview.com,2023-05-26 01:45:00.000000,Column: A Black guard shot a Black man for sho...,http://www.capoliticalreview.com/capoliticalne...,,"By\n\nAccording to the L.A. Times, the securit...",98,capoliticalreview.com,california,10500,


In [10]:
# Number of topics of the MALLET run to load; it is the suffix of the output file names.
num_topics = 20

# Build the file paths with os.path.join instead of string concatenation:
# MALLET_OUTPUT_PATH already ends in '/', so appending '/mallet...' produced a
# doubled separator. os.path.join handles a base path with or without the
# trailing slash.
topic_keys = lmw.load_topic_keys(
    os.path.join(MALLET_OUTPUT_PATH, 'mallet.topic_keys.' + str(num_topics))
)
topic_distributions = lmw.load_topic_distributions(
    os.path.join(MALLET_OUTPUT_PATH, 'mallet.topic_distributions.' + str(num_topics))
)

In [11]:
# Very basic topic assignment: each document gets the single topic with the
# highest weight in its topic distribution.
#
# The dictionary is keyed by position in topic_distributions, and later cells
# look entries up by combined_df's doc_id. Guard that assumption at least at the
# level of document counts, so a mismatched MALLET run fails loudly instead of
# silently pairing documents with the wrong topics.
# NOTE(review): equal counts do not prove identical ordering — confirm upstream.
assert len(topic_distributions) == len(combined_df), (
    'topic_distributions and combined_df have different numbers of documents; '
    'doc_id cannot be used to index topic assignments'
)

doc_ind_to_topic = {
    doc_ind: np.argmax(distribution)
    for doc_ind, distribution in enumerate(topic_distributions)
}
print(len(doc_ind_to_topic))

12451


In [12]:
# Topic indices judged irrelevant, based on manual interpretation of the topics by
# Pranav alone (single annotator). Documents whose top topic is in this list are
# predicted as label 0 further down; all other documents are predicted as label 1.
trans_irrelev_topic_inds = [6, 10, 11, 13, 14, 15]

In [13]:
# Split the annotation sample in half: a random 50% of the rows becomes `train`,
# and every row whose index label is not in `train` becomes `test`.
# NOTE(review): this split uses its own seed (33), not RANDOM_SEED.
train = sample_df.sample(frac=0.5, random_state=33)
test = sample_df.drop(index=train.index)

In [14]:
# Display the training half of the annotation sample.
train

Unnamed: 0,media_name,publish_date,title,url,subtitle,text,sent_count,domain,media_group,doc_id,Relevance_Label
4773,suntimes.com,2023-04-17 16:14:22,Boston becomes latest major marathon to debut ...,https://chicago.suntimes.com/2023/4/17/2368708...,The Chicago Marathon added a nonbinary divisio...,BOSTON — Cal Calamia waited extra long to run ...,17,suntimes.com,illinois,4773,
9525,counterpunch.org,2023-05-05 01:52:04.000000,The Time for Queer Revolution Is Now,https://www.counterpunch.org/2023/05/05/the-ti...,,by\n\n“Why don’t you guys do something?” Those...,73,counterpunch.org,california,9525,
7540,New York Sun,2023-06-19 20:07:27.000000,Plaintiffs Lean on Maine’s Anti-Catholic Histo...,https://www.nysun.com/article/plaintiffs-lean-...,Lawsuit says Maine intentionally amended the s...,Lawsuit says Maine intentionally amended the s...,41,nysun.com,newyork,7540,
3120,ksat.com,2023-04-13 19:32:13,Missouri to restrict transgender care for mino...,https://www.ksat.com/news/politics/2023/04/13/...,Missouri’s attorney general has announced new ...,FILE - Missouri Attorney General Andrew Bailey...,43,ksat.com,texas,3120,
10080,The Daily Breeze,2023-05-18 18:05:30.000000,"Out & About: Things to do in the Long Beach, S...",https://www.dailybreeze.com/2023/05/18/out-abo...,Upcoming events and activities for people to e...,May 19\n\n61st Annual Armed Forces Day Celebra...,260,dailybreeze.com,california,10080,
429,wdtn.com,2023-06-08 00:17:34,"Biden invites thousands of LGBTQ+ individuals,...",https://www.wdtn.com/news/u-s-world/ap-us-news...,,WASHINGTON (AP) — President Joe Biden on Thurs...,30,wdtn.com,ohio,429,
6290,The Villager Newspaper | Serving West and East...,2023-05-10 11:31:24.000000,Bottcher welcomes exiled Montana state lawmake...,https://gaycitynews.com/bottcher-welcomes-mont...,"Zooey Zephyr, the out trans state lawmaker who...",Sign up for our amNY Sports email newsletter t...,7,gaycitynews.com,newyork,6290,
1358,Fox News,2023-06-14 20:00:48,Rep. Jim Banks scorches summer camp for reques...,https://www.foxnews.com/media/rep-jim-banks-sc...,Rep. Jim Banks shredded a local summer camp in...,"Rep. Jim Banks, R-Ind., called out a summer ca...",26,,nytimes_foxnews,1358,
9258,kesq.com,2023-04-29 04:03:27.000000,The US has a rich drag history. Here’s why the...,https://kesq.com/news/2023/04/29/the-us-has-a-...,,"Scottie Andrew, CNN\n\nTo many, the stereotypi...",80,kesq.com,california,9258,
11226,indiawest.com,2023-06-08 14:33:45.000000,Ayushmann Khurrana Shows Support For LGBTQIA+ ...,https://indiawest.com/ayushmann-khurrana-shows...,Ayushmann Khurana Shows Support For LGBTQIA+ C...,Ayushmann Khurrana Shows Support For LGBTQIA+ ...,10,indiawest.com,california,11226,


In [15]:
# doc_ids of the training rows, in `train`'s row order; used to look up each
# document's assigned topic.
train_doc_inds = train['doc_id'].tolist()
print(len(train_doc_inds))

50


In [17]:
# Load the labelled version of the training sample (columns: url, title, subtitle,
# text, label).
# NOTE(review): absolute, user-specific path — adjust before running on another machine.
train_labeled_sample = pd.read_csv('/home/pranavgoel/trans-fer-entropy/internal_relevance_annotation/train_sample_for_relevance_classifier.csv')

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the cell output.
train_labeled_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   url       50 non-null     object
 1   title     50 non-null     object
 2   subtitle  40 non-null     object
 3   text      50 non-null     object
 4   label     50 non-null     int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ KB
None


In [18]:
# The labelled CSV must list the same articles, in the same order, as `train`:
# labels and predictions are paired purely by position in the cells that follow.
labeled_urls = train_labeled_sample['url'].tolist()
expected_urls = train['url'].tolist()
assert labeled_urls == expected_urls

In [20]:
# Gold labels for the training sample: the integer 'label' column of the labelled CSV,
# in the same row order as `train`.
train_labels = train_labeled_sample['label']
print(len(train_labels))

50


In [22]:
# Topic-based prediction: a document is predicted 0 when its top topic is one of
# the manually flagged irrelevant topics, and 1 otherwise.
pred_labels = [
    0 if doc_ind_to_topic[doc_ind] in trans_irrelev_topic_inds else 1
    for doc_ind in train_doc_inds
]
print(len(pred_labels))

50


In [27]:
# F1 of the topic-based predictions against the manual labels. With f1_score's
# default binary setting this is the F1 of the positive class (label 1).
metrics.f1_score(y_true=train_labels, y_pred=pred_labels)

0.868421052631579

In [28]:
# Confusion matrix of manual labels vs. topic-based predictions:
# rows are the true labels, columns the predicted labels, both in order 0, 1.
metrics.confusion_matrix(y_true=train_labels, y_pred=pred_labels)

array([[ 7,  4],
       [ 6, 33]])

In [30]:
# Side-by-side table of manual label and topic-based prediction, one row per
# training document (rows follow the order of train_labels).
df_comp = pd.DataFrame({
    'label': train_labels,
    'prediction': pred_labels,
})
df_comp

Unnamed: 0,label,prediction
0,1,0
1,1,1
2,1,1
3,1,1
4,0,0
5,1,1
6,1,1
7,1,1
8,1,1
9,1,1


In [32]:
# For every training document where the prediction disagrees with the manual
# label, print (in this order) the label, the prediction and the document's
# assigned top topic, followed by a separator.
for doc_ind, label, prediction in zip(train_doc_inds, train_labels, pred_labels):
    if label == prediction:
        continue
    print(label)
    print(prediction)
    print(doc_ind_to_topic[doc_ind])
    print('\n===\n')

1
0
6

===

0
1
5

===

0
1
4

===

1
0
10

===

0
1
1

===

0
1
18

===

1
0
6

===

1
0
11

===

1
0
13

===

1
0
15

===

