In [93]:
import pandas as pd

# Exploring the overlap between the Hosseinmardi et al. ("h et al.") and MBFC channel labels

In [94]:
# Load the "h et al." channel labels (title, channel id, tag) from the hosseinmardi_labels folder.
h_labels_path = 'hosseinmardi_labels/h_labels.csv'
df_h = pd.read_csv(h_labels_path)
df_h.head()

Unnamed: 0,CHANNEL_TITLE,CHANNEL_ID,H_TAG
0,GBNews,UC0vn8ISa4LKMunLbzaXLnOQ,AW
1,The Black Conservative Preacher,UC_QYq3wFd46rcaoKOvPHFgA,FR
2,Energy and Commerce Committee,UCCbD3bkHRcwiBsaL1lWE_QQ,C
3,That Star Wars Girl,UCRWsFMLzGGJKLnAXJ9U93_A,AW
4,Justin Murphy,UCfrlXVXz-pT9AmmDELhpyOw,AW


In [95]:
# Load the MBFC media ratings and keep only the rows that carry a channel id.
# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the columns assigned to df_mbfc in later cells are written to a real frame
# and do not trigger SettingWithCopyWarning.
df_mbfc = pd.read_csv('media_ratings.csv')
df_mbfc = df_mbfc[df_mbfc['channel_id'].notna()].copy()
df_mbfc.head()

Unnamed: 0,title,country,mbfc_page,bias_raw,factual,label,url,channel_id
0,Alliance for Justice (AFJ),US,left,left8,High,L,https://www.afj.org/,UC4chaUevLgCVSz-RcgqgoZQ
1,Act.TV,US,left,left4,MostlyFactual,L,http://act.tv,UC7yoXZ7c8ZZAimgcqHBtHoQ
4,Al DIA,US,left,left4,Mixed,L,https://aldianews.com,UCGrUeq1FtRamLqZgaCIgB3w
6,American Bridge 21st Century,US,left,left5,Mixed,L,https://americanbridgepac.org/,UCvubOhuUIhYoYnRyXHtcygQ
8,American Oversight,US,left,left7,Mixed,L,https://www.americanoversight.org,UCipcOR6j5MuCzZXXnhfIyjw


In [96]:
# Count the MBFC channel ids that do not occur anywhere in h et al.
# (the comparison is by channel id only).
h_id_channels = set(df_h['CHANNEL_ID'])
mbfc_id_channels = set(df_mbfc['channel_id'])
len(mbfc_id_channels.difference(h_id_channels))

955

In [97]:
# Count the MBFC channel titles that do not occur among the h et al. titles
# (exact string match on the title).
h_name_channels = set(df_h['CHANNEL_TITLE'])
mbfc_name_channels = set(df_mbfc['title'])
len(mbfc_name_channels.difference(h_name_channels))

1110

In [98]:
# Pool everything MBFC has that h et al. lacks: the unmatched channel ids
# together with the unmatched channel titles.
# Ids and titles share one set here, so a channel that is missing on both
# criteria can contribute two entries -- the size is not a channel count.
ids_missing_from_h = mbfc_id_channels - h_id_channels
names_missing_from_h = mbfc_name_channels - h_name_channels
added_mbfc_channels = ids_missing_from_h | names_missing_from_h
len(added_mbfc_channels)

2065

# Combining h et al. and mbfc labels

In [99]:
def in_h(mbfc_id, mbfc_name, h_id_channels, h_name_channels):
    """Tell whether an MBFC channel is already present in the h et al. labels.

    A channel counts as present when either its id is a member of
    ``h_id_channels`` or its title is a member of ``h_name_channels``.

    Returns:
        bool: True on an id match or a title match, False otherwise.
    """
    return mbfc_id in h_id_channels or mbfc_name in h_name_channels
# Flag each MBFC row whose channel id or title is already covered by h et al.
# The flags are built from (id, title) pairs and re-attached to df_mbfc's own
# index so they line up row for row.
id_title_pairs = zip(df_mbfc['channel_id'], df_mbfc['title'])
already_in_h = pd.Series(
    [in_h(cid, title, h_id_channels, h_name_channels) for cid, title in id_title_pairs],
    index=df_mbfc.index,
)
df_mbfc['already_in_h'] = already_in_h
df_mbfc.head()

Unnamed: 0,title,country,mbfc_page,bias_raw,factual,label,url,channel_id,already_in_h
0,Alliance for Justice (AFJ),US,left,left8,High,L,https://www.afj.org/,UC4chaUevLgCVSz-RcgqgoZQ,False
1,Act.TV,US,left,left4,MostlyFactual,L,http://act.tv,UC7yoXZ7c8ZZAimgcqHBtHoQ,True
4,Al DIA,US,left,left4,Mixed,L,https://aldianews.com,UCGrUeq1FtRamLqZgaCIgB3w,False
6,American Bridge 21st Century,US,left,left5,Mixed,L,https://americanbridgepac.org/,UCvubOhuUIhYoYnRyXHtcygQ,False
8,American Oversight,US,left,left7,Mixed,L,https://www.americanoversight.org,UCipcOR6j5MuCzZXXnhfIyjw,False


In [100]:
# Distinct MBFC bias labels, in order of first appearance.
df_mbfc['label'].unique()

array(['L', 'LC', 'C', 'RC', 'R', 'ER', 'EL'], dtype=object)

In [101]:
# MBFC and h et al. use different political-label vocabularies, so each MBFC
# label is translated to an h et al. tag: LC/RC are folded into L/R, and
# EL/ER become FL/FR.
mbfc_h_map = {
    'EL': 'FL',
    'L': 'L',
    'LC': 'L',
    'C': 'C',
    'RC': 'R',
    'R': 'R',
    'ER': 'FR',
}
# Fail loudly on labels the mapping does not cover; Series.map would otherwise
# silently turn them into NaN.
unmapped_labels = set(df_mbfc['label']) - set(mbfc_h_map)
if unmapped_labels:
    raise KeyError(
        f"MBFC labels without an h et al. mapping: {sorted(map(str, unmapped_labels))}"
    )
# Vectorised dictionary lookup on the single column (no row-wise apply needed).
h_labels = df_mbfc['label'].map(mbfc_h_map)
df_mbfc['h_label'] = h_labels
df_mbfc.head()

Unnamed: 0,title,country,mbfc_page,bias_raw,factual,label,url,channel_id,already_in_h,h_label
0,Alliance for Justice (AFJ),US,left,left8,High,L,https://www.afj.org/,UC4chaUevLgCVSz-RcgqgoZQ,False,L
1,Act.TV,US,left,left4,MostlyFactual,L,http://act.tv,UC7yoXZ7c8ZZAimgcqHBtHoQ,True,L
4,Al DIA,US,left,left4,Mixed,L,https://aldianews.com,UCGrUeq1FtRamLqZgaCIgB3w,False,L
6,American Bridge 21st Century,US,left,left5,Mixed,L,https://americanbridgepac.org/,UCvubOhuUIhYoYnRyXHtcygQ,False,L
8,American Oversight,US,left,left7,Mixed,L,https://www.americanoversight.org,UCipcOR6j5MuCzZXXnhfIyjw,False,L


In [102]:
# Keep only the MBFC rows that were not flagged as already present in h et al.
not_in_h_mask = ~df_mbfc['already_in_h']
additional_df_mbfc = df_mbfc[not_in_h_mask]
additional_df_mbfc.head()

Unnamed: 0,title,country,mbfc_page,bias_raw,factual,label,url,channel_id,already_in_h,h_label
0,Alliance for Justice (AFJ),US,left,left8,High,L,https://www.afj.org/,UC4chaUevLgCVSz-RcgqgoZQ,False,L
4,Al DIA,US,left,left4,Mixed,L,https://aldianews.com,UCGrUeq1FtRamLqZgaCIgB3w,False,L
6,American Bridge 21st Century,US,left,left5,Mixed,L,https://americanbridgepac.org/,UCvubOhuUIhYoYnRyXHtcygQ,False,L
8,American Oversight,US,left,left7,Mixed,L,https://www.americanoversight.org,UCipcOR6j5MuCzZXXnhfIyjw,False,L
10,Americans United for Separation of Church and ...,US,left,left12,High,L,https://www.au.org/,UCTioRIsPAFBF1KqkUEf0guA,False,L


In [103]:
# Rename the MBFC columns to the h et al. column names so the two frames
# share a schema.
h_column_names = {
    "title": "CHANNEL_TITLE",
    "channel_id": "CHANNEL_ID",
    "h_label": "H_TAG",
}
additional_df_mbfc = additional_df_mbfc.rename(columns=h_column_names)
additional_df_mbfc.head()

Unnamed: 0,CHANNEL_TITLE,country,mbfc_page,bias_raw,factual,label,url,CHANNEL_ID,already_in_h,H_TAG
0,Alliance for Justice (AFJ),US,left,left8,High,L,https://www.afj.org/,UC4chaUevLgCVSz-RcgqgoZQ,False,L
4,Al DIA,US,left,left4,Mixed,L,https://aldianews.com,UCGrUeq1FtRamLqZgaCIgB3w,False,L
6,American Bridge 21st Century,US,left,left5,Mixed,L,https://americanbridgepac.org/,UCvubOhuUIhYoYnRyXHtcygQ,False,L
8,American Oversight,US,left,left7,Mixed,L,https://www.americanoversight.org,UCipcOR6j5MuCzZXXnhfIyjw,False,L
10,Americans United for Separation of Church and ...,US,left,left12,High,L,https://www.au.org/,UCTioRIsPAFBF1KqkUEf0guA,False,L


In [104]:
# Stack the extra MBFC rows on top of the h et al. rows.
# join="inner" keeps only the columns that both frames have in common.
frames_to_stack = [additional_df_mbfc, df_h]
final_df = pd.concat(frames_to_stack, join="inner")
final_df.head()

Unnamed: 0,CHANNEL_TITLE,CHANNEL_ID,H_TAG
0,Alliance for Justice (AFJ),UC4chaUevLgCVSz-RcgqgoZQ,L
4,Al DIA,UCGrUeq1FtRamLqZgaCIgB3w,L
6,American Bridge 21st Century,UCvubOhuUIhYoYnRyXHtcygQ,L
8,American Oversight,UCipcOR6j5MuCzZXXnhfIyjw,L
10,Americans United for Separation of Church and ...,UCTioRIsPAFBF1KqkUEf0guA,L


In [105]:
# Dimensions (rows, columns) of the combined label table.
final_df.shape

(8194, 3)

In [106]:
# Write the combined labels to disk; index=False leaves the row index out of the CSV.
combined_labels_path = 'h_mbfc_labels.csv'
final_df.to_csv(combined_labels_path, index=False)