In [1]:
import pandas as pd
import numpy as np
import random
import math

## Automatic labels

In [2]:
# dict of {channel -> {topic --> True/False (on-topic?)} }
#labeled_dict = {}
#labeled_dict_extra = {}
topics = ['alt-right', 'antitheist', 'politics-left', 'politics-right']

## Manual labels

For each dataset, remove duplicate rows for given topic/channel_id
- note that, had I been diligent enough to ALWAYS label the first occurence, we could use pd.Index.duplicatey(keep='first'). However, I think I had a few mistakes in labeling the last but not first occurence that render this strategy poor to use
- continuously add to an "info" dictionary" that is indexed by topic/channel_id

In [3]:
def remove_dup(x):
    if (x['topic'],x['channel_id']) not in consolidate_dict.keys():
        consolidate_dict[(x['topic'],x['channel_id'])] = {}
    row_dict = consolidate_dict[(x['topic'],x['channel_id'])]
    
    for c in ['on_topic','potential_other_category','update_required','notes']:
        if c not in row_dict.keys():
            row_dict[c] = x[c]

In [4]:
def add_to_labeled_dict(x, labeled_dict):
    #print(x)
    topic = x['topic'].strip()
    channel_id = x['channel_id']
    on_topic = x['on_topic']
    potential_other_category = x['potential_other_category']
    update_required = x['update_required']
    notes = x['notes']
    
    info_dict = {}
    if channel_id in labeled_dict.keys():
        info_dict = labeled_dict[channel_id]
    
    # Explicitly labeled channel_id/topic pairs get placed here
    if topic in topics and type(channel_id) == str and channel_id[:2] == 'UC':
        info_dict[topic] = on_topic

    # If there is some sort of '***' here then we will mark that an update is required
    if type(update_required) == str and len(update_required)>1:
        info_dict['{0}_update_required'.format(topic)]=True
    else:
        info_dict['{0}_update_required'.format(topic)]=False
    
    # If I didn't note any other potential categories, then I can safely assume it's false for other topics too
    if type(potential_other_category) != str:
        for other_topic in topics:
            if other_topic != topic and other_topic not in info_dict.keys():
                info_dict[other_topic] = False
                info_dict['{0}_update_required'.format(other_topic)]=False
    
    # Append all notes taken of the channel
    if 'notes' in info_dict.keys():
        info_dict['notes'] += '|{0}'.format(notes)
    else:
        info_dict['notes'] = '|{0}'.format(notes)

    labeled_dict[channel_id] = info_dict

In [5]:
# Stain topic validation
df_stv = pd.read_csv('./labeled_12-31/channel labeling - stain_topic_validation.csv')
df_stv['on_topic'] = df_stv['on_topic'].replace({'yes':True,'no':False})
df_stv['notes'] = (
    df_stv['notes'] + 
    ['|' for i in range(df_stv.shape[0])] +
    df_stv['Unnamed: 6'].fillna('') + 
    ['|' for i in range(df_stv.shape[0])] +
    df_stv['Unnamed: 7'].fillna('')
)
#df_stv.head()

# scrub end understimation
df_scrub_end = pd.read_csv('./labeled_12-31/channel labeling - end_scrub_underestimation.csv')
#df_scrub_end.head()

df_12_26 = pd.read_csv('./labeled_12-31/Manual labeling 12_26 - homepage_manual_labeling_12-26.csv')
#df_12_26.head()

df_1_2 = pd.read_csv('./labeled_12-31/Manual labeling 12_26 - homepage_manual_labeling_1-2.csv')
#df_1_2.head()

df_1_3 = pd.read_csv('./labeled_12-31/Manual labeling 12_26 - homepage_manual_labeling_1-3.csv')
#df_1_3.head()

df_1_4 = pd.read_csv('./labeled_12-31/Manual labeling 12_26 - rest_labeling_1-4.csv')
df_1_4['potential_other_category'] = ['dummy' for i in range(df_1_4.shape[0])]
#df_1_4.head()

In [6]:
df_1_4.head()

Unnamed: 0,channel_id,topic,on_topic,update_required,notes,potential_other_category
0,UCzcRQ3vRNr6fJ1A9rqFn7QA,alt-right,False,,movie trailers,dummy
1,UCpIT_wPiiiZrQNzMRv_UEag,alt-right,False,,vlogs,dummy
2,UC4mnZmx_t0T-OM70n6ZSeTw,alt-right,False,,soccer,dummy
3,UCWo4IA01TXzBeGJJKWHOG9g,alt-right,False,,Harvard Business Review,dummy
4,UCZRtpJI7ihA6EO49RzPmAbg,alt-right,False,,vlogs,dummy


In [7]:
manual_labels_dict = {}
for df in [df_stv,df_scrub_end,df_12_26,df_1_2,df_1_3,df_1_4]:
    
    # remove duplicate rows for a given channel_id/topic
    consolidate_dict = {}
    df.apply(lambda x: remove_dup(x), axis=1)
    df = pd.DataFrame.from_dict(consolidate_dict,orient='index')
    df.reset_index(inplace=True)
    df.rename(columns={'level_0':'topic','level_1':'channel_id'},inplace=True)
    
    df.apply(lambda x:
        add_to_labeled_dict(x, manual_labels_dict), axis=1
    )

In [8]:
manual_labels_dict['UCVnjt9mMx46gMUeTtONXimQ']

{'politics-left': False,
 'politics-left_update_required': True,
 'notes': '|local news'}

In [9]:
all_man_labels_1_4_df = pd.DataFrame.from_dict(manual_labels_dict,orient='index')
all_man_labels_1_4_df.reset_index(inplace=True)
all_man_labels_1_4_df.rename(columns={'index':'channel_id'},inplace=True)
all_man_labels_1_4_df.head()

Unnamed: 0,channel_id,alt-right,alt-right_update_required,antitheist,antitheist_update_required,politics-left,politics-left_update_required,politics-right,politics-right_update_required,notes
0,UCGTKwjs1ctvgwCRa67RNO-Q,False,False,False,False,False,False,False,False,|heavily-masculine joyr-rides and excavations||
1,UC0QHWhjbe5fGJEPz3sVb6nw,False,False,False,False,False,False,False,False,|doctor mike|||Doctor Mike|nan
2,UCX6OQ3DkcsbYNE6H8uQQuVA,False,False,False,False,False,False,False,False,|mr. beast|||nan|nan|nan|Mr. Beast
3,UCodkb-qBktJI5NrUsPYpf7g,False,False,False,False,False,False,False,False,|Jordan Peterson tips|||nan|nan
4,UCamjnFmK1_lsTWJGWzkC8ew,False,False,False,False,False,False,False,False,|memester||


In [10]:
all_man_labels_1_4_df.to_csv('./important_man_labels_1-4.csv',index=False)


## Apply these labels to our P1/2/3/ data

### Grab the P1/2/3 data

In [18]:
df = pd.read_csv('../all_cleaned_labeled.csv')
df = df[['video_id','channel_id','rank','component','watch_video_id','bot_name','phase','homepage_level','videopage_level','topic','strategy','note','stain']]
df = df[(df['topic']!='random')&(df['rank']<10)]
df.rename(columns={'stain':'on_list'},inplace=True)
df.head()

Unnamed: 0,video_id,channel_id,rank,component,watch_video_id,bot_name,phase,homepage_level,videopage_level,topic,strategy,note,on_list
0,PEWFu0aZroQ,UCXIJgqnII2ZOINSWNOGFThA,0,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False
1,XmiG4KzZ4sg,UCPKAKrjoMz7POptCloy7AIQ,1,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False
2,Lh0b00vL1nI,UCmRZTrJYqOMzP5J0r0snxYg,2,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False
3,k6ucIShHW7Q,UCJquYOG5EL82sKTfH9aMA9Q,3,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False
4,0GCDhADokLY,UC2ggrxKhqgTRx7We9vjQRCQ,4,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False


In [19]:
df_homepage = df[
    (df['component']=='homepage')&
    (df['homepage_level'].isin([1,39,79]))
]

In [20]:
df_videopage = df[(df['phase']=='videopage_experiment')]
df_videopage.head()

Unnamed: 0,video_id,channel_id,rank,component,watch_video_id,bot_name,phase,homepage_level,videopage_level,topic,strategy,note,on_list
0,PEWFu0aZroQ,UCXIJgqnII2ZOINSWNOGFThA,0,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False
1,XmiG4KzZ4sg,UCPKAKrjoMz7POptCloy7AIQ,1,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False
2,Lh0b00vL1nI,UCmRZTrJYqOMzP5J0r0snxYg,2,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False
3,k6ucIShHW7Q,UCJquYOG5EL82sKTfH9aMA9Q,3,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False
4,0GCDhADokLY,UC2ggrxKhqgTRx7We9vjQRCQ,4,videopage,ya2T1XGYTR8,alt-right_delete_0,videopage_experiment,0,0,alt-right,delete,0,False


In [27]:
pd.concat([df_homepage,df_videopage]).groupby(['topic','on_list'])['channel_id'].apply(lambda x: len(x.unique()))

topic           on_list
alt-right       False      747
                True         7
antitheist      False      575
                True        43
politics-left   False      675
                True        18
politics-right  False      543
                True        15
Name: channel_id, dtype: int64

In [26]:
pd.concat([df_homepage,df_videopage]).groupby('on_list')['channel_id'].apply(lambda x: len(x.unique()))

on_list
False    2000
True       83
Name: channel_id, dtype: int64

In [28]:
df_homepage.groupby('on_list')['channel_id'].apply(lambda x: len(x.unique()))

on_list
False    1269
True       65
Name: channel_id, dtype: int64

In [263]:
df_videopage.groupby('bot_name')['videopage_level'].apply(lambda x: x.unique())

bot_name
alt-right_delete_0        [0, 41, 42]
alt-right_delete_1        [0, 41, 42]
alt-right_delete_2            [0, 41]
alt-right_delete_3        [0, 41, 42]
alt-right_delete_4        [0, 40, 41]
                             ...     
politics-right_watch_0    [0, 37, 77]
politics-right_watch_1    [0, 39, 80]
politics-right_watch_2    [0, 39, 80]
politics-right_watch_3    [0, 39, 78]
politics-right_watch_4    [0, 35, 74]
Name: videopage_level, Length: 139, dtype: object

In [264]:
videopage_experiment_levels = []

vp_exp_level = 0
vp_level = 1
bot_name = None

for index, row in df_videopage.iterrows():
    if row['bot_name'] != bot_name:
        bot_name = row['bot_name']
        vp_level = 1
        vp_exp_level = 0
    if row['videopage_level'] != vp_level:
        vp_level = row['videopage_level']
        vp_exp_level += 1
    
    videopage_experiment_levels.append(vp_exp_level)

In [265]:
df_videopage['exp_stage'] = videopage_experiment_levels

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_videopage['exp_stage'] = videopage_experiment_levels


In [266]:
# sanity check
df_videopage.groupby(['bot_name'])['exp_stage'].apply(lambda x: dict(x.value_counts()))

bot_name                 
alt-right_delete_0      1    10.0
                        2    10.0
                        3    10.0
alt-right_delete_1      1    10.0
                        2    10.0
                             ... 
politics-right_watch_3  2    10.0
                        3    10.0
politics-right_watch_4  1    10.0
                        2    10.0
                        3    10.0
Name: exp_stage, Length: 417, dtype: float64

### Label our data 

In [267]:
man_labels_df = pd.read_csv('./important_man_labels_1-4.csv')
man_labels_df.set_index('channel_id',inplace=True)
man_labels_df.head()

Unnamed: 0_level_0,alt-right,alt-right_update_required,antitheist,antitheist_update_required,politics-left,politics-left_update_required,politics-right,politics-right_update_required,notes
channel_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
UCGTKwjs1ctvgwCRa67RNO-Q,False,False,False,False,False,False,False,False,|heavily-masculine joyr-rides and excavations||
UC0QHWhjbe5fGJEPz3sVb6nw,False,False,False,False,False,False,False,False,|doctor mike|||Doctor Mike|nan
UCX6OQ3DkcsbYNE6H8uQQuVA,False,False,False,False,False,False,False,False,|mr. beast|||nan|nan|nan|Mr. Beast
UCodkb-qBktJI5NrUsPYpf7g,False,False,False,False,False,False,False,False,|Jordan Peterson tips|||nan|nan
UCamjnFmK1_lsTWJGWzkC8ew,False,False,False,False,False,False,False,False,|memester||


In [268]:
man_labels_ser = man_labels_df[topics].stack().rename('man_on_topic',inplace=True).rename_axis(['channel_id','topic'])
#man_labels_ser.replace({False:'HELLO'},inplace=True)
man_labels_ser

channel_id                topic         
UCGTKwjs1ctvgwCRa67RNO-Q  alt-right         False
                          antitheist        False
                          politics-left     False
                          politics-right    False
UC0QHWhjbe5fGJEPz3sVb6nw  alt-right         False
                                            ...  
UCVaXclURQZlakiTMzuwHvRw  politics-right    False
UCpAlTKiD0xqo_RRlYGqVX6w  politics-right    False
UC0xaIinhwrms1q5THM37d2Q  politics-right    False
UCp1KrVaZDZ7BOI_QBuTWWmg  politics-right    False
UC4mEUulkjGVSdgKSj5ydxEg  politics-right    False
Name: man_on_topic, Length: 5375, dtype: object

In [269]:
# DONT USE JOIN- IT IS MESSED UP (e.g. ('UCHnyfMqiRRG1u-2MsSQLbXA','alt-right') )
#man_df_homepage = df_homepage.join(man_labels_ser,on=['channel_id','topic'])

In [270]:
df_homepage['new_man_on_topic'] = df_homepage[['channel_id','topic']].apply(
    lambda x: man_labels_ser[(x.channel_id,x.topic)] if (x.channel_id,x.topic) in man_labels_ser.index else pd.NA,
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_homepage['new_man_on_topic'] = df_homepage[['channel_id','topic']].apply(


In [271]:
df_videopage['new_man_on_topic'] = df_videopage[['channel_id','topic']].apply(
    lambda x: man_labels_ser[(x.channel_id,x.topic)] if (x.channel_id,x.topic) in man_labels_ser.index else pd.NA,
    axis=1
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_videopage['new_man_on_topic'] = df_videopage[['channel_id','topic']].apply(


In [272]:
df_homepage[
    (df_homepage['on_list']==False)&
    (df_homepage['new_man_on_topic'].isna())&
    ~(df_homepage['channel_id'].isna())&
    (df_homepage['homepage_level'].isin([1,39,79]))
][['rank','video_id','channel_id','topic','strategy']]
#.to_csv('./homepage_manual_labeling_1-3.csv',index=False)

Unnamed: 0,rank,video_id,channel_id,topic,strategy


In [273]:
df_videopage[
    (df_videopage['on_list']==False)&
    (df_videopage['new_man_on_topic'].isna())&
    ~(df_videopage['channel_id'].isna())&
    (df_videopage['homepage_level'].isin([0,1,2]))
][['rank','video_id','channel_id','topic','strategy']]
#.to_csv('./homepage_manual_labeling_1-3.csv',index=False)

Unnamed: 0,rank,video_id,channel_id,topic,strategy


### Write a final version of the labeled P1/2/3 dataset

In [274]:
df_homepage.shape

(4170, 14)

In [275]:
hp_map = {1:1,39:2,79:3}
df_homepage['exp_stage'] = df_homepage['homepage_level'].apply(lambda x: hp_map[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_homepage['exp_stage'] = df_homepage['homepage_level'].apply(lambda x: hp_map[x])


In [276]:
df_videopage.shape

(4160, 15)

In [277]:
final_df = pd.concat([df_homepage,df_videopage])

In [278]:
final_df.rename(columns={'new_man_on_topic':'manual_label_1-4'},inplace=True)
final_df['on_topic'] = final_df['on_list'] | final_df['manual_label_1-4']

In [279]:
# Sanity check
# Cool!
final_df['on_topic'].isna().sum()

0

In [280]:
# Write it
final_df.to_csv('./bot_runs_labeled_1-4.csv',index=False)