In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import ast
import pickle

%matplotlib notebook

def splitDataFrameList(df,target_column,separator):
	''' df = dataframe to split,
	target_column = the column containing the values to split
	separator = the symbol used to perform the split
	returns: a dataframe with each entry for the target column separated, with each element moved into a new row. 
	The values in the other columns are duplicated across the newly divided rows.
	'''
	def splitListToRows(row,row_accumulator,target_column,separator):
		split_row = row[target_column].split(separator)
		for s in split_row:
			new_row = row.to_dict()
			new_row[target_column] = s
			row_accumulator.append(new_row)
	new_rows = []
	df.apply(splitListToRows,axis=1,args = (new_rows,target_column,separator))
	new_df = pd.DataFrame(new_rows)
	return new_df

In [None]:
df = pd.read_csv('../reading_and_cleaning/cleaned_podcasts.csv', sep='\t', index_col=0)
podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)
# df = df.replace(r'', np.nan, regex=True)
# df = df[pd.notnull(df['guests'])]
# split_hosts = splitDataFrameList(df, 'hosts', ', ')

# for index, row in split_hosts.iterrows():
#     if(row['hosts'] == row['guests']):
#         split_hosts.drop(index=index, inplace=True)

G1 = nx.from_pandas_dataframe(split_hosts, 'guests', 'hosts', edge_attr=['date', 'duration', 'podcast'], create_using=nx.MultiDiGraph())
df

In [None]:
len(G1.edges())

In [None]:
import ast

for index1, row1 in podcast_info.iterrows():
    df1 = df[df['podcast'] == row1['Podcast Name']].copy()
    #print(df1.is_copy)
    hosts = ast.literal_eval(row1['Hosts'])
    #print(hosts[0])
    for host in hosts:
        #print(host)
        for index2, row2 in df1.iterrows():
            if(row2['guests'] == host):
                df1.drop(index=index2, inplace=True)
                #print('dropping', row2['guests'])
        #print(host)
    guest_durations1 = df1.groupby(['guests'])['duration'].sum()
    guest_durations1.sort_values(ascending=False, inplace=True)
    filename = 'top_guests/' + row1['Podcast Name'] + '.csv'
    guest_durations1.to_csv(filename)
    print(row1['Podcast Name'], ' - ', guest_durations1.index[0], guest_durations1.values[0])

# joe_rogan = split_hosts[split_hosts['hosts'] == 'Joe Rogan']
# #guest_durations = joe_rogan['duration'].groupby(joe_rogan['guests']).sum()
# guest_durations = joe_rogan.groupby(['guests'])['duration'].sum()

# guest_durations.sort_values(ascending=False, inplace=True)
# guest_durations.index[0]

In [None]:
hosts = ['Brendan Schaub']
for host in hosts:
    print(host)

In [None]:
guest_durations = split_hosts.groupby(['hosts', 'guests'])['duration'].sum()
guest_durations = guest_durations.reset_index()
guest_durations = guest_durations.sort_values(by='duration', ascending=False)
# guest_durations = pd.DataFrame({'duration' : split_hosts.groupby( ['podcast', 'guests'] ).sum()}).reset_index()
guest_durations

In [None]:
guest_durations = pd.read_csv('../reading_and_cleaning/guest_durations.csv', sep='\t', index_col=0)
G1 = nx.from_pandas_dataframe(guest_durations, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.DiGraph())
#print(G1.edges(data=True)[1])


pr = nx.pagerank(G1, weight='duration')
def save_obj(obj, name ):
    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
save_obj(pr, 'pr_dict')

import operator

sorted_pr = sorted(pr.items(), key=operator.itemgetter(1), reverse=True)
sorted_pr


# top_ten = [i[0] for i in sorted_pr][0:10]
# top_ten

In [None]:
for i in top_ten:
    print(i)
    print(split_hosts[split_hosts['hosts']==i]['date'].iloc[0])

In [None]:
hubs, authorities = nx.hits(G1)


In [None]:
sorted_hubs = sorted(hubs.items(), key=operator.itemgetter(1), reverse=True)
sorted_hubs = pd.DataFrame(sorted_hubs)

In [None]:
sorted_authorities = sorted(authorities.items(), key=operator.itemgetter(1), reverse=True)
sorted_authorities = pd.DataFrame(sorted_authorities)

In [None]:
G2 = nx.from_pandas_dataframe(guest_durations, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.Graph())


In [None]:
nx.center(G2)

In [None]:
nx.periphery(G2)

In [None]:
sorted(nx.eccentricity(G2).items(), key=operator.itemgetter(1), reverse=False)

In [None]:
sorted_close = sorted(nx.closeness_centrality(G2).items(), key=operator.itemgetter(1), reverse=True)
sorted_close = pd.DataFrame(sorted_close)

In [None]:
sorted_bt = sorted(nx.betweenness_centrality(G2).items(), key=operator.itemgetter(1), reverse=True)
sorted_bt = pd.DataFrame(sorted_bt)

In [None]:
sorted_degree = sorted(nx.degree_centrality(G2).items(), key=operator.itemgetter(1), reverse=True)
deg_top_hundred = [i[0] for i in sorted_degree][0:100]
deg_top_hundred = pd.DataFrame(data=deg_top_hundred, columns=['name'])
deg_top_hundred

# sorted_degree = pd.DataFrame(sorted_degree)
# sorted_degree
# sorted_degree_dict = {}
# for index, row in sorted_degree.iterrows():
#     sorted_degree_dict[row[0]] = index
# sorted_degree_dict

In [None]:
sorted(nx.eigenvector_centrality(G2).items(), key=operator.itemgetter(1), reverse=True)

In [None]:
sorted(nx.closeness_vitality(G2).items(), key=operator.itemgetter(1), reverse=True)

In [None]:
node_attr = [pr, hubs, authorities]
nodes_df = pd.DataFrame.from_dict(pr, orient='index')
nodes_df.rename(columns = {0:'pr'}, inplace = True)
nodes_df['hub'] = hubs.values()
nodes_df['auth'] = authorities.values()
nodes_df['eccentricity'] = nx.eccentricity(G2).values()
nodes_df['closeness'] = nx.closeness_centrality(G2).values()
nodes_df['betweenness'] = nx.betweenness_centrality(G2).values()
nodes_df['degree_cen'] = nx.degree_centrality(G2).values()
nodes_df['eigen'] = nx.eigenvector_centrality(G2).values()
nodes_df

In [None]:
podcast_similarities = pd.DataFrame(columns=['podcast1', 'podcast2', 'score'])

for index1, row1 in podcast_info.iterrows():
    summ=0
    df1 = df[df['podcast'] == row1['Podcast Name']]
    guest_durations1 = df1.groupby(['guests'])['duration'].sum()
    guest_durations1 = guest_durations1.reset_index()
    for index2, row2 in podcast_info.iterrows():
        summ=0
        if(index1 >= index2):
            continue
        df2 = df[df['podcast'] == row2['Podcast Name']]
        guest_durations2 = df2.groupby(['guests'])['duration'].sum()
        guest_durations2 = guest_durations2.reset_index()
        for index3, row3 in guest_durations1.iterrows():
            for index4, row4 in guest_durations2.iterrows():
                if(row3['guests'] == row4['guests']):
                    summ += row3['duration']*row4['duration']
        print(row1['Podcast Name'], row2['Podcast Name'], summ)
        index3 = index1+index2
        podcast_similarities.loc[index3] = [row1['Podcast Name'], row2['Podcast Name'], summ]


podcast_similarities.to_csv('podcast_similarities.csv', sep='\t')

In [None]:
import datetime as dt
import time

testdate = dt.datetime(2008, 1, 1, 0, 0, 1)
#print(testdate < df['date'][0])
#dt.datetime.strptime(df['date'][0], '%Y-%m-%d %H:%M:%S')
#testdate < dt.datetime.strptime(df['date'][0], '%Y-%m-%d %H:%M:%S')

#base = dt.datetime.today()
# base = dt.datetime(2018, 4, 1)
# base
#date_list = [base - dt.monthdelta(x) for x in range(0, 144)]
#date_list = [dt.datetime(2008+x/12-x%12, x%12, 1) for x in range(0, 100)]



#dt.datetime.strptime(df['date'][0], '%Y-%m-%d %H:%M:%S')

# print(testdate > date_list[0])


date_list = [dt.datetime(int(2010+(x/12)-(x%12)/12), x%12+1, 1) for x in range(0, 101)]
date_list

In [None]:
dates = [(dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) for x in df['date']]


valid_dates = [(d < dt.datetime(2012, 1, 1)) for d in dates]
df1 = df[valid_dates]

# df1 = df[(d < dt.datetime(2012, 1, 1)) for d in dates]
len(df1)

split_hosts1 = splitDataFrameList(df1, 'hosts', ', ')
guest_durations1 = split_hosts1.groupby(['hosts', 'guests'])['duration'].sum()
guest_durations1 = guest_durations1.reset_index()

G1_1 = nx.from_pandas_dataframe(guest_durations1, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.DiGraph())
pr1 = nx.pagerank(G1_1, weight='duration')

In [None]:
pr1['Joe Rogan']

In [None]:
dates = [(dt.datetime.strptime(x, '%Y-%m-%d %H:%M:%S')) for x in split_hosts['date']]

for date in date_list:
    valid_dates = [(d < date) for d in dates]
    df1 = split_hosts[valid_dates]
    #split_hosts1 = splitDataFrameList(df1, 'hosts', ', ')
    
    guest_durations1 = df1.groupby(['hosts', 'guests'])['duration'].sum()
    guest_durations1 = guest_durations1.reset_index()
    G1_1 = nx.from_pandas_dataframe(guest_durations1, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.DiGraph())
    pr1 = nx.pagerank(G1_1, weight='duration')
    print(date, ' - ', pr1['Joe Rogan'])

In [None]:
import ast

df1 = df
for index1, row1 in podcast_info.iterrows():
    #df1 = df[df['podcast'] == row1['Podcast Name']].copy()
    #print(df1.is_copy)
    hosts = ast.literal_eval(row1['Hosts'])
    #print(hosts[0])
    for host in hosts:
        print(host)
        for index2, row2 in df1.iterrows():
            if(row2['guests'] == host):
                df1.drop(index=index2, inplace=True)

In [None]:
df1 = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)

df1['attr'] = 'guest'

G3 = nx.from_pandas_dataframe(df1, 'guests', 'podcast', edge_attr=['date', 'duration', 'attr'], create_using=nx.Graph())

split_hosts = pd.read_csv('../reading_and_cleaning/split_hosts.csv', sep='\t', index_col=0)
G1 = nx.from_pandas_dataframe(split_hosts, 'guests', 'hosts', edge_attr=['date', 'duration'], create_using=nx.Graph())


#hosts = ast.literal_eval(row1['Hosts'])
podcast_info_split = splitDataFrameList(podcast_info, 'Hosts', ', ')
podcast_info_split['Hosts'] = podcast_info_split['Hosts'] = [g.rstrip("'") for g in podcast_info_split['Hosts']]
podcast_info_split['Hosts'] = podcast_info_split['Hosts'] = [g.rstrip('"') for g in podcast_info_split['Hosts']]
podcast_info_split['Hosts'] = podcast_info_split['Hosts'] = [g.rstrip(']') for g in podcast_info_split['Hosts']]
podcast_info_split['Hosts'] = podcast_info_split['Hosts'] = [g.rstrip("'") for g in podcast_info_split['Hosts']]
podcast_info_split['Hosts'] = podcast_info_split['Hosts'] = [g.lstrip('"') for g in podcast_info_split['Hosts']]
podcast_info_split['Hosts'] = podcast_info_split['Hosts'] = [g.lstrip('[') for g in podcast_info_split['Hosts']]
podcast_info_split['Hosts'] = podcast_info_split['Hosts'] = [g.lstrip("'") for g in podcast_info_split['Hosts']]

podcast_info_split['attr'] = 'host'
G4 = nx.from_pandas_dataframe(podcast_info_split, 'Podcast Name', 'Hosts', edge_attr=['attr'], create_using=nx.Graph())



G3.add_edges_from(G4.edges(data=True))

In [None]:
from nltk import jaccard_distance

def ngrams_split(lst, n):
    counts = dict()
    grams = [''.join(lst[i:i+n]) for i in range(len(lst)-n+1)]
    for gram in grams:
        if gram not in counts:
            counts[gram] = 1
        else:
            counts[gram] += 1
    return set(grams)

def word_search(target, words):
    low_score = 100
    prediction = ''
    words = [w for w in words if w[0]==target[0]]
    for i in range(len(words)):
        #test_word = list(words[i])
        score = jaccard_distance(ngrams_split(target,3), ngrams_split(words[i],3))
        if score<low_score:
            low_score=score
            #print(low_score)
            prediction = words[i]
            #print(words[i])
    return prediction#, low_score

correct_spellings = list(G1.nodes())
correct_spellings


print('Nick Thun' not in correct_spellings)

#word_search('Nic Thun', correct_spellings)
correct_spellings

In [None]:
# node1 = 'Russ Roberts'
# node2 = 'Joe Rogan'
# path = nx.shortest_path(G3, node1, node2)
# path_length = nx.shortest_path_length(G3, node1, node2)
#print(node1 + ' was a guest on ' + path[1] + ' who also had as a guest ' + node2 + '.')
#print(node1 + ' is a host of ' + path[1] + ', who had as a guest ' + path[2] + ', who was also a guest on ' + path[3] + ', who also had as a guest ' + node2 + '.')
# path
import csv

correct_spellings = list(G1.nodes())

cr = pd.DataFrame(correct_spellings, columns=["colummn"])
cr.to_csv('correct_spellings.csv', index=False)

def six_degrees(node1, node2):
    
    if(node1 not in correct_spellings):
        suggestion = word_search(node1, correct_spellings)
        message = "Sorry, we couldn't find " + node1 + " in our database. Did you mean " + suggestion + "?"
    elif (node2 not in correct_spellings):
        suggestion = word_search(node2, correct_spellings)
        message = "Sorry, we couldn't find " + node2 + " in our database. Did you mean " + suggestion + "?"
    else:
    
        path = nx.shortest_path(G3, node1, node2)
        path_length = nx.shortest_path_length(G3, node1, node2)
        message = node1
        #message = '<a target="_blank" href="">' + node1 + '</a>'
        
        for step in range(path_length+1):
            print(step, path[step])
            if(step==0):
                continue
            if(step==1):
                if(G3[node1][path[1]]['attr'] == 'host'):
                    message += ' is a host of '
                else:
                    message += ' was a guest on '
                
                message += path[1]
                #message += '<a target="_blank" href="">' + path[1] + '</a>'
                continue
            if(step % 2 == 0):
                if(G3[path[step-1]][path[step]]['attr']=='guest'):
                    if(step==2):
                        message += ', who had as a guest ' + path[step]
                        #message += ', who had as a guest ' + '<a target="_blank" href="">' + path[step] + '</a>'
                    else:
                        message += ', who also had as a guest ' + path[step]
                        #message += ', who also had as a guest ' + '<a target="_blank" href="">' + path[step] + '</a>'
                if(G3[path[step-1]][path[step]]['attr']=='host'):
                    message += ', which is hosted by ' + path[step]
                    #message += ', which is hosted by ' + '<a target="_blank" href="">' + path[step] + '</a>'
            if(step % 2 == 1):
                message += ', who was a guest on ' + path[step]
                #message += ', who was a guest on ' + '<a target="_blank" href="">' + path[step] + '</a>'

    return message
                
six_degrees('Pres. Barack Obama', 'Sam Harris')

In [None]:
six_degrees("Mark-Paul Gosselaar", 'Viggo Mortensen')

In [None]:
host_list = []
for index1, row1 in podcast_info.iterrows():
    hosts = ast.literal_eval(row1['Hosts'])
    for host in hosts:
        host_list.append(host)

host_list = set(host_list)

In [None]:
import pickle
import json

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

top_host_podcast = load_obj('top_host_podcast')
top_guest_podcast = load_obj('top_guest_podcast')

host_podcasts = load_obj('host_podcasts')
guest_podcasts = load_obj('guest_podcasts')

guest_list = list(top_guest_podcast.keys())

top_ten_pr = [i[0] for i in sorted_pr][0:10]
top_fifty_pr = [i[0] for i in sorted_pr][0:50]

top_ten = pd.DataFrame(columns=['name', 'host_bool', 'host_podcast', 'guest_podcast'])
top_ten['name'] = top_ten_pr

top_fifty = pd.DataFrame(columns=['name', 'host_podcast', 'host_podcasts', 
                                  'guest_podcast', 'guest_podcasts', 
                                  'pr_rank', 'hub_rank', 'auth_rank',
                                  'degree_rank', 'bt_rank', 'close_rank'])
top_fifty['name'] = top_fifty_pr

for index, row in top_fifty.iterrows():
    row['pr_rank'] = index+1
    for index1, value1 in sorted_hubs[0].iteritems():
        if(value1==row['name']):
            row['hub_rank'] = index1+1
    for index1, value1 in sorted_authorities[0].iteritems():
        if(value1==row['name']):
            row['auth_rank'] = index1+1
    for index1, value1 in sorted_degree[0].iteritems():
        if(value1==row['name']):
            row['degree_rank'] = index1+1
    for index1, value1 in sorted_close[0].iteritems():
        if(value1==row['name']):
            row['close_rank'] = index1+1
    for index1, value1 in sorted_bt[0].iteritems():
        if(value1==row['name']):
            row['bt_rank'] = index1+1
    if(row['name'] in host_list):
        row['host_podcast'] = top_host_podcast[row['name']]
        row['host_podcasts'] = list(host_podcasts[row['name']])
    else:
        row['host_podcast'] = ''
        row['host_podcasts'] = ''
    if(row['name'] in guest_list):
        row['guest_podcast'] = top_guest_podcast[row['name']]
        row['guest_podcasts'] = list(guest_podcasts[row['name']])
    else:
        row['guest_podcast'] = ''
        row['guest_podcasts'] = ''


dict_list = top_fifty.to_dict(orient='records')
dict_list[1]

dj_list=[]
for i in range(50):
    d = {}
    d['model'] = 'rankings.People'
    d['pk'] = i
    d['fields'] = dict_list[i]
    dj_list.append(d)

    
dj_list


import json
with open('top_fifty_guest_host.json', 'w') as outfile:
    json.dump(dj_list, outfile)





In [None]:
podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)

p_info  = podcast_info.drop(['feedURL', 'keywords', 'cleaner'], axis=1)
p_info.columns=['name', 'hosts', 'imgurl', 'categories']

p_info['podcast_id'] = podcast_info.index-1

dict_list = p_info.to_dict(orient='records')

dj_podcasts=[]
for i in range(len(p_info)):
    d={}
    d['model'] = 'podcasts.Podcasts'
    d['pk'] = i
    d['fields'] = dict_list[i]
    dj_podcasts.append(d)
    
dj_podcasts

import json
with open('podcast_info.json', 'w') as outfile:
    json.dump(dj_podcasts, outfile)

In [None]:
np.round(5.76, 0)

In [None]:
sorted_close = pd.DataFrame(sorted_close)


In [None]:
from datetime import datetime, timedelta

def sec_to_hours(seconds):
    sec = timedelta(seconds=seconds)
    d = datetime(1,1,1) + sec
    
    if (seconds == 2702145):
        return ('31:6:35:45')
    else:
        return("%02d:%02d:%02d:%02d" % (d.day-1, d.hour, d.minute, d.second))
    
sec_to_hours(27021)

In [None]:
# df1 = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)

# guest_durations_podcast = df1.groupby(['podcast', 'podcast_id', 'guests'])['duration'].sum()
# guest_durations_podcast = guest_durations_podcast.reset_index()


guest_durations_podcast = pd.read_csv('../reading_and_cleaning/guest_durations_podcast.csv', sep='\t', index_col=0)
# guest_durations_podcast.sort_values(by='duration', ascending=False, inplace=True)

len(guest_durations_podcast)

# guest_durations_podcast = guest_durations_podcast[guest_durations_podcast['duration']>600]

guest_durations_podcast['hours'] = ''
for index, row in guest_durations_podcast.iterrows():
    #print(row['duration'], sec_to_hours(row['duration']))
    guest_durations_podcast.at[index, 'hours'] = sec_to_hours(row['duration'])



len(guest_durations_podcast)
guest_durations_podcast

In [None]:
from datetime import datetime as dt

now = dt.now()

df1 = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)

dates = [(dt.strptime(x, '%Y-%m-%d %H:%M:%S')) for x in df1['date']]
sec_delta = [(now-date).total_seconds() for date in dates]
sec_delta
df1['sec_delta'] = sec_delta
df1

guest_recent_date = df1.groupby(['podcast', 'podcast_id', 'guests']).agg({'sec_delta':['min', 'count'], 'date':'first'})   #['date', 'sec_delta'].min()
guest_recent_date = guest_recent_date.reset_index()
guest_recent_date[guest_recent_date['podcast']=='The Joe Rogan Experience']['sec_delta']['count']


In [None]:
guest_durations_podcast['recent'] = guest_recent_date['date']
guest_durations_podcast['count'] = guest_recent_date['sec_delta']['count']

guest_durations_podcast.sort_values(by='duration', ascending=False, inplace=True)
guest_durations_podcast.drop(columns='duration', inplace=True)
guest_durations_podcast
# guest_durations_podcast['recent'] = [(dt.strptime(x, '%Y-%m-%d %H:%M:%S')).date() for x in guest_durations_podcast['recent']]
# guest_durations_podcast['recent'][0]

In [None]:
dict_list = guest_durations_podcast.to_dict(orient='records')


gdp_dj=[]
for i in range(len(guest_durations_podcast)):
    d = {}
    d['model'] = 'rankings.Durations'
    d['pk'] = i
    d['fields'] = dict_list[i]
    gdp_dj.append(d)

    
gdp_dj


import json
with open('guest_duration_podcast.json', 'w') as outfile:
    json.dump(gdp_dj, outfile)

In [None]:
podcast_similarities = pd.read_csv('podcast_similarities.csv', sep='\t', index_col=0)
podcast_similarities.sort_values(by='score', ascending=False, inplace=True)
podcast_similarities = podcast_similarities[podcast_similarities['score']>0]
podcast_similarities

similarities = pd.DataFrame(columns=['podcast1', 'podcast2', 'podcast2_id', 'score'])

num = len(podcast_info)
index=0
for index2, row2 in podcast_info.iterrows():
    df1 = podcast_similarities[podcast_similarities['podcast1']==row2['Podcast Name']].copy()
    df2 = podcast_similarities[podcast_similarities['podcast2']==row2['Podcast Name']].copy()
    df = pd.concat([df1, df2], ignore_index=True)
    df.sort_values(by='score', ascending=False, inplace=True)
    df.reset_index(inplace=True)
#     if(index2==2):
#         print(df)
    for index1, row1 in df.iterrows():
        if(index1>7):
            continue
#         if(index2==2):
#             print(row1['podcast1'], row1['podcast2'])
        if(row1['podcast1']==row2['Podcast Name']):
            podcast2 = row1['podcast2']
            
        else:
            podcast2 = row1['podcast1']
        for index3, row3 in podcast_info.iterrows():
            if(row3['Podcast Name']==podcast2):
                podcast2_id = index3-1
        index+=1
        if(row1['score']>0):
            similarities.loc[index] = [row2['Podcast Name'], podcast2, podcast2_id, row1['score']]
        
similarities

In [None]:
dict_list = similarities.to_dict(orient='records')


sim_dj=[]
for i in range(len(dict_list)):
    d = {}
    d['model'] = 'podcasts.Similar'
    d['pk'] = i
    d['fields'] = dict_list[i]
    sim_dj.append(d)

    
sim_dj


import json
with open('similarities.json', 'w') as outfile:
    json.dump(sim_dj, outfile)

In [None]:
df1 = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)
podcasts_per_guest = df1.groupby(['guests'])['podcast'].count()
# podcasts_per_guest = df1.groupby(['guests'])['duration'].sum()

In [None]:
podcasts_per_guest.values

In [None]:
podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)
guest_durations_podcast = pd.read_csv('../reading_and_cleaning/guest_durations_podcast.csv', sep='\t', index_col=0)

guest_durations_podcast

In [None]:
# podcast_info['percent unique']

for index, row in podcast_info.iterrows():
    podcast_df = guest_durations_podcast[guest_durations_podcast['podcast']==row['Podcast Name']].copy()
    num_guest = len(podcast_df)
    num_unique = 0
    for index1, row1 in podcast_df.iterrows():
        guest_df = guest_durations_podcast[guest_durations_podcast['guests']==row1['guests']].copy()
        if(len(guest_df)==1):
            num_unique+=1
    frac_unique = num_unique/num_guest
    print(row['Podcast Name'], frac_unique, num_guest, num_unique)
#     row['percent unique'] = 100*frac_unique

In [None]:
guest_host = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)
guest_host['date'] = pd.to_datetime(guest_host['date'])
guest_host.sort_values(by='date', inplace=True)
guest_host.reset_index(inplace=True, drop=True)
most_recent_date = guest_host['date'].iloc[-1]
guest_host

In [None]:
podcast_info['avg day diff'] = 0
podcast_info['active'] = False

for index, row in podcast_info.iterrows():
    podcast_df = guest_host[guest_host['podcast']==row['Podcast Name']].copy()
    podcast_df.reset_index(inplace=True, drop=True)
    day_diffs = []
    for index1, row1 in podcast_df.iterrows():
        if(index1>0):
            day_diffs.append(((podcast_df['date'][index1]-podcast_df['date'][index1-1]).total_seconds()/86400))
    most_recent_ep_date = podcast_df['date'].iloc[-1]
    days_since_most_recent = (most_recent_date-most_recent_ep_date).total_seconds()/86400
    active = True
    if(days_since_most_recent > 5*np.mean(day_diffs)):
        active = False
    print(row['Podcast Name'], np.round(np.mean(day_diffs),1), active)
    row['avg day diff'] = np.round(np.mean(day_diffs),1)
    row['active'] = active

In [None]:
guest_durations_podcast = pd.read_csv('../reading_and_cleaning/guest_durations_podcast.csv', sep='\t', index_col=0)

G2 = nx.from_pandas_dataframe(guest_durations, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.Graph())
cliques = list(nx.find_cliques(G2))
cliques.sort(key=len,reverse=True)
cliques

In [None]:
import ast

df1 = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)
split_hosts = pd.read_csv('../reading_and_cleaning/split_hosts.csv', sep='\t', index_col=0)
guest_durations = pd.read_csv('../reading_and_cleaning/guest_durations.csv', sep='\t', index_col=0)
G2 = nx.from_pandas_dataframe(guest_durations, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.Graph())

podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)
host_list = []
for index1, row1 in podcast_info.iterrows():
    hosts = ast.literal_eval(row1['Hosts'])
    for host in hosts:
        host_list.append(host)

host_list = set(host_list)


top_category = {}
for node in G2.nodes():
    if node in host_list:
        #print(node)
        df = split_hosts[split_hosts['hosts']==node]
        host_durations = df.groupby(['podcast'])['duration'].sum()
        host_durations = host_durations.reset_index()
        host_durations = host_durations.sort_values(by='duration', ascending=False)
        #print(host_durations['podcast'])
        top_podcast = host_durations['podcast'][0]
        for index, row in podcast_info.iterrows():
            if(row['Podcast Name']==top_podcast):
                top_cat = ast.literal_eval(row['categories'])[0]
                top_category[node] = top_cat
                #print(node, top_cat)
    else:
        df = df1[df1['guests']==node]
        guest_durations = df.groupby(['podcast'])['duration'].sum()
        guest_durations = guest_durations.reset_index()
        guest_durations = guest_durations.sort_values(by='duration', ascending=False)
        top_podcast = guest_durations['podcast'][0]
        for index, row in podcast_info.iterrows():
            if(row['Podcast Name']==top_podcast):
                top_cat = ast.literal_eval(row['categories'])[0]
                top_category[node] = top_cat

In [None]:
import pickle
def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

    
top_cat_dict = load_obj('top_categories')
top_cat_dict
df_top_cat = pd.DataFrame(list(top_cat_dict.items()), columns=['name', 'cat'])
top_cats = df_top_cat.groupby(['cat']).count()
top_cats.reset_index(inplace=True)
top_cats.sort_values(by='name', inplace=True, ascending=False)
top_cats.reset_index(inplace=True, drop=True)
top_cats
cat_dict = {}
for index, row in top_cats.iterrows():
    cat_dict[row['cat']] = index
    
cat_dict
top_cats['name'].values

w = top_cats['name'].values/sum(top_cats['name'].values)

top_cats['cat'].values

In [None]:
cat_avg_guests = []
cat_std_guests = []
cat_avg_bias = []
cat_std_bias = []

for cat in top_cats['cat'].values:
    num_podcasts=0
    num_guests = []
    bias = []
    for index, row in podcast_info.iterrows():
        cats = ast.literal_eval(row['categories'])
        if(cat==cats[0]):
            podcast_df = guest_durations_podcast[guest_durations_podcast['podcast']==row['Podcast Name']].copy()
            num_guests.append(len(podcast_df))
            bias.append(bias_dict[row['Podcast Name']])
    cat_avg_guests.append(np.mean(num_guests))
    cat_std_guests.append(np.std(num_guests))
    cat_avg_bias.append(np.mean(bias))
    cat_std_bias.append(np.std(bias))

cat_std_bias

In [None]:
## Category Biases

import ast

w = top_cats['name'].values/sum(top_cats['name'].values)
w_dict = {}
for i in range(len(top_cats)):
    w_dict[top_cats['cat'].values[i]] = w[i]

podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)
guest_durations_podcast = pd.read_csv('../reading_and_cleaning/guest_durations_podcast.csv', sep='\t', index_col=0)
bias_dict = {}
for index, row in podcast_info.iterrows():
    podcast_df = guest_durations_podcast[guest_durations_podcast['podcast']==row['Podcast Name']].copy()
    num_guests = len(podcast_df)
    cats = ast.literal_eval(row['categories'])
    top_cat_podcast = cats[0]
    wi = w_dict[top_cat_podcast]
    num_top_cat = 0
    num_all_cat = 0
    for index1, row1 in podcast_df.iterrows():
        if(top_cat_dict[row1['guests']]==top_cat_podcast):
            num_top_cat+=1
    qi = num_top_cat/num_guests
    if(qi==1):
        beta_i = 'Full Bias'
    else:
        beta_i = str(np.round(np.log(wi)/(np.log(qi)),1))
    print(row['Podcast Name'], beta_i)
    bias_dict[row['Podcast Name']] = np.log(wi)/(np.log(qi)-1e-4)

bias_dict

In [None]:
nx.set_node_attributes(G2, 'category', top_category)
cat_matrix = nx.attribute_mixing_matrix(G2, 'category', normalized=False, mapping=cat_dict)

log_cat_matrix = np.log(cat_matrix+1)
log_cat_matrix

In [None]:
podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)

cats_1 = np.zeros(14)
cats_2 = np.zeros(14)
cats_3 = np.zeros(14)
cats_4 = np.zeros(14)

for index, row in podcast_info.iterrows():
    cats = ast.literal_eval(row['categories'])
    for i in range(len(cats)):
#         print(cats[i])
        if(cats[i] not in top_cats['cat'].values):
            continue
        if(i==0):
            cats_1[cat_dict[cats[i]]]+=1
        if(i==1):
            cats_2[cat_dict[cats[i]]]+=1
        if(i==2):
            cats_3[cat_dict[cats[i]]]+=1
        if(i==3):
            cats_4[cat_dict[cats[i]]]+=1

print(cats_1, cats_2, cats_3, cats_4)
cats_4

In [None]:
import numpy as np
import pandas as pd
import networkx as nx
import pickle
import json
import operator
import csv
import ast
from datetime import timedelta
from datetime import datetime as dt

def sec_to_hours(seconds):
    sec = timedelta(seconds=seconds)
    d = dt(1,1,1) + sec
    
    if (seconds == 2702145):
        return ('31:6:35:45')
    else:
        return("%02d:%02d:%02d:%02d" % (d.day-1, d.hour, d.minute, d.second))

podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)

podcast_info['percent_unique'] = 0.0
podcast_info['num_guests'] = 0
podcast_info['num_unique'] = 0
guest_durations_podcast = pd.read_csv('../reading_and_cleaning/guest_durations_podcast.csv', sep='\t', index_col=0)


for index, row in podcast_info.iterrows():
    podcast_df = guest_durations_podcast[guest_durations_podcast['podcast']==row['Podcast Name']].copy()
    num_guests = len(podcast_df)
    num_unique = 0
    for index1, row1 in podcast_df.iterrows():
        guest_df = guest_durations_podcast[guest_durations_podcast['guests']==row1['guests']].copy()
        if(len(guest_df)==1):
            num_unique+=1
    frac_unique = num_unique/num_guests
    podcast_info.at[index, 'percent_unique'] = np.round(100*frac_unique,1)
    podcast_info.at[index, 'num_guests'] = num_guests
    podcast_info.at[index, 'num_unique'] = num_unique


guest_host = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)
guest_host['date'] = pd.to_datetime(guest_host['date'])
guest_host.sort_values(by='date', inplace=True)
guest_host.reset_index(inplace=True, drop=True)
most_recent_date = guest_host['date'].iloc[-1]



podcast_info['avg_day_diff'] = 0.0
podcast_info['active'] = False
podcast_info['premier'] = ''
podcast_info['avg_ep_lengths'] = ''

for index, row in podcast_info.iterrows():
    podcast_df = guest_host[guest_host['podcast']==row['Podcast Name']].copy()
    podcast_df.reset_index(inplace=True, drop=True)
    podcast_info.at[index, 'premier'] = podcast_df['date'].iloc[0]
    day_diffs = []
    ep_lengths = []
    for index1, row1 in podcast_df.iterrows():
        ep_lengths.append(row1['duration'])
        if(index1>0):
            day_diffs.append(((podcast_df['date'][index1]-podcast_df['date'][index1-1]).total_seconds()/86400))
    most_recent_ep_date = podcast_df['date'].iloc[-1]
    days_since_most_recent = (most_recent_date-most_recent_ep_date).total_seconds()/86400
    active = True
    if(days_since_most_recent > 5*np.mean(day_diffs)):
        active = False
    podcast_info.at[index, 'avg_day_diff'] = np.round(np.mean(day_diffs),1)
    podcast_info.at[index, 'active'] = active
    podcast_info.at[index, 'avg_ep_lengths'] = sec_to_hours(np.round(np.mean(ep_lengths),0))

similarities = pd.read_csv('podcast_similarities.csv', sep='\t', index_col=0)
similarities = similarities[similarities['score']>0]

G1 = nx.from_pandas_dataframe(similarities, 'podcast1', 'podcast2', edge_attr=['score'], create_using=nx.Graph())

sorted_close = sorted(nx.closeness_centrality(G1).items(), key=operator.itemgetter(1), reverse=True)
df_sorted_close = pd.DataFrame(sorted_close)
sorted_close_dict = {}
for index, row in df_sorted_close.iterrows():
    sorted_close_dict[row[0]] = index+1
print("Closeness Centrality Done")

sorted_bt = sorted(nx.betweenness_centrality(G1).items(), key=operator.itemgetter(1), reverse=True)
df_sorted_bt = pd.DataFrame(sorted_bt)
print("Betweenness Centrality Done")
sorted_bt_dict = {}
for index, row in df_sorted_bt.iterrows():
    sorted_bt_dict[row[0]] = index+1

sorted_degree = sorted(nx.degree_centrality(G1).items(), key=operator.itemgetter(1), reverse=True)
df_sorted_degree = pd.DataFrame(sorted_degree)
sorted_degree_dict = {}
for index, row in df_sorted_degree.iterrows():
    sorted_degree_dict[row[0]] = index+1

podcast_info['close_rank'] = 0
podcast_info['bt_rank'] = 0
podcast_info['degree_rank'] = 0

p_info  = podcast_info.drop(['feedURL', 'keywords', 'cleaner'], axis=1)
p_info.columns=['name', 'hosts', 'imgurl', 'categories', 'description', 
                'percent_unique', 'num_guests', 'num_unique', 'avg_day_diff', 
                'active', 'premier', 'avg_ep_lengths', 
                'close_rank', 'bt_rank', 'degree_rank']

for index, row in p_info.iterrows():
    p_info.at[index, 'degree_rank'] = sorted_degree_dict[row['name']]
    p_info.at[index, 'close_rank'] = sorted_close_dict[row['name']]
    p_info.at[index, 'bt_rank'] = sorted_bt_dict[row['name']]


p_info['podcast_id'] = podcast_info.index-1

p_info

In [None]:
sorted_degree_dict['The Joe Rogan Experience']

In [None]:
guest_durations_podcast = pd.read_csv('../reading_and_cleaning/guest_durations_podcast.csv', sep='\t', index_col=0)
guest_durations_podcast[guest_durations_podcast['guests']=='Kevin Pollak']

In [None]:
df1 = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)
guest_durations_podcast = df1.groupby(['podcast', 'guests']).agg({'duration':'sum', 'podcast_id':'first'})
guest_durations_podcast = guest_durations_podcast.reset_index()

In [2]:
def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

p_close = load_obj('p_close')
p_close

{'10% Happier with Dan Harris': 0.6216216216216216,
 'Alison Rosen Is Your New Best Friend': 0.8090452261306532,
 'All Things Comedy Live': 0.6708333333333333,
 'Allegedly with Theo Von & Matthew Cole Weiss': 0.6708333333333333,
 'Anna Faris Is Unqualified': 0.688034188034188,
 "Ari Shaffir's Skeptic Tank": 0.7252252252252253,
 'Armchair Expert with Dax Shepard': 0.5532646048109966,
 'Ask Me Another': 0.6145038167938931,
 'Aubrey Marcus Podcast': 0.6168582375478927,
 "Bertcast's podcast": 0.7740384615384616,
 'Bitch Sesh: A Real Housewives Breakdown': 0.6145038167938931,
 'Brody Stevens Festival Of Sports': 0.5609756097560976,
 'Bulletproof Radio': 0.5420875420875421,
 'Bullseye with Jesse Thorn': 0.7777777777777778,
 'Canceled': 0.4984520123839009,
 'Chris Grosso The Indie Spiritualist': 0.5227272727272727,
 'Comedy Bang Bang': 0.6625514403292181,
 'Comedy Film Nerds': 0.6338582677165354,
 'Conversations with Tyler': 0.5384615384615384,
 'Dear Sugars': 0.48936170212765956,
 'Dom Irrer

In [None]:
np.mod(205,100)

In [3]:
guest_durations_podcast = pd.read_csv('../reading_and_cleaning/guest_durations_podcast.csv', sep='\t', index_col=0)

G3 = nx.from_pandas_dataframe(guest_durations_podcast, 'guests', 'podcast', edge_attr=['duration'], create_using=nx.Graph())
top_cat = load_obj('top_categories_guests')

nx.set_node_attributes(G3, 'community', top_cat)
# G3.node['Joe Rogan']['cat']

list(nx.cn_soundarajan_hopcroft(G3, [('Joe Rogan', 'Duncan Trussell')], community='community'))
# list(nx.ra_index_soundarajan_hopcroft(G3))

G3.node['Joe Rogan']['community']

Joe Rogan community
Duncan Trussell community
Ari Shaffir's Skeptic Tank community


NetworkXAlgorithmError: No community information

In [None]:
top_cat = load_obj('top_categories')
top_cat['Jessica Bluemke']

In [None]:
people = set(guest_durations_podcast['guests'])
if('Jessica Bluemke' in people):
    print('yes')
    

In [None]:
top_cat = load_obj('top_categories_guests')

nx.set_node_attributes(G3, 'cat', top_cat)

In [29]:
import re

data = ['Joey "Coco" Diaz', 'Bill Hader','Joey "Momo" Diaz', 'Joey Diaz', '"Weird Al" Yank']

df = pd.DataFrame(data=data, columns=['guests'])
df
def remove_nickname(df):
	nickname = re.compile('[\w\s]+\"[\w\s]+\"[\w\s]+')
	for index, row in df.iterrows():
		if(pd.notnull(row['guests'])):
			name1 = row['guests']
			if(nickname.search(name1)):
				name2 = re.sub(r'\".*\" ', "", row['guests']).strip()
				if(name1!=name2):
					print(name1, name2)
					df.at[index, 'guests'] = name2
                
                
remove_nickname(df)
df
                
# re.sub(r'\".*\" ', "", 'Joey "Coco" Diaz').strip()

Joey "Coco" Diaz Joey Diaz
Joey "Momo" Diaz Joey Diaz


Unnamed: 0,guests
0,Joey Diaz
1,Bill Hader
2,Joey Diaz
3,Joey Diaz
4,"""Weird Al"" Yank"


In [89]:
import pickle
import operator
from datetime import datetime as dt

def load_obj(name ):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)

podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)

guest_host = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)
guest_host['date'] = pd.to_datetime(guest_host['date'])
guest_host.sort_values(by='date', inplace=True)
guest_host.reset_index(inplace=True, drop=True)

guest_host



sorted_hubs = load_obj('sorted_hubs')
sorted_hubs = sorted(sorted_hubs.items(), key=operator.itemgetter(1))
sorted_hubs

top_hubs = [x[0] for x in sorted_hubs][0:500]
top_hubs

g_hubs = load_obj('g_hubs')
g_hubs['Moshe Kasher']

hub_first_podcast = {}
for hub in top_hubs:
    hub_df = guest_host[guest_host['guests']==hub].copy()
    hub_first_podcast[hub] = hub_df['podcast'].iloc[0]
#     print(hub, hub_df['podcast'].iloc[0])




start = dt(2017, 1, 1)
end = dt(2018, 1, 1)
         
split_hosts = pd.read_csv('../reading_and_cleaning/split_hosts.csv', sep='\t', index_col=0)
guest_durations = split_hosts.groupby(['hosts', 'guests'])['duration'].sum()
guest_durations = guest_durations.reset_index()

split_hosts['date'] = pd.to_datetime(split_hosts['date'])
split_hosts.sort_values(by='date', inplace=True)
split_hosts.reset_index(inplace=True, drop=True)
dates = [x for x in split_hosts['date']]

valid_dates_start = [(d < start) for d in dates]
start_df = split_hosts[valid_dates_start]
start_df.reset_index(inplace=True, drop=True)
         
valid_dates_end = [(d < end) for d in dates]
end_df = split_hosts[valid_dates_end]
end_df.reset_index(inplace=True, drop=True)

print(len(start_df), len(end_df), len(split_hosts))
print(start_df['date'].iloc[-1], end_df['date'].iloc[-1], split_hosts['date'].iloc[-1])

guest_durations_start = start_df.groupby(['hosts', 'guests'])['duration'].sum()
guest_durations_start = guest_durations_start.reset_index()
G1_start = nx.from_pandas_dataframe(guest_durations_start, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.Graph())
bt_start = nx.betweenness_centrality(G1_start)

guest_durations_end = end_df.groupby(['hosts', 'guests'])['duration'].sum()
guest_durations_end = guest_durations_end.reset_index()
G1_end = nx.from_pandas_dataframe(guest_durations_end, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.Graph())
bt_end = nx.betweenness_centrality(G1_end)


# G1_now = nx.from_pandas_dataframe(guest_durations, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.Graph())
# bt_now = nx.betweenness_centrality(G1_now)

in_bt_start = [x for x in bt_start]

host_list = []
for index1, row1 in podcast_info.iterrows():
    hosts = ast.literal_eval(row1['Hosts'])
    for host in hosts:
        host_list.append(host)

host_list = set(host_list)

bt_diff = {}
for key in bt_end:
    if(key not in host_list):
        if(key in in_bt_start):
            bt_diff[key] = bt_end[key]-bt_start[key]
        else:
            bt_diff[key] = bt_end[key]
sorted_bt_diff = sorted(bt_diff.items(), key=operator.itemgetter(1), reverse=True)
sorted_bt_diff


23093 33794 40019
2016-12-31 17:50:21 2017-12-31 23:59:00 2018-06-11 22:12:54


[('Reza Aslan', 0.03027434908238988),
 ('Dan Flores', 0.025488622567003266),
 ('Timothy Snyder', 0.014675274808796844),
 ('William Garriott', 0.009422627275755099),
 ('Alexis Ohanian', 0.00821186697177778),
 ('Gary Taubes', 0.00790069203317155),
 ('Danny Goldberg', 0.00474784145406407),
 ('Naomi Klein', 0.0034049491632342738),
 ('Randy Olson', 0.002676108855666514),
 ('Keeanga-Yamahtta Taylor', 0.0026555676089486113),
 ('danah boyd', 0.00224847020406072),
 ("P.J. O'Rourke", 0.002155239751316357),
 ('Tiffany Haddish', 0.002122982863329847),
 ('Greg Jenner', 0.002077563248131727),
 ('Robert Wright', 0.0019876455032829463),
 ('Wyclef Jean', 0.0018940852268744112),
 ('Jen Kirkman', 0.0017268976806299804),
 ('Mark Duplass', 0.0016522981895537951),
 ('Jordan Peterson', 0.0016097913929590604),
 ('Louis Theroux', 0.00160260982988105),
 ('Tristan Harris', 0.001586111264144425),
 ('Judd Apatow', 0.0015385495713848727),
 ('Yanis Varoufakis', 0.0014453675275694384),
 ('Sean Carroll', 0.00143731632

In [90]:
print(bt_start['Jordan Peterson'], bt_end['Jordan Peterson'])
g_bt = load_obj('g_bt')
g_bt['Jordan Peterson']
print(bt_start['Jordan Peterson'],bt_end['Jordan Peterson'])
top_bt_diff = [x[0] for x in sorted_bt_diff][0:500]
top_bt_diff

0.0 0.0016097913929590604
0.0 0.0016097913929590604


['Reza Aslan',
 'Dan Flores',
 'Timothy Snyder',
 'William Garriott',
 'Alexis Ohanian',
 'Gary Taubes',
 'Danny Goldberg',
 'Naomi Klein',
 'Randy Olson',
 'Keeanga-Yamahtta Taylor',
 'danah boyd',
 "P.J. O'Rourke",
 'Tiffany Haddish',
 'Greg Jenner',
 'Robert Wright',
 'Wyclef Jean',
 'Jen Kirkman',
 'Mark Duplass',
 'Jordan Peterson',
 'Louis Theroux',
 'Tristan Harris',
 'Judd Apatow',
 'Yanis Varoufakis',
 'Sean Carroll',
 'Paul Bloom',
 'Jennifer Pahlka',
 'Jenny Slate',
 'Ari Melber',
 'Charlamagne Tha God',
 'John Lewis',
 'Norman Lear',
 'Tom Hanks',
 'Terrace Martin',
 'Larry Summers',
 'Adam Carolla',
 'Scott Harrison',
 'A$AP Ferg',
 'Big Boi',
 'Guy Ritchie',
 'Josh Davis',
 'Logic',
 "Lawrence O'Donnell",
 'Leah Remini',
 'Griffin Dunne',
 'Jennifer Burns',
 'Thomas Hazlett',
 'Fred Stoller',
 'David Harvey',
 'Shea Serrano',
 'Aisling Bea',
 'Eddie Pepitone',
 'Bonnie McFarlane',
 'Owen Benjamin',
 'Rob Huebel',
 'Sebastian Junger',
 'Dave',
 'Adam Grant',
 'Elizabeth Gi

In [103]:
hub_first_podcast = {}
for hub in top_hubs:
    hub_df = guest_host[guest_host['guests']==hub].copy()
    hub_first_podcast[hub] = hub_df['podcast'].iloc[0]
#     print(hub, hub_df['podcast'].iloc[0])

bt_diff_first_podcast = {}
for bt in top_bt_diff:
    bt_df = guest_host[guest_host['guests']==bt].copy()
    bt_diff_first_podcast[bt] = bt_df['podcast'].iloc[0]
#     print(bt, bt_df['podcast'].iloc[0])

podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)


podcast_info['hub_leader_score'] = 0.0
podcast_info['bt_diff_leader_score'] = 0.0

# for index, row in podcast_info.iterrows():
#     hub_leader_score = 0
#     bt_diff_leader_score = 0
#     for hub in top_hubs:
#         if(hub_first_podcast[hub]==row['Podcast Name']):
#             hub_leader_score += g_hubs[hub]
#             #print(hub, row['Podcast Name'], leader_score, g_hubs[hub])
#     podcast_info.at[index, 'leader_score'] = leader_score
#     for bt in top_bt_diff:
#         if(bt_diff_first_podcast[bt]==row['Podcast Name']):
#             bt_diff_leader_score += bt_diff[bt]
#     print(row['Podcast Name'], hub_leader_score, bt_diff_leader_score)

    
for index, row in podcast_info.iterrows():
    hub_leader_score = 0
    bt_diff_leader_score = 0
    for hub in top_hubs:
        if(hub_first_podcast[hub]==row['Podcast Name']):
            hub_leader_score += g_hubs[hub]
            #print(hub, row['Podcast Name'], leader_score, g_hubs[hub])
    podcast_info.at[index, 'hub_leader_score'] = np.round(100*hub_leader_score,2)
    for bt in top_bt_diff:
        if(bt_diff_first_podcast[bt]==row['Podcast Name']):
            bt_diff_leader_score += bt_diff[bt]
    podcast_info.at[index, 'bt_diff_leader_score'] = np.round(100*bt_diff_leader_score,2)
    #print(row['Podcast Name'], hub_leader_score, bt_diff_leader_score)
    
podcast_info

Unnamed: 0,Podcast Name,Hosts,feedURL,imageURL,categories,keywords,cleaner,description,hub_leader_score,bt_diff_leader_score
1,The Joe Rogan Experience,['Joe Rogan'],http://joeroganexp.joerogan.libsynpro.com/rss,http://static.libsyn.com/p/assets/7/1/f/3/71f3...,"['Comedy', 'Society & Culture', 'Technology']","comedian,joe,monkey,redban,rogan,talking,ufc",clean_joe_rogan,Conduit to the Gaian Mind,2.84,4.28
2,The Duncan Trussell Family Hour,['Duncan Trussell'],http://feeds.feedburner.com/DuncanTrussell,https://dfkfj8j276wwv.cloudfront.net/images/a5...,"['Comedy', 'Arts', 'Religion and Spirituality']",,clean_duncan_trussel,\n <p>Duncan and a special guest explore ...,1.07,0.03
3,Bertcast's podcast,['Bert Kreischer'],http://bertcast.libsyn.com/rss,http://static.libsyn.com/p/assets/0/c/7/c/0c7c...,['Comedy'],"bert,kreischer,machine,the",clean_bert_kreischer,Comic and man of the world Bert Kreischer shar...,0.26,0.11
4,The Fighter & The Kid,"['Brendan Schaub', 'Bryan Callen']",https://rss.art19.com/fighter-and-the-kid,https://dfkfj8j276wwv.cloudfront.net/images/b8...,"['Sports & Recreation', 'Society & Culture', '...","bellator,ufcfightnight,mixed martial arts,ufc ...",clean_tfatk,\n <p>The Fighter &amp; The Kid is a week...,0.00,0.01
5,Ari Shaffir's Skeptic Tank,['Ari Shaffir'],http://shaffir1.libsyn.com/rss,http://static.libsyn.com/p/assets/b/4/2/5/b425...,"['Comedy', 'Education']","allthingscomedy,ari,arithegreat,burr,comedy,de...",clean_ari_shaffir,A comedy podcast to help better understand hum...,1.42,0.13
6,Under The Skin with Russell Brand,['Russell Brand'],https://rss.art19.com/under-the-skin,https://dfkfj8j276wwv.cloudfront.net/images/96...,"['Comedy', 'Society & Culture', 'Philosophy']",,clean_russell_brand,\n My new podcast 'Under The Skin' asks: ...,0.00,0.00
7,Pointless: with Kevin Pereira,['Kevin Pereira'],http://pointlesspod.libsyn.com/rss,http://static.libsyn.com/p/assets/b/c/b/c/bcbc...,['Comedy'],"attack,attackoftheshow,g4,hackmylife,kevinpere...",clean_kevin_pereira,"The podcast of Kevin Pereira, Live from Super ...",0.00,0.00
8,ID10T with Chris Hardwick,['Chris Hardwick'],https://rss.art19.com/id10t,https://dfkfj8j276wwv.cloudfront.net/images/58...,['Comedy'],"ID10T,hardwick",clean_chris_hardwick,\n <p>I am Chris Hardwick. This podcast u...,3.63,1.53
9,Waking Up with Sam Harris,['Sam Harris'],http://wakingup.libsyn.com/rss,http://static.libsyn.com/p/assets/0/b/e/4/0be4...,"['Science & Medicine', 'Society & Culture', 'R...","currentevents,ethics,neuroscience,philosophy,p...",clean_sam_harris,"Join neuroscientist, philosopher, and best-sel...",0.00,0.32
10,Kill Tony,"['Tony Hinchcliffe', 'Brian Redban']",http://www.deathsquad.tv/feed/,http://is1.mzstatic.com/image/thumb/Music62/v4...,"['Comedy', 'Technology', 'TV & Film']",,clean_kill_tony,,0.00,0.00


In [None]:
top_cat_dict = load_obj('top_categories')
top_cat_dict['Greg Hughes']

In [97]:
centralities = ['pr', 'hub', 'auth', 'close', 'bt']

top_ten_pr = 'hello'

for cen in centralities:
    print(type(cen), 'top_ten_' + cen)
    cen_list = 'top_ten_' + cen
    print()
    top_ten = ast.literal_eval(cen_list)
    print(top_ten)

<class 'str'> top_ten_pr


ValueError: malformed node or string: <_ast.Name object at 0x150cea4240>

In [106]:
split_hosts = pd.read_csv('../reading_and_cleaning/split_hosts.csv', sep='\t', index_col=0)
split_hosts['date'] = pd.to_datetime(split_hosts['date'])
split_hosts.sort_values(by='date', inplace=True)

guest_first = split_hosts[split_hosts['guests']=='Joe Rogan']['date'].iloc[0]

In [109]:
link_pred_data = pd.read_csv('link_pred_data.csv', sep='\t', index_col=0)
link_pred_data

Unnamed: 0,podcast,guest,num_guests,percent_unique,avg_day_diff,same_cat,cat_bias,p_close,p_bt,p_degree,...,ra,ja,ad,pa,cn_sh,ra_sh,wic,guest_dur,host_dur,future_link
0,The Joe Rogan Experience,Diana Adams,532,36.8,2.0,0,6.8,0.863158,0.042637,0.841463,...,0.019231,0.500000,0.253085,1315,1,0.000000,0.0,18675799,0,0
1,The Joe Rogan Experience,Jason Carter,532,36.8,2.0,1,6.8,0.863158,0.042637,0.841463,...,0.000000,0.000000,0.000000,1315,0,0.000000,0.0,0,0,0
2,The Joe Rogan Experience,Listener Call's,532,36.8,2.0,1,6.8,0.863158,0.042637,0.841463,...,0.218182,6.036112,7.050604,1315,67,0.187879,31000.0,26698238796,184279174,0
3,The Joe Rogan Experience,Carolyn L. Kane,532,36.8,2.0,0,6.8,0.863158,0.042637,0.841463,...,0.000000,0.000000,0.000000,1315,0,0.000000,0.0,0,0,0
4,The Joe Rogan Experience,Mike Cummings,532,36.8,2.0,1,6.8,0.863158,0.042637,0.841463,...,0.151515,1.089540,2.386832,1315,20,0.151515,10000.0,267143800,0,0
5,The Joe Rogan Experience,Riley Reid,532,36.8,2.0,1,6.8,0.863158,0.042637,0.841463,...,0.310486,6.295818,8.352675,2630,82,0.310486,41000.0,835129607,0,0
6,The Joe Rogan Experience,Claire Hoffman,532,36.8,2.0,0,6.8,0.863158,0.042637,0.841463,...,0.000000,0.000000,0.000000,1315,0,0.000000,0.0,0,0,0
7,The Joe Rogan Experience,James Pennebaker,532,36.8,2.0,1,6.8,0.863158,0.042637,0.841463,...,0.018349,0.311111,0.426317,1315,4,0.018349,2000.0,62254731,0,0
8,The Joe Rogan Experience,Katherine Ozment,532,36.8,2.0,0,6.8,0.863158,0.042637,0.841463,...,0.008130,0.250000,0.207806,1315,1,0.000000,0.0,3139920,0,0
9,The Joe Rogan Experience,James Ladyman,532,36.8,2.0,0,6.8,0.863158,0.042637,0.841463,...,0.013158,0.166667,0.230908,1315,2,0.013158,1000.0,19246744,0,0


In [128]:
p_info = pd.read_csv('p_info.csv', sep='\t', index_col=0)

p_info_1of4 = p_info[p_info.index < 41]
mask1 = p_info.index > 40
p_info_2of4 = p_info[mask1]
mask2 = p_info_2of4.index < 82
p_info_2of4 = p_info_2of4[mask2]

mask3 = p_info.index > 81
p_info_3of4 = p_info[mask3]
mask4 = p_info_3of4.index < 123
p_info_3of4 = p_info_3of4[mask4]

mask5 = p_info.index > 122
p_info_4of4 = p_info[mask5]
p_info_2of4


p_info_1of4.to_csv('p_info_1of4.csv', sep='\t')
p_info_2of4.to_csv('p_info_2of4.csv', sep='\t')
p_info_3of4.to_csv('p_info_3of4.csv', sep='\t')
p_info_4of4.to_csv('p_info_4of4.csv', sep='\t')

In [129]:
p_info_1of4

Unnamed: 0,name,hosts,imgurl,categories,description,percent_unique,num_guests,num_unique,avg_day_diff,active,premier,avg_ep_lengths,cat_bias,hub_leader_score,bt_diff_leader_score,close_rank,bt_rank,degree_rank
1,The Joe Rogan Experience,['Joe Rogan'],http://static.libsyn.com/p/assets/7/1/f/3/71f3...,"['Comedy', 'Society & Culture', 'Technology']",Conduit to the Gaian Mind,36.8,532,196,2.0,True,2009-12-24 05:00:00,02:38:04,6.8,0,0,2,1,2
2,The Duncan Trussell Family Hour,['Duncan Trussell'],https://dfkfj8j276wwv.cloudfront.net/images/a5...,"['Comedy', 'Arts', 'Religion and Spirituality']",\n <p>Duncan and a special guest explore ...,35.1,171,60,4.0,True,2010-01-10 23:27:00,01:36:34,7.5,0,0,5,6,5
3,Bertcast's podcast,['Bert Kreischer'],http://static.libsyn.com/p/assets/0/c/7/c/0c7c...,['Comedy'],Comic and man of the world Bert Kreischer shar...,28.0,250,70,5.3,True,2012-12-12 16:36:00,02:04:33,13.6,0,0,7,11,6
4,The Fighter & The Kid,"['Brendan Schaub', 'Bryan Callen']",https://dfkfj8j276wwv.cloudfront.net/images/b8...,"['Sports & Recreation', 'Society & Culture', '...",\n <p>The Fighter &amp; The Kid is a week...,29.4,109,32,6.5,True,2015-08-14 06:01:30,01:45:46,5.2,0,0,17,30,18
5,Ari Shaffir's Skeptic Tank,['Ari Shaffir'],http://static.libsyn.com/p/assets/b/4/2/5/b425...,"['Comedy', 'Education']",A comedy podcast to help better understand hum...,47.6,273,130,5.6,True,2011-09-28 20:00:13,02:16:08,58.6,0,0,18,23,15
6,Under The Skin with Russell Brand,['Russell Brand'],https://dfkfj8j276wwv.cloudfront.net/images/96...,"['Comedy', 'Society & Culture', 'Philosophy']",\n My new podcast 'Under The Skin' asks: ...,66.0,47,31,7.9,True,2017-03-09 04:00:00,01:08:45,7.2,0,0,136,51,136
7,Pointless: with Kevin Pereira,['Kevin Pereira'],http://static.libsyn.com/p/assets/b/c/b/c/bcbc...,['Comedy'],"The podcast of Kevin Pereira, Live from Super ...",55.3,94,52,11.9,False,2014-12-08 08:00:00,01:28:23,41.3,0,0,71,87,71
8,ID10T with Chris Hardwick,['Chris Hardwick'],https://dfkfj8j276wwv.cloudfront.net/images/58...,['Comedy'],\n <p>I am Chris Hardwick. This podcast u...,43.0,772,332,3.2,True,2010-02-08 17:09:00,01:14:59,9.1,0,0,1,2,1
9,Waking Up with Sam Harris,['Sam Harris'],http://static.libsyn.com/p/assets/0/b/e/4/0be4...,"['Science & Medicine', 'Society & Culture', 'R...","Join neuroscientist, philosopher, and best-sel...",44.3,88,39,12.8,True,2014-10-28 17:23:34,01:47:20,9.5,0,0,111,75,116
10,Kill Tony,"['Tony Hinchcliffe', 'Brian Redban']",http://is1.mzstatic.com/image/thumb/Music62/v4...,"['Comedy', 'Technology', 'TV & Film']",,11.3,97,11,1.0,False,2016-07-26 00:21:46,01:40:14,98.2,0,0,35,60,35
