In [2]:
import numpy as np
import pandas as pd
import networkx as nx

%matplotlib notebook

def splitDataFrameList(df,target_column,separator):
    '''Split a delimited string column into one row per element.

    df = dataframe to split,
    target_column = the column containing the values to split
    separator = the symbol used to perform the split
    returns: a new dataframe (with a fresh 0..n-1 index) in which each element
    of the target column sits on its own row. The values in the other columns
    are duplicated across the newly divided rows.

    Cells of the target column that are not strings (e.g. NaN) cannot be split;
    their row is copied through unchanged instead of raising AttributeError.
    An empty input yields an empty frame that still carries df's columns.
    '''
    # Iterate explicitly rather than calling df.apply for its side effects:
    # some older pandas releases invoke the applied function twice on the first
    # row, which would append that row's pieces to the accumulator twice.
    new_rows = []
    for _, row in df.iterrows():
        value = row[target_column]
        pieces = value.split(separator) if isinstance(value, str) else [value]
        for piece in pieces:
            new_row = row.to_dict()
            new_row[target_column] = piece
            new_rows.append(new_row)
    # Passing the columns keeps the original column order and survives empty input.
    return pd.DataFrame(new_rows, columns=df.columns)

In [77]:
# Load the cleaned episode table and the per-podcast metadata
# (both tab-separated, first column used as the index).
df = pd.read_csv('../reading_and_cleaning/cleaned_podcasts.csv', sep='\t', index_col=0)
podcast_info = pd.read_csv('../reading_and_cleaning/meta_podcast_info.csv', sep='\t', index_col=0)

# Treat empty strings as missing, then keep only episodes that have a guest.
df = df.replace('', np.nan)
df = df[df['guests'].notnull()]

# One row per (host, episode): the 'hosts' column holds ', '-separated names.
split_hosts = splitDataFrameList(df, 'hosts', ', ')

# Remove self-loops (a host listed as their own guest) with a single boolean
# mask instead of dropping rows one at a time while iterating.
split_hosts = split_hosts[split_hosts['hosts'] != split_hosts['guests']]

# Directed multigraph guest -> host, one edge per appearance.
# nx.from_pandas_dataframe was removed in NetworkX 2.1; from_pandas_edgelist
# is its replacement and takes the same arguments.
G1 = nx.from_pandas_edgelist(split_hosts, 'guests', 'hosts', edge_attr=['date', 'duration', 'podcast'], create_using=nx.MultiDiGraph())

In [None]:
# Number of guest -> host edges in G1 (parallel edges are counted individually).
G1.number_of_edges()

In [None]:
import ast

# For every podcast: total the duration of each non-host guest, write the
# ranking to top_guests/<podcast>.csv and print the top guest.
for index1, row1 in podcast_info.iterrows():
    podcast_name = row1['Podcast Name']
    # 'Hosts' is stored as the text of a Python list; parse it into host names.
    hosts = ast.literal_eval(row1['Hosts'])

    df1 = df[df['podcast'] == podcast_name]
    # Hosts can also appear in the guest column; filter them out in one
    # vectorised step instead of dropping rows inside nested iterrows() loops.
    df1 = df1[~df1['guests'].isin(hosts)]

    guest_durations1 = df1.groupby(['guests'])['duration'].sum().sort_values(ascending=False)
    filename = 'top_guests/' + podcast_name + '.csv'
    guest_durations1.to_csv(filename)

    # A podcast whose only "guests" were its hosts has an empty ranking;
    # indexing [0] on it would raise IndexError.
    if guest_durations1.empty:
        print(podcast_name, ' - ', 'no guests')
    else:
        print(podcast_name, ' - ', guest_durations1.index[0], guest_durations1.values[0])

# joe_rogan = split_hosts[split_hosts['hosts'] == 'Joe Rogan']
# #guest_durations = joe_rogan['duration'].groupby(joe_rogan['guests']).sum()
# guest_durations = joe_rogan.groupby(['guests'])['duration'].sum()

# guest_durations.sort_values(ascending=False, inplace=True)
# guest_durations.index[0]

In [None]:
# Scratch check: iterate a one-element host list and echo each name.
hosts = ['Brendan Schaub']
for host in hosts:
    print('{}'.format(host))

In [74]:
# Total duration for every (host, guest) pair, largest totals first.
guest_durations = (
    split_hosts
    .groupby(['hosts', 'guests'])['duration']
    .sum()
    .reset_index()
    .sort_values(by='duration', ascending=False)
)
guest_durations

Unnamed: 0,hosts,guests,duration
10984,Joe Rogan,Brian Redban,2702145
11569,Joey Diaz,Lee Syatt,1144116
21512,Tony Hinchcliffe,Pat Regan,532567
2532,Brian Redban,Pat Regan,532567
21493,Tony Hinchcliffe,Josh Martin,515083
2515,Brian Redban,Josh Martin,515083
7131,Doug Stanhope,Greg Chaille,514973
21483,Tony Hinchcliffe,Jeremiah Watkins,514592
2505,Brian Redban,Jeremiah Watkins,510588
21490,Tony Hinchcliffe,Joel Jimenez,503881


In [249]:
# Reload the aggregated (hosts, guests, duration) table from disk; this
# replaces any guest_durations frame computed earlier in the session.
guest_durations = pd.read_csv('../reading_and_cleaning/guest_durations.csv', sep='\t', index_col=0)

# Rebuild G1 as a simple directed graph guest -> host with the summed duration
# on each edge. nx.from_pandas_dataframe was removed in NetworkX 2.1;
# from_pandas_edgelist is its replacement and takes the same arguments.
G1 = nx.from_pandas_edgelist(guest_durations, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.DiGraph())

# PageRank with edges weighted by duration.
pr = nx.pagerank(G1, weight='duration')
def save_obj(obj, name ):
    """Pickle ``obj`` to the file ``name + '.pkl'``.

    obj  -- any picklable object.
    name -- output path without the '.pkl' extension.

    The file is written in binary mode with pickle.HIGHEST_PROTOCOL and is
    overwritten if it already exists. Returns None.
    """
    # Imported here because no cell of the notebook imports pickle, so the
    # function raised NameError on a fresh kernel.
    import pickle

    with open(name + '.pkl', 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
# `operator` is imported here and reused by the ranking cells further down.
import operator

# Persist the PageRank scores to pr_dict.pkl.
save_obj(pr, 'pr_dict')

# (node, score) pairs ordered from highest to lowest PageRank.
sorted_pr = sorted(pr.items(), key=lambda pair: pair[1], reverse=True)
sorted_pr


# top_ten = [i[0] for i in sorted_pr][0:10]
# top_ten


zero out-degree for node Chris Evert


zero out-degree for node John Calipari


zero out-degree for node Eden Dranger


zero out-degree for node Justine Marino


zero out-degree for node Peter Segal



[('Joe Rogan', 0.12906921701495516),
 ('Duncan Trussell', 0.047113024812504053),
 ('Ari Shaffir', 0.029058476864555),
 ('Chris Hardwick', 0.0256170418417006),
 ('Jesse Thorn', 0.020842880554074596),
 ('Joey Diaz', 0.02082083491677391),
 ('Alison Rosen', 0.0205902068620873),
 ('Bert Kreischer', 0.019755203136344886),
 ('Jordan Morris', 0.01749342492395645),
 ('Christopher Ryan', 0.016859934844532933),
 ('Pete Holmes', 0.016424263043207675),
 ('Kumail Nanjiani', 0.015384817460312476),
 ('Emily V. Gordon', 0.01157989549479349),
 ('Tom Segura', 0.01028924705867329),
 ('Christina Pazsitzky', 0.01028924705867329),
 ('Rhea Butcher', 0.009623575409774883),
 ('Cameron Esposito', 0.009623575409774883),
 ('Doug Benson', 0.00954354180611569),
 ('Tony Hinchcliffe', 0.009178837756963739),
 ('Brian Redban', 0.00892227051466748),
 ('Bill Maher', 0.007549499225533234),
 ('Aubrey Marcus', 0.007328833416949321),
 ('Jay Larson', 0.007303859317285023),
 ('Ryan Sickler', 0.007303859317285023),
 ('James Altu

In [None]:
# top_ten is not assigned by any active cell (its only assignment is commented
# out), so derive it here from the PageRank ranking.
top_ten = [name for name, _score in sorted_pr[:10]]

# For each top-ranked name, show the date of its first row as a host.
for i in top_ten:
    print(i)
    host_dates = split_hosts.loc[split_hosts['hosts'] == i, 'date']
    # A highly ranked node that never appears in the 'hosts' column has no
    # rows; print None rather than failing on .iloc[0].
    print(host_dates.iloc[0] if len(host_dates) else None)

In [96]:
# HITS on G1: nx.hits returns two dicts keyed by node, unpacked here as hub
# scores and authority scores.
hubs, authorities = nx.hits(G1)


In [97]:
# (node, hub score) pairs, highest hub score first.
sorted_hubs = sorted(hubs.items(), key=lambda pair: pair[1], reverse=True)
sorted_hubs

[('Pete Holmes', 0.0022344654226027854),
 ('Nick Thune', 0.0022258857756911154),
 ('Moshe Kasher', 0.0021738441962355567),
 ('Jen Kirkman', 0.002150458394470385),
 ('Kurt Braunohler', 0.002133312390902367),
 ('Kumail Nanjiani', 0.002123794821191265),
 ('Eddie Pepitone', 0.002121655110336465),
 ('Jonah Ray', 0.0020668243477404346),
 ('Jackie Kashian', 0.0020548547080180084),
 ('Nikki Glaser', 0.0020319035273767914),
 ('Kyle Kinane', 0.00201567975944055),
 ('Matt Braunger', 0.0019629921420285623),
 ('Ari Shaffir', 0.0018433330856542174),
 ('Bert Kreischer', 0.0018378091828213149),
 ('Todd Glass', 0.0018011241225538866),
 ('Myq Kaplan', 0.001732438199205479),
 ('Mark Normand', 0.0017302558799000005),
 ('Steve Agee', 0.0017045399250924928),
 ('Paul F. Tompkins', 0.0016475732334775498),
 ('Shane Mauss', 0.0016462154377212257),
 ('Ron Funches', 0.001599179655428034),
 ('Graham Elwood', 0.001576145090627357),
 ('Guy Branum', 0.0015565255609002145),
 ('Jake Weisman', 0.0015535893341426465),
 (

In [98]:
# (node, authority score) pairs, highest authority score first.
sorted_authorities = sorted(authorities.items(), key=lambda pair: pair[1], reverse=True)
sorted_authorities

[('Cameron Esposito', 0.05653067883436643),
 ('Rhea Butcher', 0.05653067883436643),
 ('Chris Hardwick', 0.04698244926854523),
 ('Pete Holmes', 0.03890451428851555),
 ('Jesse Thorn', 0.03549043441568066),
 ('Ryan Sickler', 0.02903030170834555),
 ('Jay Larson', 0.02903030170834555),
 ('Jordan Morris', 0.027798666296549715),
 ('Alison Rosen', 0.024942761835196975),
 ('Joe Rogan', 0.02448520163258655),
 ('Jason Sklar', 0.02333879809201262),
 ('Randy Sklar', 0.02333879809201262),
 ('Doug Benson', 0.02213417203448642),
 ('Bert Kreischer', 0.022072508415502436),
 ('Ari Shaffir', 0.019645574633169596),
 ('John Roy', 0.01964240430848081),
 ('Todd Barry', 0.015890297536901365),
 ('Dan St. Germain', 0.015868934198834894),
 ('Sean Donnelly', 0.015868934198834894),
 ('Todd Glass', 0.013475024698992255),
 ('Graham Clark', 0.011561864583397762),
 ('Dave Shumka', 0.011561864583397762),
 ('Tony Hinchcliffe', 0.010849671883958686),
 ('Brian Redban', 0.010754046163687752),
 ('Duncan Trussell', 0.01040570

In [23]:
# Undirected guest-host graph built from the same edge table, with the summed
# duration on each edge. nx.from_pandas_dataframe was removed in NetworkX 2.1;
# from_pandas_edgelist is its replacement and takes the same arguments.
G2 = nx.from_pandas_edgelist(guest_durations, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.Graph())


In [100]:
# Center of the undirected graph G2, as computed by nx.center.
nx.center(G2)

['Jackie Kashian',
 'Kumail Nanjiani',
 'Bert Kreischer',
 'Duncan Trussell',
 'Jesse Thorn',
 'Moshe Kasher',
 'Cameron Esposito',
 'Tim Ferriss',
 'Pete Holmes',
 'Alison Rosen',
 'Neal Brennan',
 'Margaret Cho',
 'Myq Kaplan']

In [101]:
# Periphery of the undirected graph G2, as computed by nx.periphery.
nx.periphery(G2)

['Ruenactments',
 'Queen',
 'Paul Finebaum',
 'Freda Payne',
 'Meditation',
 'Suroosh Alvi',
 'Rachel Platten',
 'Part 1',
 'Jefferson Bethke',
 'R.C. Maxwell',
 'Yoram Bauman',
 'Philip Levine',
 'Tom Haberstroh',
 'Mallory Edens',
 'Nail Art',
 'Julie Borowski',
 'recently departed ESPNer Tom Haberstroh',
 'Rebecca Harding',
 'George Magnus',
 'Siedah Garrett',
 'Twitch Boss',
 'Graham Brownlow',
 'Maureen',
 'the head coach',
 'Make Mistakes',
 'Mary Carillo',
 'Graham Norton',
 'Jon Levy',
 'Frances Davis',
 'Jared Dudley',
 'Positive Mind',
 'Stephanie Ruhle',
 'Peita Diamantidis',
 'Trey Kerby',
 'Mariah Balenciaga',
 'Randy Hetrick',
 'Morning Routine',
 'Tommy Sotomayor',
 'Titus Burgess',
 'Dr. Chris Donaghue',
 'Milo',
 'Ted Cruz',
 'Novak Djokovic',
 'Arthur Blank',
 'Robbie Butler',
 'Oliver Payne',
 'Jonathan Fields',
 'Jane Wurwand',
 'Helen Zaltman',
 'Charles Barkley',
 'NBA confidant Carl Lentz',
 'Mike Huckabee',
 'Leah Bell',
 'Eileen Fisher',
 'Van Hunt',
 'Chi Chi 

In [102]:
# (node, eccentricity) pairs for G2, smallest eccentricity first.
sorted(nx.eccentricity(G2).items(), key=lambda pair: pair[1])

[('Jackie Kashian', 4),
 ('Kumail Nanjiani', 4),
 ('Bert Kreischer', 4),
 ('Duncan Trussell', 4),
 ('Jesse Thorn', 4),
 ('Moshe Kasher', 4),
 ('Cameron Esposito', 4),
 ('Tim Ferriss', 4),
 ('Pete Holmes', 4),
 ('Alison Rosen', 4),
 ('Neal Brennan', 4),
 ('Margaret Cho', 4),
 ('Myq Kaplan', 4),
 ('Dwayne Kennedy', 5),
 ('Richard Bain', 5),
 ('Felipe Esparza', 5),
 ('Jeff Chang', 5),
 ('Joe Kilgallon', 5),
 ('Christina Pazsitzky', 5),
 ('Natasha Leggero', 5),
 ('Alex Grey', 5),
 ('C.J. Toledano', 5),
 ('Dan Deacon', 5),
 ('Soman Chainani', 5),
 ('Margaret Wappler', 5),
 ('Sam Tripoli', 5),
 ('June Diane Raphael', 5),
 ('Scott Aukerman', 5),
 ('Richard Bain Telethon', 5),
 ('Stu Stone', 5),
 ('Taylor Tomlinson', 5),
 ('Henry Rollins', 5),
 ('Gina Yashere', 5),
 ('Honus From Man Man/Mister Heavenly', 5),
 ('John Bush', 5),
 ('Andrew Sleighter', 5),
 ('Alton Brown', 5),
 ('Johnny Pemberton', 5),
 ('Dave Kloc', 5),
 ('Brian Babylon', 5),
 ('Mike Lebovitz', 5),
 ('Jim Festante', 5),
 ('Luka J

In [103]:
# (node, closeness centrality) pairs for G2, highest first.
sorted(nx.closeness_centrality(G2).items(), key=lambda pair: pair[1], reverse=True)

[('Joe Rogan', 0.4037690934338425),
 ('Chris Hardwick', 0.38925224708357237),
 ('Pete Holmes', 0.38911829930412173),
 ('Ari Shaffir', 0.3756320820876241),
 ('Duncan Trussell', 0.3752857880374659),
 ('Bert Kreischer', 0.3722384784198976),
 ('Alison Rosen', 0.3689190168926267),
 ('Doug Benson', 0.3646755294370588),
 ('Cameron Esposito', 0.35953508090157565),
 ('Moshe Kasher', 0.35915443252399776),
 ('Jesse Thorn', 0.35674974585480423),
 ('Marc Maron', 0.3563624903704741),
 ('Neal Brennan', 0.35111264447127827),
 ('Bill Burr', 0.35014622398073286),
 ('Todd Glass', 0.3493649158942671),
 ('Kumail Nanjiani', 0.34786026797921793),
 ('Greg Fitzsimmons', 0.34649824656974565),
 ('Rhea Butcher', 0.34642747727814277),
 ('Shane Mauss', 0.3462271211811934),
 ('Jay Larson', 0.3458976276255863),
 ('Tim Ferriss', 0.34527565733672605),
 ('Joey Diaz', 0.34443429112938706),
 ('Nick Thune', 0.34439932318104904),
 ('Bryan Callen', 0.3443061100209757),
 ('Tom Rhodes', 0.34400351541373714),
 ('Eddie Pepitone'

In [104]:
# (node, betweenness centrality) pairs for G2, highest first.
sorted(nx.betweenness_centrality(G2).items(), key=lambda pair: pair[1], reverse=True)

[('Chris Hardwick', 0.16891794688948),
 ('Joe Rogan', 0.15746553212353442),
 ('Pete Holmes', 0.07372101157718748),
 ('Russ Roberts', 0.06963694256306538),
 ('Bill Maher', 0.06868278878601376),
 ('Jesse Thorn', 0.06495719353883492),
 ('Shane Mauss', 0.05808192268718319),
 ('Ari Shaffir', 0.05300152658627546),
 ('Tina Seelig', 0.05222600117519661),
 ('Duncan Trussell', 0.049715345864707425),
 ('Christopher Ryan', 0.049061785188923714),
 ('Brian Rose', 0.04378250523842645),
 ('Doug Benson', 0.04373743355491864),
 ('Cameron Esposito', 0.04234178007421004),
 ('Tim Ferriss', 0.039132292048503006),
 ('Tom Rhodes', 0.036058736881038965),
 ('Alison Rosen', 0.03507339755499149),
 ('Bert Kreischer', 0.033974475334388465),
 ('Raghu Markus', 0.03145893806415731),
 ('Lewis Howes', 0.02985487302686659),
 ('Joey Diaz', 0.02812164640109458),
 ('Rhea Butcher', 0.02781161900705254),
 ('Aubrey Marcus', 0.02765190153397566),
 ('Dave Rubin', 0.026328957167204778),
 ('Bobby Lee', 0.025699894927552938),
 ('Ku

In [105]:
# (node, degree centrality) pairs for G2, highest first.
sorted(nx.degree_centrality(G2).items(), key=lambda pair: pair[1], reverse=True)

[('Chris Hardwick', 0.08244079787756706),
 ('Joe Rogan', 0.05089908617470767),
 ('Cameron Esposito', 0.04706691559398644),
 ('Rhea Butcher', 0.046772133241623264),
 ('Jesse Thorn', 0.04441387442271789),
 ('Bill Maher', 0.03783040188660705),
 ('Pete Holmes', 0.036258229340670134),
 ('Russ Roberts', 0.03615996855654908),
 ('Ari Shaffir', 0.03154171170285939),
 ('Shane Mauss', 0.02996953915692247),
 ('Christopher Ryan', 0.028986931315711898),
 ('Tina Seelig', 0.02849562739510661),
 ('Doug Benson', 0.027611280338017095),
 ('Jordan Morris', 0.02751301955389604),
 ('Brian Rose', 0.026235629360322293),
 ('Alison Rosen', 0.025547803871474893),
 ('Bert Kreischer', 0.02535128230323278),
 ('Jay Larson', 0.023386066620811632),
 ('Ryan Sickler', 0.022796501916085288),
 ('Josh Horowitz', 0.02240345877960106),
 ('Tom Rhodes', 0.02142085093839049),
 ('Joey Diaz', 0.0209295470177852),
 ('Dave Shumka', 0.02063476466542203),
 ('Graham Clark', 0.02053650388130097),
 ('Kurt Metzger', 0.01847302741475877),


In [106]:
# (node, eigenvector centrality) pairs for G2, highest first.
sorted(nx.eigenvector_centrality(G2).items(), key=lambda pair: pair[1], reverse=True)

[('Cameron Esposito', 0.22836049634698294),
 ('Rhea Butcher', 0.22358347863619965),
 ('Pete Holmes', 0.21883074638552671),
 ('Chris Hardwick', 0.1988964791358142),
 ('Jesse Thorn', 0.164443924548281),
 ('Jay Larson', 0.16180691691391166),
 ('Alison Rosen', 0.15019822989411152),
 ('Bert Kreischer', 0.1468441359858323),
 ('Ryan Sickler', 0.14596571122573201),
 ('Randy Sklar', 0.13863289163465473),
 ('Jason Sklar', 0.1361777179203563),
 ('Ari Shaffir', 0.13515950456422893),
 ('Jordan Morris', 0.13492536919811932),
 ('Joe Rogan', 0.13309963796042804),
 ('Doug Benson', 0.13114412634743652),
 ('John Roy', 0.11236271264745908),
 ('Todd Glass', 0.10170811368100836),
 ('Kumail Nanjiani', 0.10059597105395966),
 ('Dan St. Germain', 0.10023344002170591),
 ('Kurt Braunohler', 0.09631714471083096),
 ('Moshe Kasher', 0.09146069725763313),
 ('Sean Donnelly', 0.0911855150254345),
 ('Paul F. Tompkins', 0.08695594992943986),
 ('Shane Mauss', 0.08429806485755686),
 ('Tony Hinchcliffe', 0.0814144357409524)

In [107]:
# (node, closeness vitality) pairs for G2, highest first.
# NOTE: the recorded run of this cell ended in a KeyboardInterrupt.
sorted(nx.closeness_vitality(G2).items(), key=lambda pair: pair[1], reverse=True)

KeyboardInterrupt: 

In [None]:
node_attr = [pr, hubs, authorities]

# One row per node, one column per metric. pr/hubs/authorities were computed on
# the directed graph G1 and the remaining metrics on the undirected graph G2,
# so the dicts are not guaranteed to list nodes in the same order. Wrapping each
# dict in a Series makes pandas align the values by node name instead of by
# position (the previous `.values()` assignment was purely positional).
node_metrics = [
    ('hub', hubs),
    ('auth', authorities),
    ('eccentricity', nx.eccentricity(G2)),
    ('closeness', nx.closeness_centrality(G2)),
    ('betweenness', nx.betweenness_centrality(G2)),
    ('degree_cen', nx.degree_centrality(G2)),
    ('eigen', nx.eigenvector_centrality(G2)),
]

nodes_df = pd.DataFrame({'pr': pd.Series(pr)})
for column, scores in node_metrics:
    # Nodes missing from a metric get NaN in that column.
    nodes_df[column] = pd.Series(scores)
nodes_df

In [None]:
import itertools

# Total duration per guest for every podcast, computed once per podcast
# instead of once per podcast pair.
guest_totals = {
    name: df[df['podcast'] == name].groupby(['guests'])['duration'].sum()
    for name in podcast_info['Podcast Name']
}

# Similarity of two podcasts = sum over their shared guests of
# (guest's duration on podcast 1) * (guest's duration on podcast 2).
similarity_rows = []
for name1, name2 in itertools.combinations(podcast_info['Podcast Name'], 2):
    # Inner alignment keeps only the guests that appeared on both podcasts.
    shared1, shared2 = guest_totals[name1].align(guest_totals[name2], join='inner')
    summ = (shared1 * shared2).sum()
    print(name1, name2, summ)
    similarity_rows.append({'podcast1': name1, 'podcast2': name2, 'score': summ})

# Build the frame once from the collected records. The previous row label
# index1 + index2 was not unique per pair (e.g. 0+3 == 1+2), so later pairs
# overwrote earlier ones.
podcast_similarities = pd.DataFrame(similarity_rows, columns=['podcast1', 'podcast2', 'score'])

podcast_similarities.to_csv('podcast_similarities.csv', sep='\t')

In [None]:
import datetime as dt
import time

testdate = dt.datetime(2008, 1, 1, 0, 0, 1)

# First day of every month starting at January 2010: 101 consecutive months,
# i.e. through May 2018. The year offset uses integer division, which is exact;
# the earlier float expression (x/12 - (x%12)/12, then int()) relied on
# rounding never landing just below a whole number.
date_list = [dt.datetime(2010 + x // 12, x % 12 + 1, 1) for x in range(0, 101)]
date_list

In [None]:
# Parse the episode timestamps in one vectorised call (same strict format as
# the per-row strptime it replaces).
dates = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M:%S')

# Keep only the episodes dated before 1 January 2012.
valid_dates = dates < pd.Timestamp('2012-01-01')
df1 = df[valid_dates]

# One row per (host, episode), then total duration per (host, guest) pair.
split_hosts1 = splitDataFrameList(df1, 'hosts', ', ')
guest_durations1 = split_hosts1.groupby(['hosts', 'guests'])['duration'].sum()
guest_durations1 = guest_durations1.reset_index()

# Directed guest -> host graph and its duration-weighted PageRank.
# nx.from_pandas_dataframe was removed in NetworkX 2.1; from_pandas_edgelist
# is its replacement and takes the same arguments.
G1_1 = nx.from_pandas_edgelist(guest_durations1, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.DiGraph())
pr1 = nx.pagerank(G1_1, weight='duration')

In [None]:
# PageRank score of the 'Joe Rogan' node in the most recently computed pr1.
pr1['Joe Rogan']

In [None]:
# Parse the timestamps once; the Series shares split_hosts' index, so the
# boolean masks below line up with its rows.
dates = pd.to_datetime(split_hosts['date'], format='%Y-%m-%d %H:%M:%S')

# For each cut-off date, rebuild the guest -> host graph from the rows dated
# before it and print Joe Rogan's duration-weighted PageRank.
for date in date_list:
    df1 = split_hosts[dates < date]

    guest_durations1 = df1.groupby(['hosts', 'guests'])['duration'].sum()
    guest_durations1 = guest_durations1.reset_index()
    # nx.from_pandas_dataframe was removed in NetworkX 2.1; from_pandas_edgelist
    # is its replacement and takes the same arguments.
    G1_1 = nx.from_pandas_edgelist(guest_durations1, 'guests', 'hosts', edge_attr=['duration'], create_using=nx.DiGraph())
    pr1 = nx.pagerank(G1_1, weight='duration')
    # .get() prints None instead of raising KeyError for cut-offs at which
    # 'Joe Rogan' is not yet a node of the graph.
    print(date, ' - ', pr1.get('Joe Rogan'))

In [6]:
import ast

# Collect the hosts of every podcast, echoing each name as it is read.
all_hosts = []
for index1, row1 in podcast_info.iterrows():
    # 'Hosts' is stored as the text of a Python list; parse it into host names.
    hosts = ast.literal_eval(row1['Hosts'])
    for host in hosts:
        print(host)
    all_hosts.extend(hosts)

# df1 = every episode whose guest is not a host of any podcast.
# This is a filtered copy: the previous `df1 = df` only aliased the frame, so
# the in-place row drops also removed those rows from df itself.
# NOTE(review): if df was meant to be modified as well, re-assign it explicitly.
df1 = df[~df['guests'].isin(all_hosts)].copy()

Joe Rogan
Duncan Trussell
Bert Kreischer
Brendan Schaub
Bryan Callen
Ari Shaffir
Russell Brand
Kevin Pereira
Chris Hardwick
Sam Harris
Tony Hinchcliffe
Brian Redban
Dave Rubin
Scott Aukerman
Ethan Klein
Marc Maron
Joey Diaz
Tom Segura
Christina Pazsitzky
Dan Harmon
Stefan Molyneux
Russ Roberts
Bill Maher
Pete Holmes
Anna Faris
Dax Shepard
Greg Fitzsimmons
Sam Tripoli
Alison Rosen
Christopher Ryan
Paul Scheer
June Diane Raphael
Jason Mantzoukas
Todd Glass
Jason Sklar
Randy Sklar
Doug Benson
Doug Benson
Howard Kremer
Kulap Vilaysack
Kumail Nanjiani
Emily V. Gordon
Graham Elwood
Chris Mancini
Neal Brennan
Moshe Kasher
Julian McCullough
Chris Cubas
Thomas Thakkar
Tommy McNamara
Iliza Shlesinger
Kurt Metzger
Sherrod Small
Todd Barry
Dan St. Germain
Sean Donnelly
Conner Moore
Steven Crowder
Nick Wiger
Mike Mitchell
Bill Simmons
Big Jay Oakerson
Luis J. Gomez
Dave Smith
Ari Shaffir
Sam Tripoli
Jayson Thibault
Hannibal Buress
Tait Fletcher
Steve Rannazzisi
Jim Rome
Jason Sklar
Randy Sklar
Bill

In [78]:
import ast

# Guest edges: one undirected edge guest -- podcast, tagged attr='guest'.
df1 = pd.read_csv('../reading_and_cleaning/guest_host_cleaned_podcasts.csv', sep='\t', index_col=0)
df1['attr'] = 'guest'

# nx.from_pandas_dataframe was removed in NetworkX 2.1; from_pandas_edgelist
# is its replacement and takes the same arguments.
G3 = nx.from_pandas_edgelist(df1, 'guests', 'podcast', edge_attr=['date', 'duration', 'attr'], create_using=nx.Graph())

# Host edges: expand each podcast's 'Hosts' value (the text of a Python list)
# into one row per host. Parsing with ast.literal_eval replaces splitting the
# raw text on ', ' and stripping quotes/brackets in a fixed order, which left a
# stray '"' on double-quoted names in first or last position.
host_rows = []
for _, info in podcast_info.iterrows():
    for host_name in ast.literal_eval(info['Hosts']):
        host_row = info.to_dict()
        host_row['Hosts'] = host_name
        host_rows.append(host_row)
podcast_info_split = pd.DataFrame(host_rows, columns=podcast_info.columns)

podcast_info_split['attr'] = 'host'
G4 = nx.from_pandas_edgelist(podcast_info_split, 'Podcast Name', 'Hosts', edge_attr=['attr'], create_using=nx.Graph())

# Merge the host edges into G3. For an edge already present, add_edges_from
# updates its attributes, so its 'attr' becomes 'host'.
G3.add_edges_from(G4.edges(data=True))

In [79]:
# node1 = 'Russ Roberts'
# node2 = 'Joe Rogan'
# path = nx.shortest_path(G3, node1, node2)
# path_length = nx.shortest_path_length(G3, node1, node2)
#print(node1 + ' was a guest on ' + path[1] + ' who also had as a guest ' + node2 + '.')
#print(node1 + ' is a host of ' + path[1] + ', who had as a guest ' + path[2] + ', who was also a guest on ' + path[3] + ', who also had as a guest ' + node2 + '.')
# path

def six_degrees(node1, node2):
    """Describe, in one sentence, a shortest path between two nodes of G3.

    node1 -- start node (the sentence opens with this name).
    node2 -- end node.

    Reads the module-level graph G3, whose edges carry an 'attr' of either
    'guest' or 'host'. Each (step, node) pair of the path is printed as a
    trace, and the assembled sentence is returned as a string. Any exception
    raised by nx.shortest_path (e.g. when no path exists) propagates.
    """
    path = nx.shortest_path(G3, node1, node2)
    message = node1

    # The hop count is len(path) - 1, so a second shortest-path search just
    # to obtain the length is unnecessary.
    for step in range(len(path)):
        print(step, path[step])
        if step == 0:
            continue

        previous, current = path[step - 1], path[step]
        relation = G3[previous][current]['attr']

        if step == 1:
            # First hop out of node1.
            if relation == 'host':
                message += ' is a host of '
            else:
                message += ' was a guest on '
            message += current
        elif step % 2 == 0:
            # Even steps: the hop arrives from path[step-1] at a guest or host.
            if relation == 'guest':
                if step == 2:
                    message += ', who had as a guest ' + current
                else:
                    message += ', who also had as a guest ' + current
            if relation == 'host':
                message += ', which is hosted by ' + current
        else:
            # Odd steps after the first: previously every such hop was worded
            # as a guest appearance, even when the edge is a host edge.
            if relation == 'host':
                message += ', who is a host of ' + current
            else:
                message += ', who was a guest on ' + current

    return message
                
# Example: sentence describing a shortest G3 path between these two people.
six_degrees('President Barack Obama', 'Sam Harris')

0 President Barack Obama
1 Real Time with Bill Maher
2 Fareed Zakaria
3 Waking Up with Sam Harris
4 Sam Harris


'President Barack Obama was a guest on Real Time with Bill Maher, who had as a guest Fareed Zakaria, who was a guest on Waking Up with Sam Harris, which is hosted by Sam Harris'

In [109]:
# Example: a longer chain (the recorded output shows a seven-node path).
six_degrees("Charlamagne Tha God", 'Senator John McCain')

0 Charlamagne Tha God
1 I AM RAPAPORT: STEREO PODCAST
2 Dr. Drew
3 ID10T with Chris Hardwick
4 Adam Carolla
5 The Rubin Report
6 Senator John McCain


'Charlamagne Tha God was a guest on I AM RAPAPORT: STEREO PODCAST, who had as a guest Dr. Drew, who was a guest on ID10T with Chris Hardwick, who also had as a guest Adam Carolla, who was a guest on The Rubin Report, who also had as a guest Senator John McCain'