In [33]:
import pandas as pd
import re
import numpy as np
import pickle
import scipy
import os

# Preprocessing

In [None]:
def remove_extra(s):
    return(re.sub(r'\([^\d]*\)|\{.*\}|\'|\"', "", s).strip())

In [None]:
def get_non_duplicated(artist_dataframe):
    artist_counts = artist_dataframe[['movies','artist']].groupby('artist',as_index=False).agg(['count'])
    artist_counts.columns = [''.join(col).strip() for col in artist_counts.columns.values]
    artist_counts2 = artist_counts.reset_index()
    print('Only',artist_counts2['artist'].count(),' unique names')
    
    artist_counts2  = artist_counts2.loc[artist_counts2['moviescount'] >1]
    duplicate_artists = list(artist_counts2['artist'].values)
    #print(len(duplicate_artists),' repeated names')
    duplicated_artist_df  = artist_dataframe.loc[artist_dataframe['artist'].isin(duplicate_artists)]
    non_duplicated_artist_df  = artist_dataframe.loc[~artist_dataframe['artist'].isin(duplicate_artists)]
    print(non_duplicated_artist_df['movies'].count(),' non repeated names')
    fixed_duplicated_artist_df = duplicated_artist_df.groupby('artist').agg({'movies':'sum'})
    fixed_duplicated_artist_df['movies'] = fixed_duplicated_artist_df['movies'].apply(lambda x: list(set(x)))
    print(fixed_duplicated_artist_df['movies'].count(),' responsible for duplicates')
    non_duplicated_artist_df.append(fixed_duplicated_artist_df)
    return non_duplicated_artist_df

    

In [None]:
def get_artist_data(file_name):
    artist_list = []
    artist_movies_list = []
    i = 0
    with open(file_name,'rb') as f:
        lines = f.readlines()
        for line in lines:
            i+=1
            if(i%100000==0):
                print(i)
            line= line.decode('utf-8','ignore').encode("utf-8")
            line_split = line.decode().split('\t\t')
            artist = line_split[0]
            movies = []
            if(len(line_split)>1):
                movies = set(map(remove_extra, line_split[1:]))
                movies =[m for m in movies if(len(m))>0]
            artist_list.append(artist)
            artist_movies_list.append(list(movies))     
    print(len(artist_list)," Lines found")
    artist_dataframe = pd.DataFrame(list(zip(artist_list, artist_movies_list)),columns=['artist','movies'])
    return get_non_duplicated(artist_dataframe)
    



In [None]:
non_duplicated_actor_df = get_artist_data('data/actor_movies.txt')
non_duplicated_actress_df = get_artist_data('data/actress_movies.txt')

In [None]:
non_duplicated_artist_df = get_non_duplicated(non_duplicated_actor_df.append(non_duplicated_actress_df))
non_duplicated_artist_df['count'] = non_duplicated_artist_df['movies'].apply(lambda x: len(x))
non_duplicated_artist_df.head()

In [None]:
non_duplicated_artist_df.count()

In [None]:
final_df = non_duplicated_artist_df.loc[non_duplicated_artist_df['count'] >=10]
final_df.count()

In [None]:
import pickle
import pandas as pd
import re
import numpy as np
import pickle
import scipy
#final_df = pickle.load(open('final.pickle','rb'))
artists = final_df['artist'].values
num_artists = len(artists)#len(artists)

In [None]:
movies_list = final_df['movies'].values
import itertools
merged = list(itertools.chain(*movies_list))
movies = list(set(merged)) 
num_movies =len(movies)

In [None]:
movies_list =[set(movies) for movies in movies_list]

In [None]:
artist_id_map = {}
for i in range(len(artists)):
    artist_id_map[artists[i]] = i
    movie_id_map = {}
for i in range(len(movies)):
    movie_id_map[movies[i]] = i

In [None]:
def get_intersections(e1,e2):
    return (len(set(e1)&e2),len(e1))
vector_get_intersections =np.vectorize(get_intersections)

In [None]:
final_df.head()

In [None]:
if os.path.exists('edge_list.txt'):
    x = input('Path Exist. Want to create a new?y/n')
    if(x=='y'):
        os.remove('edge_list.txt')

In [None]:
i = 0
for i in range(num_artists-1):
    i+=1
    if(i%1024==0):
        print(i)
    x= vector_get_intersections(movies_list[i+1:],movies_list[i])
    i_len = len(movies_list[i])
    neigh = [(j,x[0][j-i-1],x[1][j-i-1]) for j in range(i+1,num_artists) if x[0][j-i-1]>0 and j!=i]
    with open('edge_list.txt','a+')as  ef:
        for n in neigh:
            ef.write("%d\t%d\t%f\n"%(i,n[0],n[1]*1.0/i_len))
            ef.write("%d\t%d\t%f\n"%(n[0],i,n[1]*1.0/n[2]))       

In [38]:
ten_artists =['Cruise, Tom','Watson, Emma (II)','Clooney, George','Hanks, Tom','Johnson, Dwayne (I)','Depp, Johnny','Smith, Will (I)','Streep, Meryl','DiCaprio, Leonardo','Pitt, Brad']
ten_artists_id = [artist_id_map[x] for x in ten_artists]

In [37]:
df = pd.read_table('edge_list.txt',header=None)
df.columns = ['v1','v2','weight']
df.head()

Unnamed: 0,v1,v2,weight
0,1,9760,0.6
1,9760,1,0.571429
2,1,17056,0.05
3,17056,1,0.01
4,1,21507,0.05


In [86]:
for artist in ten_artists_id:
    df_top = df.loc[df.v1.isin([artist])]
    print(df_top['v1'].count())
    weight = df_top.groupby('v1').agg({'weight':max}).values[0][0]
    print(weight)
    x = df_top['v2'].loc[(df_top.v1.isin([artist])) & (df_top.weight.isin([weight]))].values
    print(artists[artist])
    print([artists[a] for a in x])

1651
0.174603
Cruise, Tom
['Kidman, Nicole']
453
0.52
Watson, Emma (II)
['Radcliffe, Daniel']
1572
0.119403
Clooney, George
['Damon, Matt']
2062
0.101266
Hanks, Tom
['Allen, Tim (I)']
1354
0.20512800000000003
Johnson, Dwayne (I)
['Austin, Steve (IV)', 'Calaway, Mark', 'Levesque, Paul (I)']
2143
0.081633
Depp, Johnny
['Bonham Carter, Helena']
1317
0.122449
Smith, Will (I)
['Foster, Darrell']
1594
0.061855999999999994
Streep, Meryl
['De Niro, Robert', 'Kline, Kevin (I)']
1300
0.10204099999999999
DiCaprio, Leonardo
['Scorsese, Martin']
1738
0.098592
Pitt, Brad
['Clooney, George']
