In [42]:
import urllib.request
import json
import itertools
import pandas as pd
from tqdm import tqdm
import numpy as np
import ast

def get_data(link, save_as = None):
    response = urllib.request.urlopen(link)
    data = response.read().decode('utf-8')

    # print(json.dumps(json.loads(data), indent=2))

    if (save_as != None):
        with open(save_as, 'w') as f:
            f.write(json.dumps(json.loads(data), indent=2))
    return json.loads(data)

key = '8c0c786aba77a3ff00ebb47ca9fa3b8f'
temp = pd.DataFrame(columns=['actor1','actor2','movie'])

Get movie

In [2]:
link = "https://api.themoviedb.org/3/movie/315162?api_key=8c0c786aba77a3ff00ebb47ca9fa3b8f"
data = get_data(link)

Connection rule:
- Get all cast of top 10000 popular films
- For each movie:
    - `selected_cast` = 10% of casts by ascending order, based on `order` (5 at least)
    - Calculate weight of the movie, based on:
        - average vote score for movie
        - count of votes
        - profit (revenue - budget)
        - popularity
    - SAVE with each combination of `selected_cast` -> (id_cast1, id_cast2, weight)

- Calculate weight for each 2 unique id_cast i,j: `w(i,j)` equals average of all occurred films between i,j

- Save to existence matrix E
    - If no edge between node i and j: `E[i,j] = 0`
    - Otherwise, `E[i,j] = 1`

- Save to weight matrix W: `W[i,j] = w(i,j)`

In [43]:

# id = 315162

film_ids = []
for page in tqdm(range(1,11)):
    for year in range(2022,2023):
        for order in ('vote_average.desc', 'vote_count.desc', 'popularity.desc'):
            films = get_data(f"https://api.themoviedb.org/3/discover/movie?page={page}&sort_by={order}&year={year}&api_key={key}")
            film_ids.extend([f['id'] for f in films['results']])
film_ids = list(set(film_ids))

print(len(film_ids))



100%|██████████| 10/10 [00:14<00:00,  1.43s/it]

581





In [None]:
with open("film_ids.npy", "rb") as f:
    film_ids = np.load(f)

In [49]:
from hdfs import InsecureClient
client = InsecureClient('http://master-node:9870', user='hdfs')


In [48]:

for id in tqdm(film_ids[0:11]):
    credit_link = f"https://api.themoviedb.org/3/movie/{id}/credits?api_key={key}"
    try:
        credit = get_data(credit_link)
    except:
        print(f"Error of getting movie id {id}.")
        continue

    actors = [a for a in credit['cast'] if a['known_for_department'] == "Acting"]
    to_num = min(len(actors), max(5, round(len(actors)*0.2)))
    main_actors = actors[:to_num]
    # selected_cast = data['cast']

    try:
        film = get_data(f"https://api.themoviedb.org/3/movie/{id}?api_key={key}")
    except:
        print(f"Error of getting movie id {id}.")
        continue
    # weight = film['revenue'] - film['budget']

    for (a,b) in itertools.combinations(main_actors, 2):
        temp.loc[len(temp)] = sorted([a['id'], b['id']]) + [id]

    with client.write('/user/hadoop/movie.csv', encoding = 'utf-8') as writer:
        for i, r in temp.iterrows():
            test = ','.join([str(r[0]), str(r[1]), json.dumps(r[2])])
            current = writer.write(test)
        # print(client)


  0%|          | 0/11 [00:00<?, ?it/s]


TypeError: Object of type int64 is not JSON serializable

In [5]:
temp = pd.read_csv('temp1.csv')
temp['movie'] = temp['movie'].apply(lambda x: ast.literal_eval(x))

In [None]:
pd.concat(temp[temp.actor1 == 130], temp[temp.actor1 == 130])

TypeError: first argument must be an iterable of pandas objects, you passed an object of type "DataFrame"

In [13]:
# get all involved actor id
all_actor = pd.concat([temp['actor1'], temp['actor2']]).unique()

# get actor details, re-index
actors = pd.DataFrame()
for actor_id in tqdm(sorted(all_actor)):
    try:
        actor = get_data(f"https://api.themoviedb.org/3/person/{actor_id}?api_key={key}")
    except:
        print(f"Error of getting actor id {actor_id}.")
        continue

    actors = pd.concat([actors, pd.Series(actor).to_frame().T], ignore_index=True)
# save results
actors.to_csv('actors.csv', index=False)


 47%|████▋     | 46/98 [00:06<00:05,  8.69it/s]

Error of getting actor id 37822.


100%|██████████| 98/98 [00:13<00:00,  7.25it/s]


In [7]:
actors

Unnamed: 0,adult,also_known_as,biography,birthday,deathday,gender,homepage,id,imdb_id,known_for_department,name,place_of_birth,popularity,profile_path
0,False,"[Mark Hamil, Mark Richard Hamill, Марк Хэмилл,...","Mark Richard Hamill (born September 25, 1951) ...",1951-09-25,,2,,2,nm0000434,Acting,Mark Hamill,"Concord, California, USA",22.127,/9Wws35pCsT0KoZpiV4Gk5nbn9ZD.jpg
1,False,"[Гаррісон Форд, Харрисон Форд, هاريسون فورد, 해...",Legendary Hollywood Icon Harrison Ford was bor...,1942-07-13,,2,,3,nm0000148,Acting,Harrison Ford,"Chicago, Illinois, USA",28.762,/5M7oN3sznp99hWYQ9sX0xheswWX.jpg
2,False,"[Carrie Frances Fisher , Кэрри Фишер, Кэрри Фр...",Carrie Frances Fisher (21 October 1956 - 27 De...,1956-10-21,2016-12-27,1,https://carriefisher.com/,4,nm0000402,Acting,Carrie Fisher,"Beverly Hills, Los Angeles, California, USA",14.942,/rfJtncHewKVnHjqpIZvjn24ESeC.jpg
3,False,[Peter Wilton Cushing],"Peter Wilton Cushing, OBE (26 May 1913 – 11 A...",1913-05-26,1994-08-11,2,,5,nm0001088,Acting,Peter Cushing,"Kenley, Surrey, England, UK",12.669,/if5g03wn6uvHx7F6FxXHLebKc0q.jpg
4,False,"[Anthony Kingsley Daniels, Энтони Дэниелс ]",Anthony Daniels (/ˈæntəni/ AN-tə-nee; born 21 ...,1946-02-21,,2,http://www.anthonydaniels.com/,6,nm0000355,Acting,Anthony Daniels,"Salisbury, Wiltshire, England, UK",4.522,/7kR4kwXtvXtvrsxWeX3QLX5NS5V.jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
145888,False,[],,,,0,,3895828,,Acting,Saketh Sangaraju,,0.6,
145889,False,[],,,,0,,3897024,,Acting,Taylor Storm,,0.6,
145890,False,[],,,,0,,3897036,,Acting,Drew Morris,,0.6,
145891,False,[],,,,0,,3897075,nm9962597,Acting,Kyle DiMaggio,,0.6,/bht1C40XqvRRJLsNper6ePr9NEH.jpg


In [8]:
# merge weight (average for each actor1-actor2-movie) and save to matrices
temp['weight'] = temp['movie'].apply(lambda x: films[films['Id'] == x]['popularity'])

weight_temp = temp[['actor1','actor2','weight']].groupby(['actor1','actor2']).mean().reset_index()
print(weight_temp)
weight_temp.to_csv('popularity_weight.csv', index=False)

# m = np.zeros(shape=(len(actors), len(actors)))
# for i, t in tqdm(weight_temp.iterrows()):
#     index1 = actors[actors.id == t['actor1']].index.to_flat_index()[0]
#     index2 = actors[actors.id == t['actor2']].index.to_flat_index()[0]
#     m[index1, index2] = t['weight']
# print(m.shape)

# # save results
# with open('popularity_weight.npy', 'wb') as f:
#     np.save(f, m)

      actor1   actor2  weight
0         31       32  67.775
1         31       33  67.775
2         31       34  67.775
3         31       35  67.775
4         31     6751  67.775
..       ...      ...     ...
751  3817649  3817687  67.775
752  3817649  3817690  67.775
753  3817652  3817687  67.775
754  3817652  3817690  67.775
755  3817687  3817690  67.775

[756 rows x 3 columns]


In [41]:
import pandas as pd
weight_temp = pd.read_csv("/home/calibretaliation/LinkedIn-Analysis/popularity_weight.csv")
weight_temp['key'] = weight_temp['actor1'].astype(str) +","+ weight_temp['actor2'].astype(str)
weight_temp.drop(["actor1", 'actor2'],axis = 1, inplace = True)
weight_dict = dict(zip(weight_temp.key, weight_temp.weight))
print(weight_temp.head(5))
new_weight = pd.DataFrame(weight_dict.items(),columns = ['key', 'weight'])
new_weight[['actor1','actor2']] = new_weight['key'].str.split(",", expand = True)
new_weight.drop("key", axis = 1, inplace = True)
cols = ["actor1", 'actor2', 'weight']
new_weight = new_weight[cols]
print(new_weight.head(5))

   actor1  actor2  weight
0      31      32  67.775
1      31      33  67.775
2      31      34  67.775
3      31      35  67.775
4      31    6751  67.775
  actor1 actor2  weight
0     31     32  67.775
1     31     33  67.775
2     31     34  67.775
3     31     35  67.775
4     31   6751  67.775


In [3]:
import socket
from threading import *
import time
import pandas as pd
from hdfs import InsecureClient
client = InsecureClient('http://master-node:9870', user='hdfs')
    
with client.read("/user/hadoop/weight.csv", encoding = 'utf-8') as reader:
    weight_temp = pd.read_csv(reader)

host = "192.168.2.20"
port = 4321
print(host,"://",port)

s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

s.bind((host, port))
s.listen(1)
print('WAITING FOR CLIENT')

try:
    while True:
        conn, addr = s.accept()
        print('CONNECTED:', addr)
        try:
            for i, line in weight_temp.iterrows():
                data = f"{line[0]},{line[1]},{line[2]}" +"\n"
                conn.send(bytes(data, 'utf-8'))
                time.sleep(1)
                print(data)
            conn.close()
            print("CLOSED")
        except socket.error: pass
finally:
    s.close()


192.168.2.20 :// 4321
WAITING FOR CLIENT
CONNECTED: ('192.168.2.31', 34336)
0.0,31.0,32.0

1.0,31.0,33.0

2.0,31.0,34.0

3.0,31.0,35.0

4.0,31.0,6751.0

5.0,31.0,9640.0

6.0,31.0,36221.0

7.0,31.0,37821.0

8.0,31.0,37822.0

9.0,31.0,196855.0

10.0,31.0,204997.0

11.0,31.0,946856.0

12.0,31.0,951881.0

13.0,31.0,1019727.0

14.0,31.0,1215432.0

15.0,31.0,1808925.0

16.0,31.0,2020776.0

17.0,31.0,2140105.0

18.0,31.0,2392748.0

19.0,31.0,2939086.0

20.0,31.0,3052237.0

21.0,31.0,3817610.0

22.0,31.0,3817629.0

23.0,31.0,3817635.0

24.0,31.0,3817637.0

25.0,31.0,3817649.0

26.0,31.0,3817652.0

27.0,31.0,3817687.0

28.0,31.0,3817690.0

29.0,32.0,33.0

30.0,32.0,34.0

31.0,32.0,35.0

32.0,32.0,6751.0

33.0,32.0,9640.0

34.0,32.0,36221.0

35.0,32.0,37821.0

36.0,32.0,37822.0

37.0,32.0,196855.0

38.0,32.0,204997.0

39.0,32.0,946856.0

40.0,32.0,951881.0

41.0,32.0,1019727.0

42.0,32.0,1215432.0

43.0,32.0,1808925.0

44.0,32.0,2020776.0

45.0,32.0,2140105.0

46.0,32.0,2392748.0

47.0,32.0,2939

KeyboardInterrupt: 

In [2]:
import pygrank as pg

conductance = pg.Conductance().evaluate([1,2,3]) 


TypeError: object of type 'NoneType' has no len()

In [10]:
from pygrank import Graph

graph = Graph()
graph.add_edge("A", "B")
graph.add_edge("B", "C")
graph.add_edge("C", "D")
graph.add_edge("D", "E")
graph.add_edge("A", "C")
graph.add_edge("C", "E")
graph.add_edge("B", "E")

ranks = {"A":3, "B":2, "C":4, "D":4, "E":4}

signal = pg.to_signal(graph, ranks)

conductance = pg.Conductance(autofix=True).evaluate(signal) 
print(conductance)

7.0
