In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
import datetime


# Awards Dataset

Cleaning awards.csv

In [2]:
# Load the raw awards table.
# NOTE(review): the recorded output of this cell ends with the tail of a pandas
# warning - presumably a mixed-type DtypeWarning from read_csv; confirm and pin
# `dtype=` for the affected columns if so.
awards = pd.read_csv('./data/awards.csv')

# Drop the rows that miss any of the fields used by the filters and labels below.
awards = awards.dropna(subset=['eventName', 'awardName', 'isPrimary', 'isWinner', 'isPerson', 'isTitle', 'isCompany'])

# Fill the category name with the award name if the category name is NA.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `awards['categoryName']` acts on an intermediate object and is not guaranteed
# to update `awards` (it is a no-op under pandas Copy-on-Write).
awards['categoryName'] = awards['categoryName'].fillna(awards['awardName'])

# Keep the primary entries that are not companies.
awards = awards[awards.isPrimary & ~(awards.isCompany)]

# Select the events of interest for the network.
prestige_events = ['BAFTA Awards','Academy Awards','Cannes Film Festival', 'Sundance Film Festival', 'Venice Film Festival', 'Berlin International Film Festival', 'Locarno International Film Festival', 'Satellite Awards']

# Remove the extra information given with the name of the event / category:
# only the text before the first comma is kept.
awards['eventName'] = awards.eventName.apply(lambda x : x.split(',')[0])
awards['categoryName'] = awards.categoryName.apply(lambda x : x.split(',')[0])

# As a result some event names are duplicated, so we attribute the same ID to
# these duplicated names (one pass over the eventId column only).
awards = awards.replace({'eventId': {
    'ev0000125': 'ev0000123',
    'ev0000124': 'ev0000123',
    'ev0001588': 'ev0000291',
}})

  interactivity=interactivity, compiler=compiler, result=result)


# IMDB Movies

In [3]:
# Load the IMDb movies and keep the rows that have both titles and a country.
movies = pd.read_csv('./data/IMDb_movies.csv')
movies = movies.dropna(subset=['title', 'original_title', 'country'])

# Turn the comma-separated genre string into a list of genres.
movies['genre'] = [genres.split(', ') for genres in movies['genre']]

# Preview only: the exploded frame (one row per genre) is displayed but not
# assigned, so `movies['genre']` keeps holding lists.
movies.explode('genre')

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,Biography,70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,Crime,70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,Drama,70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,Drama,100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81269,tt9905412,Ottam,Ottam,2019,2019-03-08,Drama,120,India,Malayalam,Zam,...,"Nandu Anand, Roshan Ullas, Manikandan R. Achar...","Set in Trivandrum, the story of Ottam unfolds ...",7.8,510,INR 4000000,,$ 4791,,,
81270,tt9905462,Pengalila,Pengalila,2019,2019-03-08,Drama,111,India,Malayalam,T.V. Chandran,...,"Lal, Akshara Kishor, Iniya, Narain, Renji Pani...",An unusual bond between a sixty year old Dalit...,8.4,604,INR 10000000,,,,,
81271,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",,8.4,369,,,,,,
81272,tt9914286,Sokagin Çocuklari,Sokagin Çocuklari,2019,2019-03-15,Drama,98,Turkey,Turkish,Ahmet Faik Akinci,...,"Ahmet Faik Akinci, Belma Mamati, Metin Keçeci,...",,7.2,190,,,$ 2833,,,


A movie can be linked to several countries, so we split the country field and keep one row per (movie, country) pair.

In [4]:
# Split the comma-separated country string and emit one row per country.
movies = (
    movies
    .assign(country=[countries.split(', ') for countries in movies['country']])
    .explode('country')
)

Some countries no longer exist or are listed under an abbreviated name, so we rename them to match present-day country names.

In [5]:
def country_clean(name):
    """Return the present-day / full name for a handful of country labels.

    'UK', 'Soviet Union' and 'West Germany' are renamed; any other value
    (including 'USA') is returned unchanged.
    """
    renames = (
        ('UK', 'United Kingdom'),
        ('Soviet Union', 'Russia'),
        ('West Germany', 'Germany'),
    )
    for old_name, new_name in renames:
        if name == old_name:
            return new_name
    return name
# Normalise every country label with country_clean.
movies['country'] = movies['country'].map(country_clean)

# Creating dataset for Network

From the awards dataset we will create the json file for the bipartite graph

In [6]:
# Keep the award rows of the selected events, with only the columns needed to
# link an event to a title.
network_df = awards.loc[
    awards['eventName'].isin(prestige_events),
    ['eventId', 'eventName', 'year', 'name', 'const'],
]

# Check which events are actually present after filtering.
network_df['eventName'].unique()

array(['Academy Awards', 'Sundance Film Festival',
       'Locarno International Film Festival',
       'Berlin International Film Festival', 'BAFTA Awards',
       'Cannes Film Festival', 'Venice Film Festival', 'Satellite Awards'],
      dtype=object)

In [7]:
# Attach the movie information to every award row (inner join on the title id).
# The movie columns are picked by position (0-3 and 7); per the movies table
# displayed earlier these are imdb_title_id, title, original_title, year and
# country - verify if the CSV layout changes.
# Columns present on both sides get the '_award' / '_production' suffixes.
network_df = pd.merge(
    network_df,
    movies.iloc[:, [0, 1, 2, 3, 7]],
    left_on='const',
    right_on='imdb_title_id',
    suffixes=('_award', '_production'),
)

For each award year, we keep only the 20 most represented countries.

In [8]:
# Keep only the columns needed to link an event to a country in a given award year.
network = network_df[['eventId', 'eventName', 'country', 'year_award']]

# Keep, for every award year, the 20 countries with the most rows.
# The counts are sorted once with a stable sort and `groupby(...).head(20)`
# takes the first 20 rows of each year. This avoids `groupby.apply`, which
# depended on the grouping column being handed back to the applied function,
# and makes ties deterministic: tied countries keep the key order produced by
# the groupby on ['year_award', 'country'].
network = network.merge(
    network
    .groupby(['year_award', 'country'])
    .size()
    .reset_index(name='counter')
    .sort_values('counter', ascending=False, kind='mergesort')
    .groupby('year_award')
    .head(20)
    [['year_award', 'country']],
    on=['year_award', 'country'],
).drop_duplicates()

# Sanity check: number of distinct countries kept for 2019.
network.loc[network['year_award'] == 2019, 'country'].nunique()

20

Creating the nodes

In [9]:
# Give every country an integer id (numbered in order of first appearance).
network['cid'] = pd.factorize(network['country'])[0]

# Event nodes: one per distinct (eventId, eventName) pair.
nodes_1 = (
    network[['eventId', 'eventName']]
    .drop_duplicates()
    .rename(columns={'eventId': 'id', 'eventName': 'name'})
)
print(nodes_1.head(20))
nodes_1 = nodes_1.assign(group='event', radius="2")

# Country nodes: one per distinct (cid, country) pair.
nodes_2 = (
    network[['cid', 'country']]
    .drop_duplicates()
    .rename(columns={'cid': 'id', 'country': 'name'})
    .assign(group='country', radius="1")
)

# All nodes of the bipartite graph, events first.
nodes = pd.concat([nodes_1, nodes_2])

            id                                 name
0    ev0000003                       Academy Awards
81   ev0000147                 Cannes Film Festival
83   ev0000681                 Venice Film Festival
173  ev0000296                     Satellite Awards
179  ev0000400  Locarno International Film Festival
183  ev0000631               Sundance Film Festival
254  ev0000123                         BAFTA Awards
331  ev0000091   Berlin International Film Festival


Creating the edges

In [10]:
# One edge per remaining row: event id -> country id, tagged with the award year.
# The index is renumbered from 0 and the old one is discarded.
links = (
    network[['eventId', 'cid', 'year_award']]
    .rename(columns={'eventId': 'source', 'cid': 'target'})
    .reset_index(drop=True)
)

Combining the nodes and the links and saving them in a single JSON file

In [11]:
import json

# Graph payload: a list of node records and a list of link records.
final_dict = {
    section: frame.to_dict(orient='records')
    for section, frame in (('nodes', nodes), ('links', links))
}

# Write the payload to disk as a single JSON document.
with open('network_data.json', 'w') as fp:
    json.dump(final_dict, fp)

# Creating dataset for Sequence Sunbursts

Using the same events as for the network, we will create the dataset for the sequence sunbursts

In [12]:
# Award rows of the selected events. The explicit copy makes `tops` independent
# of `awards`, so the column rewrites done on `tops` further down no longer
# trigger pandas' SettingWithCopyWarning.
tops = awards[awards.eventName.isin(prestige_events)].copy()

# Number of rows per award name.
first = tops.awardName.value_counts()
def rename_winner(x):
    """Label a winner flag: "Winners" when `x` is truthy, "Nominees" otherwise."""
    return "Winners" if x else "Nominees"
def rename_person(x):
    """Label a person flag: "People" when `x` is truthy, "Films" otherwise."""
    return "People" if x else "Films"

# Replace the boolean flags by the labels shown in the sunburst.
tops['isWinner'] = tops['isWinner'].map(rename_winner)
tops['isPerson'] = tops['isPerson'].map(rename_person)

# Tidy the award and category names: ' - ' becomes a single space in award
# names; in category names every '-' becomes a space and every run of two
# spaces is then removed.
tops['awardName'] = tops['awardName'].map(lambda award: award.replace(' - ', ' '))
tops['categoryName'] = tops['categoryName'].map(
    lambda category: category.replace('-', ' ').replace('  ', '')
)

# Number of rows for every event / year / award / outcome / kind / category combination.
second = tops.groupby(
    ['eventName', 'year', 'awardName', 'isWinner', 'isPerson', 'categoryName']
).size()
second

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


eventName             year  awardName               isWinner  isPerson  categoryName          
Academy Awards        1929  Honorary Award          Winners   Films     Honorary Award            2
                            Oscar                   Nominees  Films     Best Art Direction        2
                                                                        Best Cinematography       3
                                                                        Best Picture              4
                                                                        Best Writing              4
                                                                                                 ..
Venice Film Festival  2019  Venice Horizons Award   Winners   People    Best Actress              1
                                                                        Best Director             1
                            Verona Film Club Award  Winners   Films     Verona Film Club Award    1
     

In [14]:
def values_cleaning(line):
    """Normalise one grouping key before it is turned into a sunburst path.

    Parameters
    ----------
    line : sequence
        One key of the grouped counts. The element at index 2 is compared with
        the last element (awardName and categoryName for the grouping used in
        this notebook).

    Returns
    -------
    tuple
        The elements of `line`, without the last one when it equals the element
        at index 2, and with every ',' in string elements replaced by a space.
        Non-string elements (Python or NumPy integers, floats, ...) are
        returned unchanged.
    """
    # Drop the trailing element when it only repeats the element at index 2.
    kept = line[:-1] if line[2] == line[-1] else line[:]
    # Only strings can contain commas; testing for `str` (rather than "not an
    # int") keeps NumPy integers and other scalars from hitting `.replace`.
    return tuple(
        item.replace(',', ' ') if isinstance(item, str) else item
        for item in kept
    )

In [15]:
# One output row per grouped key: [path, count, event name, year], all as text.
# The path joins the cleaned key elements from index 2 onwards with '-'.
# Rows are built as plain Python lists of strings: the cleaned keys do not all
# have the same length, and wrapping such ragged sequences in np.array raises a
# ValueError on recent NumPy versions.
final = np.array(
    [
        [
            '-'.join(map(str, values_cleaning(key)[2:])),
            str(count),
            str(key[0]),
            str(key[1]),
        ]
        for key, count in zip(second.keys().values, second.values)
    ],
    dtype=str,
).reshape(-1, 4)  # keeps a (0, 4) shape when there is no key at all

In [16]:
# Write the sunburst rows as comma-separated text, one row of `final` per line.
np.savetxt(
    fname='sunburst_data.csv',
    X=final,
    fmt='%s',
    delimiter=',',
    encoding='utf-8',
)