## Import data from IMDb

In [1]:
import pandas as pd

In [2]:
# Load the tab-separated principals file; read_table uses a tab delimiter by
# default. The file name is reproduced exactly as it is spelled on disk.
pricipals_df = pd.read_table("title.pricipals.tsv")
pricipals_df

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N
...,...,...,...,...,...,...
50014871,tt9916880,4,nm10535738,actress,\N,"[""Horrid Henry""]"
50014872,tt9916880,5,nm0996406,director,principal director,\N
50014873,tt9916880,6,nm1482639,writer,\N,\N
50014874,tt9916880,7,nm2586970,writer,books,\N


In [3]:
# Load the tab-separated title basics file (read_table defaults to a tab
# delimiter). low_memory=False makes pandas infer each column's dtype from the
# whole file instead of chunk by chunk, which avoids mixed-type columns.
basics_df = pd.read_table("title.basics.tsv", low_memory=False)
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
...,...,...,...,...,...,...,...,...,...
8870617,tt9916848,tvEpisode,Episode #3.17,Episode #3.17,0,2010,\N,\N,"Action,Drama,Family"
8870618,tt9916850,tvEpisode,Episode #3.19,Episode #3.19,0,2010,\N,\N,"Action,Drama,Family"
8870619,tt9916852,tvEpisode,Episode #3.20,Episode #3.20,0,2010,\N,\N,"Action,Drama,Family"
8870620,tt9916856,short,The Wind,The Wind,0,2015,\N,27,Short


## Data cleaning

In [4]:
# Keep only the rows whose category is exactly 'actor' (rows with any other
# category, including 'actress', are excluded by this equality test), then
# discard the category, job and characters columns, which are not used later.
pricipals_df = (
    pricipals_df
    .loc[lambda credits: credits['category'].eq('actor')]
    .drop(columns=['category', 'job', 'characters'])
)
pricipals_df

Unnamed: 0,tconst,ordering,nconst
11,tt0000005,1,nm0443482
12,tt0000005,2,nm0653042
16,tt0000007,1,nm0179163
17,tt0000007,2,nm0183947
21,tt0000008,1,nm0653028
...,...,...,...
50014851,tt9916852,1,nm5519557
50014852,tt9916852,2,nm8825009
50014862,tt9916856,3,nm10538646
50014868,tt9916880,1,nm1483166


In [5]:
# Reduce the titles table to tconst/startYear by dropping every descriptive
# column, then keep only the titles whose startYear equals the string '1990'.
# The comparison is against a string, so it relies on startYear holding text.
basics_df = (
    basics_df
    .drop(columns=[
        'titleType',
        'primaryTitle',
        'originalTitle',
        'isAdult',
        'endYear',
        'runtimeMinutes',
        'genres',
    ])
    .loc[lambda titles: titles['startYear'] == '1990']
)
basics_df

Unnamed: 0,tconst,startYear
58205,tt0059325,1990
58759,tt0059900,1990
63893,tt0065188,1990
67130,tt0068494,1990
73695,tt0075259,1990
...,...,...
8868538,tt9912346,1990
8868552,tt9912376,1990
8868554,tt9912380,1990
8870305,tt9916194,1990


In [6]:
# Join the filtered principals frame (left) with the filtered basics frame
# (right) on their shared 'tconst' key. The default join is an inner join, so
# only titles present in both frames are kept.
df = pricipals_df.merge(basics_df, on='tconst')
# Write the joined table to "hypergraph_data.csv", leaving out the index column.
df.to_csv('hypergraph_data.csv', index=False)

In [7]:
# Re-import pandas and read the exported table back from disk.
# NOTE(review): presumably a restart point so the notebook can resume from this
# cell without re-running the large loads above — confirm.
import pandas as pd
df = pd.read_csv('hypergraph_data.csv')
# Display the reloaded frame (pandas truncates the rendering of long frames).
df

Unnamed: 0,tconst,ordering,nconst,startYear
0,tt0059325,2,nm0753957,1990
1,tt0059325,3,nm2535970,1990
2,tt0059325,4,nm0537007,1990
3,tt0059900,1,nm1052658,1990
4,tt0059900,2,nm0937760,1990
...,...,...,...,...
78478,tt9907046,1,nm0048913,1990
78479,tt9907046,4,nm0198948,1990
78480,tt9907046,7,nm0810834,1990
78481,tt9907046,8,nm0905935,1990


## The COO representation

In [8]:
import numpy as np
import pandas as pd
import nwhy as nwhy
import copy
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"

def _encode_labels(labels):
    """Map each distinct label to a dense integer code.

    Codes are assigned 0, 1, 2, ... in order of first appearance, so the
    first distinct label gets 0, the next new label gets 1, and so on.

    Parameters
    ----------
    labels : array-like
        Identifier values, one entry per incidence.

    Returns
    -------
    codes : numpy.ndarray
        Integer code of every entry of ``labels``, in the original order.
    mapping : dict
        Label -> code lookup covering every distinct label.

    Raises
    ------
    ValueError
        If ``labels`` contains missing values; ``pd.factorize`` marks those
        with -1, which is not a usable matrix index.
    """
    codes, uniques = pd.factorize(labels)
    if (codes < 0).any():
        raise ValueError("labels contain missing values; cannot assign indices")
    return codes, {label: code for code, label in enumerate(uniques)}


# Encode the title ids: `tconst` becomes an integer array of hyperedge indices
# and `tconst_dic` maps each original title id to its index.
tconst, tconst_dic = _encode_labels(df['tconst'])
# Encode the person ids the same way: `nconst` holds vertex indices and
# `nconst_dic` maps each original person id to its index.
nconst, nconst_dic = _encode_labels(df['nconst'])

# Every (title, person) incidence carries the same unit weight.
weight = [1] * tconst.size

# Row of sparse matrix of the hypergraph (hyperedges)
row = np.array(tconst)
# Columns of sparse matrix of the hypergraph (vertices)
col = np.array(nconst)
# Weights of sparse matrix of the hypergraph
data = np.array(weight)

## Create the hypergraph

In [9]:
# Create the hypergraph from the COO arrays built above: `row` holds the
# hyperedge indices, `col` the vertex indices and `data` the weights.
h = nwhy.NWHypergraph(row, col, data)
# Printing `h` alongside the message shows the object's repr in the output.
print('Hypergraph created successfully!', h)

Hypergraph created successfully! <nwhy.NWHypergraph object at 0x7f2ff0174e70>


## NWHypergraph class methods:

In [10]:
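# NWHypergraph class methods: example calls, all commented out.
# Uncomment a block to run it; every example stores its result in `equal_class`,
# whatever the called method returns.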

# print('-- collapsing edges without returning equal class')
# equal_class = h.collapse_edges()
# print(equal_class)

# print('-- collapsing nodes without returning equal class')
# equal_class = h.collapse_nodes()
# print(equal_class)

# print('-- collapsing nodes and edges without returning equal class')
# equal_class = h.collapse_nodes_and_edges()
# print(equal_class)

# print('-- collapsing edges with returning equal class')
# equal_class = h.collapse_edges(return_equivalence_class=True)
# print(equal_class)

# print('-- collapsing nodes with returning equal class')
# equal_class = h.collapse_nodes(return_equivalence_class=True)
# print(equal_class)

# print('-- collapsing nodes and edges with returning equal class')
# equal_class = h.collapse_nodes_and_edges(return_equivalence_class=True)
# print(equal_class)

# print('-- edge_size_dist()')
# equal_class = h.edge_size_dist()
# print(equal_class)

# print('-- node_size_dist()')
# equal_class = h.node_size_dist()
# print(equal_class)

# print('-- edge_incidence(edge)')
# equal_class = h.edge_incidence(666)
# print(equal_class)

# print('-- node_incidence(node)')
# equal_class = h.node_incidence(666)
# print(equal_class)

# print('-- degree(node, min_size=1, max_size=None)')
# equal_class = h.degree(666, min_size=1, max_size=None)
# print(equal_class)

# print('-- size(edge, min_degree=1, max_degree=None)')
# equal_class = h.size(666, min_degree=1, max_degree=None)
# print(equal_class)

# print('-- dim(edge)')
# equal_class = h.dim(666)
# print(equal_class)

# print('-- number_of_nodes()')
# equal_class = h.number_of_nodes()
# print(equal_class)

# print('-- number_of_edges()')
# equal_class = h.number_of_edges()
# print(equal_class)

# print('-- singletons()')
# equal_class = h.singletons()
# print(equal_class)

# print('-- toplexes()')
# equal_class = h.toplexes()
# print(equal_class)

# print('-- s_linegraph(s=1, edges=True)')
# equal_class = h.s_linegraph(s=1, edges=True)
# print(equal_class)

# print('-- s_linegraphs(l, edges=True)')
# equal_class = h.s_linegraphs([1,2,3,4,5,6], edges=True)
# print(equal_class)

-- number_of_nodes()
21160
