### Network Visualization Data Preparation

In [2]:
import pandas as pd
import numpy as np
import networkx as nx
%matplotlib inline
import matplotlib.pyplot as plt

##### Import the *main* dataset - Harry Potter Screen Time statistics from IMDB

In [2]:
# Raw screen-time table; the preview below shows Actor, Movie and
# Screen Time columns, one row per (character, movie) pair.
main = pd.read_csv("youreawizardharry.csv")
main.head(n=5)

Unnamed: 0,Actor,Movie,Screen Time
0,Harry Potter,The Sorcerer's Stone,72:45
1,Ron Weasley,The Sorcerer's Stone,28:15
2,Hermione Granger,The Sorcerer's Stone,23:15
3,Rubeus Hagrid,The Sorcerer's Stone,16:30
4,Professor Albus Dumbledore,The Sorcerer's Stone,9:45


##### Create the Nodes file

In [3]:
# Distinct values of each column, in order of first appearance.
movies = main["Movie"].unique()
actors = main["Actor"].unique()

# Movies come first, then actors: the position in this array decides the
# numeric identifier every label receives below.
label = np.concatenate((movies, actors))

# 1-based identifiers, one per label; ID and Nodes carry the same value.
node_ids = range(1, len(label) + 1)

# Build the node table in one expression and fix the column order.
nodes = (
    pd.DataFrame(label, columns=["Label"])
    .assign(ID=node_ids, Nodes=node_ids)
    [["ID", "Nodes", "Label"]]
)

nodes

Unnamed: 0,ID,Nodes,Label
0,1,1,The Sorcerer's Stone
1,2,2,The Chamber of Secrets
2,3,3,The Prisoner of Azkaban
3,4,4,The Goblet of Fire
4,5,5,The Order of the Phoenix
...,...,...,...
131,132,132,Gellert Grindelwald
132,133,133,Aberforth Dumbledore
133,134,134,Bogrod
134,135,135,Helena Ravenclaw


##### Export the Nodes file as a CSV

In [4]:
# Write the node table to disk without the pandas row index.
# (Comment this line out to skip the export.)
nodes.to_csv("nodes.csv", index=False)

##### Create the Edges file

In [5]:
# Lookup from node label to its numeric identifier, built once and shared by
# both columns (movies and actors live in the same nodes table).
label_to_node = nodes.set_index("Label")["Nodes"].to_dict()

# Source is the movie's node id, Target is the actor's node id.
main["Source"] = main["Movie"].map(label_to_node)
main["Target"] = main["Actor"].map(label_to_node)

main

Unnamed: 0,Actor,Movie,Screen Time,Source,Target
0,Harry Potter,The Sorcerer's Stone,72:45,1,9
1,Ron Weasley,The Sorcerer's Stone,28:15,1,10
2,Hermione Granger,The Sorcerer's Stone,23:15,1,11
3,Rubeus Hagrid,The Sorcerer's Stone,16:30,1,12
4,Professor Albus Dumbledore,The Sorcerer's Stone,9:45,1,13
...,...,...,...,...,...
396,Percy Weasley,The Deathly Hallows: Part 2,:30,8,28
397,Professor Sybil Trelawney,The Deathly Hallows: Part 2,:15,8,69
398,Madame Poppy Pomfrey,The Deathly Hallows: Part 2,:15,8,62
399,Peter Pettigrew,The Deathly Hallows: Part 2,:15,8,71


In [6]:
# Edge list: the Source/Target pairs from main, plus two constant columns,
# Type ("Directed" for every edge) and Weight (1 for every edge).
edges = main[["Source", "Target"]].assign(Type="Directed", Weight=1)
edges

Unnamed: 0,Source,Target,Type,Weight
0,1,9,Directed,1
1,1,10,Directed,1
2,1,11,Directed,1
3,1,12,Directed,1
4,1,13,Directed,1
...,...,...,...,...
396,8,28,Directed,1
397,8,69,Directed,1
398,8,62,Directed,1
399,8,71,Directed,1


##### Export the Edges file as a CSV

In [7]:
# Write the edge list to disk without the pandas row index.
# (Comment this line out to skip the export.)
edges.to_csv("edges.csv", index=False)

##### Export the Main file as a CSV

In [8]:
# Write the current state of main to disk without the pandas row index.
# (Comment this line out to skip the export.)
main.to_csv("main.csv", index=False)

#### Bring the data into Gephi (manual step performed outside this notebook)

#### Download the Network Data

In [9]:
# network.xlsx is not written by any cell in this notebook, so it has to be
# supplied before this cell runs.
# NOTE(review): presumably it is the export of the Gephi step above - confirm
# the exact export procedure and file name.
try:
    network = pd.read_excel("network.xlsx")
except FileNotFoundError as err:
    # Same exception type, errno and filename as before, but with a message
    # that tells the reader what is missing instead of a bare traceback.
    raise FileNotFoundError(
        err.errno,
        "network.xlsx is missing from the working directory; "
        "create it before running this cell",
        err.filename,
    ) from err

# select only the necessary columns
network = network[["label", "value", "x", "y"]]
network

FileNotFoundError: [Errno 2] No such file or directory: 'network.xlsx'

#### Create the Source and Target dataframes
#### Merge the Source and Target dataframes as the new Main

In [15]:
# Only the coordinates and the join key are needed from the network table.
# The merges below match `value` against the Source/Target node ids.
# NOTE(review): this assumes `value` is unique per node; duplicated values
# would multiply rows in the left merges - confirm against the export.
coords = network[["x", "y", "value"]]

# One copy of main carrying the x/y of each row's Target node ...
target = main.merge(
    coords, how="left", left_on="Target", right_on="value"
).drop(columns=["value"])

# ... and one copy carrying the x/y of each row's Source node.
source = main.merge(
    coords, how="left", left_on="Source", right_on="value"
).drop(columns=["value"])

# Stack the two copies (target rows first). DataFrame.append was removed in
# pandas 2.0; pd.concat is the supported equivalent and, like append, keeps
# each frame's original row labels.
# NOTE: this rebinds `main`, so the cell is not idempotent - re-run the cells
# that build `main` before running this one a second time.
main = pd.concat([target, source])
main

#### Save Main as a CSV

In [None]:
# uncomment the line below
# main.to_csv("main.csv", index = False)