In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx

# Building network

In [2]:
df = pd.read_csv('./data/dataset_sentiment.csv', 
                 converters={"Links": lambda x: x.strip("[]").replace("'","").split(", ")}) # from str to list
df = df[['Page', 'Category', 'Links', 'Text', 'Score']]

df.head()

Unnamed: 0,Page,Category,Links,Text,Score
0,Field trip,Education,[American Association of School Administrators...,['A field trip or excursion is a journey by a ...,0.9837
1,Education in emergencies and conflict areas,Education,"[Adult, Armed conflict, Asylum seeker, Banglad...","[""Education in emergencies and conflict areas ...",-0.9356
2,Ability grouping,Education,"[Academic achievement, Classroom, Cluster grou...",['Ability grouping is the educational practice...,0.9931
3,Classwide Peer Tutoring,Education,"[ADHD, Active learning, Cardio-pulmonary resus...",['Classwide Peer Tutoring (CWPT) is a variatio...,0.996
4,Flexible learning,Education,"[Australia, Blended learning, Distance educati...",['Flexible learning is a principle of practice...,0.7579


In [3]:
df.shape

(14928, 5)

In [4]:
# 'Links' is a list of pages
df.iloc[0,2]

['American Association of School Administrators',
 'Amusement parks',
 'Australia',
 'Bangladesh',
 'Businesses',
 'Calgary Aero-Space Museum',
 'Calgary Board of Education',
 'Calgary City Hall',
 'Calgary Science Centre',
 'Calgary Stampede',
 'Calgary Zoo',
 'Canada Olympic Park',
 'Charles Darwin',
 'Chicago',
 'Community agencies',
 'Cross Conservation Area',
 'Curriculum',
 'Doi (identifier)',
 'Education',
 'Education Next',
 'Education in Japan',
 'Europe',
 'Excursion',
 'Experiential learning',
 'Factories',
 'Field Museum of Natural History',
 'Field trip (disambiguation)',
 'Fire Training Academy',
 'Geographical',
 'Geological',
 'Glenbow Museum and Archives',
 'Government agency',
 'Grand Tour',
 'Guided tour',
 'Hospital',
 'ISSN (identifier)',
 'Inglewood Bird Sanctuary',
 'Ireland',
 'Jay P. Greene',
 'Kyoto',
 'Kyoto',
 'Leaf Group',
 'Lecture',
 'Middle school',
 'Museum education',
 'Nagasaki',
 'Nagasaki',
 'Nara',
 'Nara',
 'Natural sciences',
 'Nature centers',
 

In [5]:
pages = df['Page'].to_list()

len(pages)

14928

In [6]:
# Removing categories and links to pages not in 'Page'
links_clean = [] # list of lists (rows)

for index,row in df.iterrows():
    links_per_page = []
    for e in row['Links']: 
        if e in pages:  # if the page is in the column 'Page'
            links_per_page.append(e)  # then keep it
    links_clean.append(links_per_page)

In [7]:
links_clean

[['Curriculum',
  'Experiential learning',
  'Government agency',
  'Lecture',
  'Middle school',
  'Student',
  'Teacher',
  'Zoo'],
 ['Asylum seeker',
  'Child',
  'Children in the military',
  'Curriculum',
  'Human rights',
  'International human rights law',
  'International law',
  'Internet',
  'Learning',
  'Non-governmental organization',
  'Open educational resources',
  'Parent',
  'Peacebuilding',
  'Refugee',
  'Refugee children',
  'Right to education',
  'Secondary school',
  'Student',
  'Teacher'],
 ['Academic achievement',
  'Gifted education',
  'Lesson',
  'Student',
  'Tracking (education)'],
 ['Active learning',
  'Curriculum',
  'Peer-mediated instruction',
  'United States Department of Education',
  'What Works Clearinghouse'],
 ['Blended learning', 'M-learning', 'Networked learning'],
 ['ERIC'],
 [],
 ['Computer-supported collaborative learning',
  'Cooperative learning',
  'Educational technology',
  'Learning',
  'Learning by teaching',
  'Learning theory (e

In [8]:
df['Links'] = links_clean

In [9]:
# Creating network

col_from = [] # Link from page...
col_to = []   # ...to page.

for index, row in df.iterrows(): # iterating over rows
    for e in row['Links']:
        col_from.append(row['Page'])
        col_to.append(e)

print(len(col_from))
print(len(col_to))

165959
165959


In [10]:
# Saving dataframe

df_network = pd.DataFrame()
df_network['from'] = col_from
df_network['to'] = col_to

df_network

Unnamed: 0,from,to
0,Field trip,Curriculum
1,Field trip,Experiential learning
2,Field trip,Government agency
3,Field trip,Lecture
4,Field trip,Middle school
...,...,...
165954,The Unprocessed Child,Unschooling
165955,Virtual visitation,Child custody
165956,Virtual visitation,Parenting
165957,"Where Are We Going, Dad? (film)",Parenting


In [11]:
# Creating networkx graph
DG = nx.DiGraph()

# Adding nodes
for index,row in df.iterrows():
    # Adding node and attributes
    DG.add_node(row['Page'], category=row['Category'], score=row['Score'])

In [12]:
# Adding edges
for index, row in df_network.iterrows():
    DG.add_edge(row['from'],row['to'])

In [13]:
print("Total number of nodes: ", int(DG.number_of_nodes()))
print("Total number of edges: ", int(DG.number_of_edges()))

Total number of nodes:  14928
Total number of edges:  165882


In [14]:
density = nx.density(DG)
print("Network density:", density)

Network density: 0.0007444321205644533


In [15]:
# Error computing diagram: the graph is not strongly connected

# diameter = nx.diameter(DG)
# print("Diameter:", diameter)

# Getting largest connected component

In [16]:
# Taking largest connected component

gcc = sorted(nx.weakly_connected_components(DG), key=len, reverse=True) # giant connected component
DG = DG.subgraph(gcc[0])


In [17]:
print("Total number of nodes: ", int(DG.number_of_nodes()))
print("Total number of edges: ", int(DG.number_of_edges()))

Total number of nodes:  12762
Total number of edges:  165568


# Exporting

In [20]:
# Exporting Gephi

nx.write_gexf(DG,'./data/network_connected.gexf')

In [21]:
# Checking export

DG = nx.read_gexf('./data/network_connected.gexf')

print("Total number of nodes: ", int(DG.number_of_nodes()))
print("Total number of edges: ", int(DG.number_of_edges()))

Total number of nodes:  12762
Total number of edges:  165568
