In [None]:
import numpy as np
import pandas as pd

import igraph as ig

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 10]

# Data preparation

Let's load the data. Pick one of the `cordis` datasets; you should have some folders under `data/raw`. Go check them.

In [None]:
# Project-level table of the CORDIS Horizon export; the file is semicolon-separated.
projects = pd.read_csv(
    '../data/raw/cordis-HORIZONprojects-csv/csv/project.csv',
    sep=';',
)
projects.head()

In [None]:
# Participation table (one row per organisation/project pair is what the later
# cells rely on); same semicolon-separated layout as the project file.
organisations = pd.read_csv(
    '../data/raw/cordis-HORIZONprojects-csv/csv/organization.csv',
    sep=';',
)
organisations.head()

# Data analysis

In [None]:
organisations.columns

In [None]:
# Count the non-null projectID entries recorded for each organisationID,
# sorted so that the organisations with the most entries come first.
organisations.groupby('organisationID')['projectID'].count().sort_values(ascending=False)
# Nearly the same as organisations.organisationID.value_counts(), which counts
# rows per organisationID (the two differ only where projectID is missing).

Let's check for null values in relevant columns

In [None]:
organisations[['organisationID', 'name', 'projectID', 'projectAcronym']].isna().sum()

In [None]:
organisations[['organisationID', 'name', 'projectID', 'projectAcronym']].describe(include='all')

Why are there `NaN` values in the report above? (Hint: check the manual of `describe`.)

**Task:** Which are the top-10 organisations with the most projects?

**Task:** Which are the top-10 funded organisations? (aggregate over `ecContribution`)

**Task:** Can you do both in one shot using the `agg()` function? Check the manual.

## Building a network

First, let's find the relevant columns we can use for constructing the network

In [None]:
organisations[['organisationID', 'name', 'country', 'projectID', 'projectAcronym']].values

First, let's try using the organisation name and the project acronym

In [None]:
# Undirected graph with one edge per row, linking the organisation name to the
# project acronym found on that row.
G = ig.Graph.TupleList(
    organisations.loc[:, ['name', 'projectAcronym']].to_numpy(),
    directed=False,
)

In [None]:
is_bipartite, types = G.is_bipartite(return_types = True)
is_bipartite

Why is it not bipartite?

Maybe it is better to use the ids of both organisations and projects

In [None]:
# Same construction, now linking organisationID to projectID; each vertex
# stores its identifier in the 'id' attribute rather than the default 'name'.
G = ig.Graph.TupleList(
    organisations.loc[:, ['organisationID', 'projectID']].to_numpy(),
    directed=False,
    vertex_name_attr='id',
)

Is it ok now?

In [None]:
is_bipartite, types = G.is_bipartite(return_types = True)
is_bipartite

Ok, now let us prepare for the projection. Check node types.

In [None]:
G.vs[0]

We know from above that this is an organisation.

In [None]:
types[0]

Ok, organisations are typed as `False`. This defines our projection.

In [None]:
# Keep only the projection onto the vertices typed False (the organisations,
# according to the check on vertex 0 above); projects become the links.
G_org_projection = G.bipartite_projection(types=types, which=False)

Since ids cannot be read easily, we can add a vertex attribute with organisation names

In [None]:
# Look up a display name for every vertex of the organisation projection.
# The lookup table is reduced to ONE row per organisationID: deduplicating on
# the (organisationID, name) pair would keep several rows for an ID that is
# recorded with more than one name spelling, the left merge would then return
# more rows than there are vertices, and the names assigned to the graph
# afterwards would be shifted. `validate` makes pandas raise if the right-hand
# side is ever not unique on the key.
org_names = pd.DataFrame(G_org_projection.vs['id'], columns=['id'])
org_names = pd.merge(org_names, organisations[['organisationID', 'name']].drop_duplicates(subset='organisationID'),
         left_on='id',
         right_on='organisationID',
         how='left',
         validate='many_to_one')
org_names

In [None]:
G_org_projection.vs['name'] = org_names['name']


In [None]:
list(G_org_projection.vs)

Let's check that the organisation name assignment went correctly

In [None]:
organisations[organisations.organisationID == 999687821]

OK, all good.

Is the network connected?

In [None]:
G_org_projection.is_connected()

Analyse the connected components

In [None]:
components = G_org_projection.connected_components()

How many of them?

In [None]:
components[0]

In [None]:
components[1]

Let's check the obvious: is the node in another component also in the giant component?

Let's understand why this node is alone.

In [None]:
G_org_projection.vs[30]

Find the node in the main dataframe

Check out the project record here https://cordis.europa.eu/project/id/190173745

Now, let's check the giant component

In [None]:
H = G_org_projection.connected_components().giant()

In [None]:
H.summary()

What is the percentage of nodes involved in the giant component?

In [None]:
path_lengths = G_org_projection.path_length_hist()
print(path_lengths)

Let's focus on a single node

In [None]:
G_org_projection.neighborhood(1)

In [None]:
H = G_org_projection.induced_subgraph(G_org_projection.neighborhood(1))
H.summary()

In [None]:
# Draw the neighbourhood subgraph with the focal organisation in red.
# H was induced from the neighbourhood of vertex 1 of G_org_projection, but an
# induced subgraph numbers its vertices following their order in the parent
# graph, so the focal node is not guaranteed to sit at index 0 of H (it does
# not whenever vertex 0 of the parent is one of its neighbours). Locate it
# through its 'id' attribute, which is carried over to the subgraph.
focal_id = G_org_projection.vs[1]['id']
H.vs['color'] = 'grey'
H.vs.find(id=focal_id)['color'] = 'red'
fig, ax = plt.subplots()
ig.plot(H, target=ax, vertex_label=H.vs['name'])

What is its clustering coefficient?

Let's compute the same for the whole network

Let's calculate the degree for all the nodes

In [None]:
G_org_projection.vs['degree'] = G_org_projection.degree()

In [None]:
# Degree distribution of the organisation projection, 50 bins, drawn on
# logarithmic axes in both directions.
fig, ax = plt.subplots()
ax.hist(G_org_projection.vs['degree'], bins=50)
ax.set_yscale('log')
ax.set_xscale('log')

In [None]:
highest_degree = sorted(G_org_projection.vs, key=lambda v: v['degree'], reverse=True)

In [None]:
highest_degree[:5]

In [None]:
organisations[organisations.organisationID == 999984059].head()

**Task:** now build the network using country codes (Hint: beware of empty country codes!)

In [None]:
# Country-project graph: rows without a country code are dropped first, then
# every remaining row contributes one undirected (country, projectID) edge.
G = ig.Graph.TupleList(
    organisations.dropna(subset=['country'])[['country', 'projectID']].to_numpy(),
    directed=False,
    vertex_name_attr='id',
)

In [None]:
is_bipartite, types = G.is_bipartite(return_types = True)
is_bipartite

In [None]:
G.vs[0]

In [None]:
types[0]

In [None]:
G_country_projection = G.bipartite_projection(types=types, which=False)

In [None]:
G_country_projection.vs['degree'] = G_country_projection.degree()

In [None]:
list(G_country_projection.vs)

In [None]:
fig, ax = plt.subplots()
ig.plot(G_country_projection, target=ax, vertex_label=G_country_projection.vs['id'])

# Preparing data for VOSviewer (for the next lesson)

It is possible to export data for VOSviewer to read.
It just needs two files, a `map` and a `network` with information about nodes and links.

More details available in the documentation, https://app.vosviewer.com/docs/file-types/map-and-network-file-type

In [None]:
# --- map file: one tab-separated row per organisation (id, label), with header ---
nodes_df = pd.DataFrame(
    {attr: G_org_projection.vs[attr] for attr in G_org_projection.vs.attributes()}
)
nodes_df = nodes_df.assign(label=nodes_df['name']).sort_values(by='id')
nodes_df.loc[:, ['id', 'label']].to_csv(
    '../data/processed/map_vosviewer_orgs.txt',
    sep='\t',
    index=False,
)

# --- network file: one tab-separated row per link (source id, target id, weight), no header ---
edge_df = pd.DataFrame(
    {
        'source': [G_org_projection.vs[e.source]['id'] for e in G_org_projection.es],
        'target': [G_org_projection.vs[e.target]['id'] for e in G_org_projection.es],
        'weight': [e['weight'] for e in G_org_projection.es],
    }
)
edge_df = edge_df.sort_values(by=['source', 'target'])
edge_df.to_csv(
    '../data/processed/network_vosviewer_orgs.txt',
    sep='\t',
    index=False,
    header=False,
)

In [None]:
# Export the country projection for VOSviewer.
# Map file: one row per country; the country code doubles as id and label.
nodes_df = pd.DataFrame.from_dict({attr: G_country_projection.vs[attr] for attr in G_country_projection.vs.attributes()})
nodes_df['label'] = nodes_df['id']
nodes_df = nodes_df.sort_values('id')
nodes_df[['id', 'label']].to_csv('../data/processed/map_vosviewer_countries.txt', sep='\t', index=False)


# Network file: source, target and link weight, no header. The weight column
# is written here as well, consistently with the organisation export; without
# it every country pair would be read with the same link strength.
edge_df = pd.DataFrame([(G_country_projection.vs[e.source]['id'], G_country_projection.vs[e.target]['id'], e['weight']) for e in G_country_projection.es],
                       columns=['source', 'target', 'weight'])
edge_df = edge_df.sort_values(['source', 'target'])
edge_df.to_csv('../data/processed/network_vosviewer_countries.txt', sep='\t', index=False, header=False)

Let's also repeat everything, this time keeping only Italian organisations

In [None]:
# Repeat the whole pipeline for the rows whose country code is 'IT': build the
# organisation-project graph, project it onto organisations, attach names and
# write the VOSviewer map and network files.
# NOTE: this cell rebinds G, types, G_org_projection, org_names, nodes_df and
# edge_df; cells above that use those names will see the filtered data if they
# are re-run afterwards.
G = ig.Graph.TupleList(
      edges=organisations[organisations.country == 'IT'][['organisationID', 'projectID']].values,
      directed=False,
      vertex_name_attr='id',
      )

is_bipartite, types = G.is_bipartite(return_types = True)
# The projection is only defined on a bipartite graph; fail with a clear message otherwise.
assert is_bipartite, 'organisation-project graph is not bipartite'

# NOTE(review): assumes organisations are the vertices typed False, as was
# checked on the unfiltered graph -- confirm it also holds for this subset.
G_org_projection = G.bipartite_projection(types=types, which=False)

# One lookup row per organisationID, so the left merge returns exactly one
# name per vertex, in vertex order, even when an ID has several name spellings.
org_names = pd.DataFrame(G_org_projection.vs['id'], columns=['id'])
org_names = pd.merge(org_names, organisations[['organisationID', 'name']].drop_duplicates(subset='organisationID'),
         left_on='id',
         right_on='organisationID',
         how='left',
         validate='many_to_one')

G_org_projection.vs['name'] = org_names['name'].tolist()

# Map file: one row per organisation (id, label), with header.
nodes_df = pd.DataFrame.from_dict({attr: G_org_projection.vs[attr] for attr in G_org_projection.vs.attributes()})
nodes_df['label'] = nodes_df['name']
nodes_df = nodes_df.sort_values('id')
nodes_df[['id', 'label']].to_csv('../data/processed/map_vosviewer_orgs_filtered.txt', sep='\t', index=False)


# Network file: source id, target id and link weight, no header.
edge_df = pd.DataFrame([(G_org_projection.vs[e.source]['id'], G_org_projection.vs[e.target]['id'], e['weight']) for e in G_org_projection.es],
                       columns=['source', 'target', 'weight'])
edge_df = edge_df.sort_values(['source', 'target'])
edge_df.to_csv('../data/processed/network_vosviewer_orgs_filtered.txt', sep='\t', index=False, header=False)