In [1]:
!pip3 install folium
!pip3 install networkx
!pip3 install haversine
!pip3 install numpy
!pip3 install pandas

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable


In [3]:
import sqlite3
import networkx as nx
import folium
from haversine import haversine
import storage
from folium.plugins import MarkerCluster
from folium.plugins import HeatMap
import matplotlib.pyplot as plt

In [31]:
departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()

## Absolute Department Heatmap

A location heatmap - measuring the quantity of departments detected in a given area

In [32]:
def create_heatmap(departments, article_dept_links):
    # Initialize a folium map centered around the average latitude and longitude
    latitudes = [dept[2] for dept in departments]
    longitudes = [dept[3] for dept in departments]
    map_center = [sum(latitudes) / len(latitudes), sum(longitudes) / len(longitudes)]
    
    folium_map = folium.Map(location=map_center, zoom_start=2)

    # Create a dictionary to count the number of connections for each department
    dept_connections = {dept[0]: 0 for dept in departments}
    for link in article_dept_links:
        dept1_id, dept2_id, _, _ = link
        dept_connections[dept1_id] += 1
        dept_connections[dept2_id] += 1

    # Create heatmap data
    heat_data = []
    for dept in departments:
        dept_id, _, lat, lon = dept
        if lat != 0.0 and lon != 0.0:  # Ensure valid coordinates
            intensity = dept_connections[dept_id]
            heat_data.append([lat, lon, intensity])

    # Add the heatmap layer to the map
    HeatMap(heat_data, max_zoom=15).add_to(folium_map)

    return folium_map


In [33]:
departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()

# Create the heatmap
heatmap = create_heatmap(departments, article_dept_links)

# Display the heatmap in a Jupyter notebook
heatmap.save("department_heatmap.html")
from IPython.display import IFrame
IFrame('department_heatmap.html', width=1000, height=600)

## Absolute Network Visualization

In [34]:
import folium
from IPython.display import display, IFrame
import storage
from folium.plugins import MarkerCluster
import random

# Function to generate a random color for each journal
def generate_random_color():
    return "#{:06x}".format(random.randint(0, 0xFFFFFF))

# Create NetworkX graph from department data and shared articles
def create_graph(departments, article_dept_links):
    G = nx.Graph()
    
    # Add departments as nodes
    for dept in departments:
        dept_id, name, lat, lon = dept
        G.add_node(dept_id, name=name, lat=lat, lon=lon)
    
    # Add edges based on shared articles
    for link in article_dept_links:
        dept1_id, dept2_id, article_id, journal_title = link
        if G.has_edge(dept1_id, dept2_id):
            G[dept1_id][dept2_id]['weight'] += 1
            G[dept1_id][dept2_id]['articles'].append((article_id, journal_title))
        else:
            G.add_edge(dept1_id, dept2_id, weight=1, articles=[(article_id, journal_title)], journal=journal_title)
    
    return G

# Create Folium map with department connections and metrics
def create_map(G):
    # Create base map with clustering
    m = folium.Map(location=[0, 0], zoom_start=2)
    marker_cluster = MarkerCluster().add_to(m)
    
    # Create a color mapping for each journal
    journal_colors = {}
    
    # Add nodes (departments) to the map
    for node in G.nodes(data=True):
        node_id = node[0]
        node_data = node[1]
        name = node_data['name']
        lat = node_data['lat']
        lon = node_data['lon']
        
        popup_text = f"Department: {name}"
        
        folium.Marker(
            location=[lat, lon],
            popup=popup_text,
            icon=folium.Icon(color='blue')
        ).add_to(marker_cluster)
    
    # Add edges (connections) to the map with colors based on journal
    for edge in G.edges(data=True):
        dept1 = G.nodes[edge[0]]
        dept2 = G.nodes[edge[1]]
        points = [(dept1['lat'], dept1['lon']), (dept2['lat'], dept2['lon'])]
        
        journal_title = edge[2]['journal']
        
        # Assign a color to each journal
        if journal_title not in journal_colors:
            journal_colors[journal_title] = generate_random_color()
        
        # Get the color for this journal
        color = journal_colors[journal_title]
        
        folium.PolyLine(points, color=color, weight=edge[2]['weight'], tooltip=journal_title).add_to(m)
    
    return m

# Main execution
departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()
G = create_graph(departments, article_dept_links)
map_ = create_map(G)

# Save the map to an HTML file
map_.save("department_network_map_colored.html")

# Display the map directly in the notebook
display(IFrame('department_network_map_colored.html', width=800, height=600))


## Absolute Department Prevalence

The absolute number of connections. 

In [65]:
# Function to rank departments based on prevalence
def rank_departments_by_prevalence(departments, article_dept_links):
    # Create a dictionary to count the number of connections for each department
    dept_connections = {dept[0]: 0 for dept in departments}
    
    # Count the number of connections for each department
    for link in article_dept_links:
        dept1_id, dept2_id, _, _ = link
        dept_connections[dept1_id] += 1
        dept_connections[dept2_id] += 1

    # Create a list of tuples (department_id, department_name, connection_count)
    ranked_departments = [(dept[0], dept[1], dept_connections[dept[0]]) for dept in departments]
    
    # Sort the list based on connection_count in descending order
    ranked_departments.sort(key=lambda x: x[2], reverse=True)
    
    return ranked_departments

# Main execution
departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()

# Rank the departments based on prevalence
ranked_departments = rank_departments_by_prevalence(departments, article_dept_links)

# Print the ranked departments
print("Ranked Departments by Prevalence:")
#for rank, (dept_id, dept_name, connection_count) in enumerate(ranked_departments, 1):
    #print(f"{rank}. {dept_name} (ID: {dept_id}) - Connections: {connection_count}")
for i in range(0, 10):
    print(str(i+1), df_ranked_departments.Department.iloc[i])
    print("----")

Ranked Departments by Prevalence:
1 Department of Microbiology and Immunology, University of Melbourne, Peter Doherty Institute for Infection and Immunity, Melbourne, VIC 3000, Australia.  / Department of Microbiology and Immunology, University of Melbourne, Peter Doherty Institute for Infection and Immunity, Melbourne, VIC 3000, Australia. / Department of Anatomy and Physiology, University of Melbourne, Parkville, VIC 3010, Australia. / Department of Microbiology and Immunology, University of Melbourne, Peter Doherty Institute for Infection and Immunity, Melbourne, VIC 3000, Australia
2 Regeneron Genetics Center, Tarrytown, NY, USA.
3 AncestryDNA, Lehi, UT, USA.
4 Department of HostMicrobe Interactions, St. Jude Children's Research Hospital, Memphis, TN 38105, USA / Department of HostMicrobe Interactions, St. Jude Children's Research Hospital, Memphis, TN 38105, USA. / Department of Infectious Diseases, St. Jude Children's Research Hospital, Memphis, TN 38105, USA. / Center for Infect

## Averaged Department Prevalence

The average number of connections per paper, per department (average collaboration)

In [64]:
import pandas as pd
from haversine import haversine
import storage

def merge_departments_by_proximity(departments, distance_threshold=1.0):
    merged_departments = []
    visited = set()
    
    for i, dept1 in enumerate(departments):
        if dept1[0] in visited:
            continue
        
        merged_group = [dept1]
        visited.add(dept1[0])
        
        for j, dept2 in enumerate(departments[i+1:], i+1):
            if dept2[0] in visited:
                continue
            
            distance = haversine((dept1[2], dept1[3]), (dept2[2], dept2[3]))
            
            if distance <= distance_threshold:
                merged_group.append(dept2)
                visited.add(dept2[0])
        
        merged_name = " / ".join(set(d[1] for d in merged_group))
        lat = merged_group[0][2]
        lon = merged_group[0][3]
        
        merged_departments.append((merged_name, lat, lon, [d[0] for d in merged_group]))
    
    return merged_departments

def create_dataframe(merged_departments, article_dept_links):
    dept_collab = {dept[0]: {'connections': 0, 'papers': set()} for dept in merged_departments}
    merged_dept_ids = {dept_id: dept[0] for dept in merged_departments for dept_id in dept[3]}
    
    # Collect unique papers and connections
    for dept1, dept2, article_id, _ in article_dept_links:
        merged_dept1 = merged_dept_ids.get(dept1)
        merged_dept2 = merged_dept_ids.get(dept2)
        
        if merged_dept1 and merged_dept2:
            if merged_dept1 != merged_dept2:
                dept_collab[merged_dept1]['connections'] += 1
                dept_collab[merged_dept2]['connections'] += 1
                
            dept_collab[merged_dept1]['papers'].add(article_id)
            dept_collab[merged_dept2]['papers'].add(article_id)
    
    # Debugging: Print collected data
    #print("Department Collaboration Data:")
    #for dept, data in dept_collab.items():
        #print(f"Department: {dept}, Connections: {data['connections']}, Papers: {len(data['papers'])}")

    # Calculate average connections per paper
    for dept in dept_collab:
        papers_count = len(dept_collab[dept]['papers'])
        if papers_count > 0:
            dept_collab[dept]['avg_connections_per_paper'] = dept_collab[dept]['connections'] / papers_count
        else:
            dept_collab[dept]['avg_connections_per_paper'] = 0
    
    # Create DataFrame
    df = pd.DataFrame([
        {'Department': dept, 'Avg Connections per Paper': data['avg_connections_per_paper']}
        for dept, data in dept_collab.items()
    ])
    
    df = df.sort_values(by='Avg Connections per Paper', ascending=False).reset_index(drop=True)
    
    return df

# Main execution
departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()

# Step 1: Merge departments by proximity
merged_departments = merge_departments_by_proximity(departments, distance_threshold=1.0)

# Step 2: Create and rank the DataFrame
df_ranked_departments = create_dataframe(merged_departments, article_dept_links)

# Display the ranked DataFrame
for i in range(0, 10):
    print(str(i+1), df_ranked_departments.Department.iloc[i])


0 Department of Microbiology and Immunology, University of Melbourne, Peter Doherty Institute for Infection and Immunity, Melbourne, VIC 3000, Australia.  / Department of Microbiology and Immunology, University of Melbourne, Peter Doherty Institute for Infection and Immunity, Melbourne, VIC 3000, Australia. / Department of Anatomy and Physiology, University of Melbourne, Parkville, VIC 3010, Australia. / Department of Microbiology and Immunology, University of Melbourne, Peter Doherty Institute for Infection and Immunity, Melbourne, VIC 3000, Australia
1 Regeneron Genetics Center, Tarrytown, NY, USA.
2 AncestryDNA, Lehi, UT, USA.
3 Department of HostMicrobe Interactions, St. Jude Children's Research Hospital, Memphis, TN 38105, USA / Department of HostMicrobe Interactions, St. Jude Children's Research Hospital, Memphis, TN 38105, USA. / Department of Infectious Diseases, St. Jude Children's Research Hospital, Memphis, TN 38105, USA. / Center for Infectious Diseases Research, St. Jude C

In [None]:
!pip3 install shapely

## Absolute Prevalence - Countries

In [56]:
import json
import folium
import storage
from shapely.geometry import shape, Point

# Load the GeoJSON file
with open('worldcountries.geojson') as f:
    countries_geojson = json.load(f)
    
def get_country_from_point(lat, lon, countries_geojson):
    point = Point(lon, lat)
    for feature in countries_geojson['features']:
        polygon = shape(feature['geometry'])
        if polygon.contains(point):
            return feature['properties']['COUNTRY']  # Use the correct property name for country
    return None
def count_articles_by_country(departments, countries_geojson):
    country_article_count = {}
    
    for dept in departments:
        dept_id, dept_name, lat, lon = dept
        country = get_country_from_point(lat, lon, countries_geojson)
        if country:
            if country not in country_article_count:
                country_article_count[country] = 0
            articles = storage.retrieve_articles_by_department(dept_id)
            country_article_count[country] += len(articles)
    
    return country_article_count
def rank_countries_by_articles(country_article_count):
    ranked_countries = sorted(country_article_count.items(), key=lambda item: item[1], reverse=True)
    return ranked_countries
def visualize_country_article_distribution(countries_geojson, country_article_count):
    m = folium.Map(location=[0, 0], zoom_start=2)
    
    # Define a color scale based on the article count
    max_articles = max(country_article_count.values())
    min_articles = min(country_article_count.values())
    color_scale = folium.LinearColormap(['green', 'yellow', 'red'], vmin=min_articles, vmax=max_articles)
    
    # Add countries to the map
    for feature in countries_geojson['features']:
        country_name = feature['properties']['COUNTRY']  # Adjust based on GeoJSON structure
        article_count = country_article_count.get(country_name, 0)
        
        folium.GeoJson(
            feature,
            style_function=lambda feature, count=article_count: {
                'fillColor': color_scale(count),
                'color': 'black',
                'weight': 0.5,
                'fillOpacity': 0.7
            },
            tooltip=f'{country_name}: {article_count} articles'
        ).add_to(m)
    
    m.add_child(color_scale)
    return m

In [54]:
# Step 1: Load departments and articles
departments = storage.retrieve_all_departments()

# Step 2: Count articles by country
country_article_count = count_articles_by_country(departments, countries_geojson)

# Step 3: Rank countries by article count
ranked_countries = rank_countries_by_articles(country_article_count)

# Print the ranking
for rank, (country, count) in enumerate(ranked_countries, 1):
    print(f"{rank}. {country}: {count} articles")

# Step 4: Visualize on a map (optional)
map_ = visualize_country_article_distribution(countries_geojson, country_article_count)

# Save the map to an HTML file
map_.save("country_article_distribution_map.html")

# Display the map in Jupyter
display(IFrame('country_article_distribution_map.html', width=800, height=600))

1. United States: 240 articles
2. Australia: 48 articles
3. Netherlands: 42 articles
4. Germany: 34 articles
5. Spain: 31 articles
6. Brazil: 25 articles
7. United Kingdom: 22 articles
8. China: 20 articles
9. South Africa: 17 articles
10. Kenya: 15 articles
11. Switzerland: 13 articles
12. Denmark: 12 articles
13. Tanzania: 10 articles
14. Italy: 8 articles
15. Canada: 8 articles
16. Austria: 7 articles
17. Japan: 6 articles
18. Madagascar: 6 articles
19. India: 5 articles
20. France: 5 articles
21. Senegal: 5 articles
22. Peru: 4 articles
23. Mozambique: 4 articles
24. Mali: 4 articles
25. Ireland: 4 articles
26. Norway: 4 articles
27. Belgium: 3 articles
28. Guatemala: 3 articles
29. Slovenia: 2 articles
30. Uganda: 2 articles
31. Thailand: 2 articles
32. Greece: 2 articles
33. Argentina: 2 articles
34. Philippines: 2 articles
35. Zimbabwe: 2 articles
36. Côte d'Ivoire: 2 articles
37. Sweden: 2 articles
38. Israel: 1 articles
39. Poland: 1 articles
40. Jamaica: 1 articles
41. Croati

## Rank Countries by Frequency of Collaboration

In [58]:


import json
import folium
from shapely.geometry import shape, Point
from collections import defaultdict
import storage

# Load the GeoJSON file
with open('worldcountries.geojson') as f:
    countries_geojson = json.load(f)

# Function to get the country from geographic coordinates
def get_country_from_point(lat, lon, countries_geojson):
    point = Point(lon, lat)
    for feature in countries_geojson['features']:
        polygon = shape(feature['geometry'])
        if polygon.contains(point):
            return feature['properties']['COUNTRY']  # Adjust property name as per your GeoJSON structure
    return None

# Function to count collaborations between countries
def count_country_collaborations(departments, article_dept_links, countries_geojson):
    department_to_country = {}
    country_collaboration_count = defaultdict(int)
    
    # Map each department to its respective country
    for dept in departments:
        dept_id, dept_name, lat, lon = dept
        country = get_country_from_point(lat, lon, countries_geojson)
        if country:
            department_to_country[dept_id] = country
    
    # Analyze collaborations by checking if two departments (from different countries) share an article
    for link in article_dept_links:
        dept1_id, dept2_id, article_id, journal_title = link
        country1 = department_to_country.get(dept1_id)
        country2 = department_to_country.get(dept2_id)
        
        # Count only international collaborations
        if country1 and country2 and country1 != country2:
            country_collaboration_count[country1] += 1
            country_collaboration_count[country2] += 1
            
    return country_collaboration_count

# Function to rank countries based on collaborations
def rank_countries_by_collaborations(country_collaboration_count):
    ranked_countries = sorted(country_collaboration_count.items(), key=lambda item: item[1], reverse=True)
    return ranked_countries

# Function to visualize the collaborations on a map
def visualize_country_collaborations(countries_geojson, country_collaboration_count):
    m = folium.Map(location=[0, 0], zoom_start=2)
    
    # Define a color scale based on the collaboration count
    max_collaborations = max(country_collaboration_count.values())
    min_collaborations = min(country_collaboration_count.values())
    color_scale = folium.LinearColormap(['green', 'yellow', 'red'], vmin=min_collaborations, vmax=max_collaborations)
    
    # Add countries to the map with collaboration counts
    for feature in countries_geojson['features']:
        country_name = feature['properties']['COUNTRY']  # Adjust property name as per your GeoJSON structure
        collaboration_count = country_collaboration_count.get(country_name, 0)
        
        folium.GeoJson(
            feature,
            style_function=lambda feature, count=collaboration_count: {
                'fillColor': color_scale(count),
                'color': 'black',
                'weight': 0.5,
                'fillOpacity': 0.7
            },
            tooltip=f'{country_name}: {collaboration_count} collaborations'
        ).add_to(m)
    
    m.add_child(color_scale)
    return m

# Main execution
departments = storage.retrieve_all_departments()
article_dept_links = storage.get_article_department_links()

# Count collaborations
country_collaboration_count = count_country_collaborations(departments, article_dept_links, countries_geojson)

# Rank countries by collaborations
ranked_countries = rank_countries_by_collaborations(country_collaboration_count)
print("Top 10 Countries by Collaborations:")
for country, count in ranked_countries[:10]:
    print(f'{country}: {count} collaborations')

# Visualize the collaborations on a map
collaboration_map = visualize_country_collaborations(countries_geojson, country_collaboration_count)

# Save or display the map
collaboration_map.save("country_collaborations_map.html")
display(IFrame('country_collaborations_map.html', width=800, height=600))

Top 10 Countries by Collaborations:
United States: 3296 collaborations
Australia: 1352 collaborations
United Kingdom: 972 collaborations
South Africa: 630 collaborations
Spain: 554 collaborations
Kenya: 530 collaborations
Switzerland: 502 collaborations
Germany: 500 collaborations
Italy: 426 collaborations
Austria: 392 collaborations
