In [1]:
import pandas as pd
import itertools
import re
from collections import defaultdict
import networkx as nx

df = pd.read_csv(r"D:\MS\Indiana University Bloomington\Classes\ENGR-E 583 Information Visualization (Dr. Katy Börner & Michael Ginda)\Client Project\Week 5\final_preprocessed_dataset.csv")

df['Venue_Year'] = df['Venue'] + "_" + df['Year Started'].astype(str)

df['Start_Date'] = pd.to_datetime(df['Start Date']).dt.strftime('%Y-%m-%d')
df['End_Date'] = pd.to_datetime(df['End Date']).dt.strftime('%Y-%m-%d')

def split_organizers(org):
    if pd.isna(org):
        return []
    return [o.strip() for o in re.split(r',|/|&|;', str(org)) if o.strip()]

df['Organizer List'] = df['Organizer'].apply(split_organizers)

organizer_to_venues = defaultdict(set)
for _, row in df.iterrows():
    for org in row['Organizer List']:
        organizer_to_venues[org].add(row['Venue_Year'])

edges = []
for org, venues in organizer_to_venues.items():
    for v1, v2 in itertools.combinations(sorted(venues), 2):
        edges.append((v1, v2, org))

title_agg = df.groupby('Venue_Year')['Title'].apply(
    lambda x: ' | '.join(sorted(set(str(i) for i in x if pd.notna(i))))
).reset_index()

organizer_agg = df.groupby('Venue_Year')['Organizer'].apply(
    lambda x: ' | '.join(sorted(set(str(i) for i in x if pd.notna(i))))
).reset_index()

meta_df = df[['Venue_Year', 'Venue', 'City', 'State', 'Country', 'Year Started', 'Start_Date', 'End_Date']].drop_duplicates(subset='Venue_Year')

nodes_df = meta_df.merge(title_agg, on='Venue_Year').merge(organizer_agg, on='Venue_Year')

nodes_df = nodes_df.rename(columns={
    'Venue_Year': 'Id',
    'Venue': 'Venue_Name',
    'City': 'City',
    'State': 'State',
    'Country': 'Country',
    'Year Started': 'Year',
    'Title': 'Title',
    'Organizer': 'Organizers',
    'Start_Date': 'start',
    'End_Date': 'end'
})

G = nx.Graph()
G.add_edges_from([(src, tgt) for src, tgt, _ in edges])
degree_dict = dict(G.degree())
nodes_df['Degree'] = nodes_df['Id'].map(degree_dict)

edges_df = pd.DataFrame(edges, columns=['Source', 'Target', 'Organizer'])
edges_df['Weight'] = edges_df.groupby(['Source', 'Target'])['Organizer'].transform('count')
edges_df['Weight'] = edges_df['Weight'].astype(float)
edges_df = edges_df.drop_duplicates(subset=['Source', 'Target'])
edges_df = edges_df.sort_values(by='Weight', ascending=False)

nodes_df.to_excel("nodes.xlsx", index=False)
edges_df.to_excel("edges.xlsx", index=False)

flourish_links = edges_df[['Source', 'Target', 'Weight']].rename(columns={
    'Source': 'source',
    'Target': 'target',
    'Weight': 'value'
})
flourish_links.to_excel("flourish_links.xlsx", index=False)

nodes_df['Year_Str'] = nodes_df['Year'].astype(str)

flourish_points = nodes_df[[
    'Id', 'Country', 'City', 'Venue_Name', 'State', 'Title',
    'Organizers', 'start', 'end', 'Degree', 'Year_Str'
]].rename(columns={
    'Id': 'id',
    'Country': 'group',
    'City': 'city',
    'Venue_Name': 'venue',
    'State': 'state',
    'Title': 'title',
    'Organizers': 'organizer',
    'start': 'start_date',
    'end': 'end_date',
    'Degree': 'size',
    'Year_Str': 'year',
})

flourish_points['label'] = flourish_points['city']

cols = list(flourish_points.columns)
cols.insert(2, cols.pop(cols.index('label')))
flourish_points = flourish_points[cols]

flourish_points.to_excel("flourish_points.xlsx", index=False)

In [5]:
flourish_points.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          442 non-null    object 
 1   group       442 non-null    object 
 2   label       432 non-null    object 
 3   city        432 non-null    object 
 4   venue       442 non-null    object 
 5   state       251 non-null    object 
 6   title       442 non-null    object 
 7   organizer   442 non-null    object 
 8   start_date  442 non-null    object 
 9   end_date    442 non-null    object 
 10  size        186 non-null    float64
 11  year        442 non-null    object 
dtypes: float64(1), object(11)
memory usage: 41.6+ KB


In [17]:
# Intervals

flourish_points['year'] = flourish_points['year'].astype(int)

def get_interval(year):
    if 2005 <= year <= 2009:
        return '2005–2009'
    elif 2010 <= year <= 2014:
        return '2010–2014'
    elif 2015 <= year <= 2019:
        return '2015–2019'
    elif 2020 <= year <= 2024:
        return '2020–2024'
    elif year == 2025:
        return '2025'
    else:
        return 'Out of Range'

flourish_points['interval'] = flourish_points['year'].apply(get_interval)

In [21]:
flourish_points.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          442 non-null    object 
 1   group       442 non-null    object 
 2   label       432 non-null    object 
 3   city        432 non-null    object 
 4   venue       442 non-null    object 
 5   state       251 non-null    object 
 6   title       442 non-null    object 
 7   organizer   442 non-null    object 
 8   start_date  442 non-null    object 
 9   end_date    442 non-null    object 
 10  size        186 non-null    float64
 11  year        442 non-null    int32  
 12  interval    442 non-null    object 
dtypes: float64(1), int32(1), object(11)
memory usage: 43.3+ KB


In [23]:
flourish_points

Unnamed: 0,id,group,label,city,venue,state,title,organizer,start_date,end_date,size,year,interval
0,101st Annual Meeting of the Association of Ame...,United States,Denver,Denver,101st Annual Meeting of the Association of Ame...,Colorado,101st Annual Meeting of the Association of Ame...,"Deborah MacPherson, Katy Börner",2005-04-05,2005-04-09,69.0,2005,2005–2009
1,Lorentz Workshop_2005,Netherlands,Leiden,Leiden,Lorentz Workshop,,Simulating the Social Processes of Science,Andrea Scharnhorst,2005-04-07,2005-04-11,15.0,2005,2005–2009
2,"SLIS, Indiana University_2005",United States,Bloomington,Bloomington,"SLIS, Indiana University",Indiana,Networks and Complex Systems Talk Series,Peter A. Hook,2005-04-25,2005-04-25,2.0,2005,2005–2009
3,Abdus Salam International Centre for Theoretic...,Italy,Trieste,Trieste,Abdus Salam International Centre for Theoretic...,,School and Workshop on Structure and Function ...,Katy Börner,2005-05-15,2005-05-27,63.0,2005,2005–2009
4,University of Illinois_2005,United States,Urbana-Champaign,Urbana-Champaign,University of Illinois,Illinois,Understanding Complex Systems 2004 Symposium,Peter A. Hook,2005-05-17,2005-05-20,2.0,2005,2005–2009
...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,City of Bloomington_2025,United States,Bloomington,Bloomington,City of Bloomington,Indiana,Places & Spaces: Mapping Science,Aubrey Seader,2025-04-01,2025-04-30,,2025,2025
438,Arts Plaza_2025,United States,Bloomington,Bloomington,Arts Plaza,Indiana,First Thursdays Festival,Arts and Humanities Council,2025-04-03,2025-04-03,,2025,2025
439,COSMO_2025,Germany,,,COSMO,,Places & Spaces: Mapping Science,Jörg Neumann,2025-05-18,2025-08-08,,2025,2025
440,SBHD_2025,Germany,,,SBHD,,Systems Biology of Human Disease,Franziska Gudrun Muller,2025-06-16,2025-06-18,,2025,2025
