# Visualize which fields are most studied in a given region

## Bubble Map Paradigm

In [4]:
import plotly.express as px
import pandas as pd

# Toy dataset: one entry per university, giving its research field together
# with its faculty and publication counts.
data = dict(
    University=['University A', 'University B', 'University C', 'University D', 'University E'],
    FacultyCount=[100, 150, 80, 120, 90],
    PublicationCount=[50, 80, 30, 60, 40],
    Field=['AI', 'Systems', 'Network', 'AI', 'Network'],
)

df = pd.DataFrame(data)

# Collapse the universities into a single row per field by summing both count
# columns; 'Field' stays a regular column rather than becoming the index.
agg_df = df.groupby('Field', as_index=False)[['FacultyCount', 'PublicationCount']].sum()

# Bubble chart of the aggregated sample: one bubble per field, positioned by the
# two summed counts, sized by the faculty total and coloured by field.
fig = px.scatter(
    agg_df,
    x='FacultyCount',
    y='PublicationCount',
    size='FacultyCount',
    size_max=50,
    color='Field',
    hover_name='Field',
    labels={
        'FacultyCount': 'Aggregated Faculty Count',
        'PublicationCount': 'Aggregated Publication Count',
    },
    title='Popularity of Fields in the Region (Aggregated)',
    template='plotly_dark',
)

# Render the figure in the notebook output.
fig.show()


## Get Dependencies

In [1]:
import dash
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px
import pandas as pd
import numpy as np
import glob
from utils import get_region_and_field

## Get Data and Visualize

In [47]:
# Locate the per-region, per-field institution tables. glob returns matches in
# arbitrary, filesystem-dependent order, so the paths are sorted to keep the row
# order of the table (and the category order on the plot axes) reproducible.
all_files = sorted(glob.glob('../detailed/Institutions_*_*.csv'))

# One record per (region, field) file, holding that file's column totals.
records = []
for file in all_files:
    region, field = get_region_and_field(file)
    # Files whose field is 'all' are skipped -- presumably they combine every
    # field and would double count the per-field files (TODO confirm).
    if field == 'all':
        continue

    df = pd.read_csv(file)
    records.append({
        'Field': field,
        'Region': region,
        'TotalFacultyCount': df['Faculty Count'].sum(),
        'TotalPublicationCount': df['Publication Count'].sum(),
    })

# Build the table in one go. The columns are listed explicitly so the frame has
# the same schema even when no file matched.
aggregated_data = pd.DataFrame(
    records,
    columns=['Field', 'Region', 'TotalFacultyCount', 'TotalPublicationCount'],
)

# Field-by-region grid of bubbles: bubble size follows the publication total and
# the continuous colour scale follows the faculty total.
fig = px.scatter(
    aggregated_data,
    x='Field',
    y='Region',
    size='TotalPublicationCount',
    color='TotalFacultyCount',
    color_continuous_scale='Viridis',
    labels={
        'TotalFacultyCount': 'Total Faculty Count',
        'TotalPublicationCount': 'Total Publication Count',
    },
    title='Popularity of Fields in the Region (Aggregated)',
    template='plotly_dark',
)

# Data caveat, anchored in paper-relative coordinates (x=0, y=1.05) and drawn
# as plain text without an arrow.
fig.add_annotation(
    x=0,
    y=1.05,
    xref='paper',
    yref='paper',
    text=(
        'Note: The count for each field is added from different number of top '
        'institutions in the region available on csranking, not the total '
        'institutions in the region. '
    ),
    font={'size': 12, 'color': 'white'},
    showarrow=False,
)

# Display the figure, then export it as a standalone HTML page.
fig.show()
fig.write_html('../images/field_study.html')

## Analyze regional discrepancies

- Set the x-axis to `stdAverageFacultyCount`.
- Set the y-axis to `stdAveragePublicationCount`.
- Set the z-axis to the fields.
- Use a different color for each region.
- Use the bubble size to represent `AverageDiscrepencies`.

In [49]:
def abs_zscore(values):
    """Return the absolute z-score of every entry of ``values`` as a list.

    Each entry is standardized with the mean and the population standard
    deviation (``ddof=0``) of ``values``; the sign is then dropped.

    Args:
        values: Sequence or array of numbers.

    Returns:
        list[float]: One absolute z-score per input entry. An empty input
        yields an empty list. When every entry is equal (which includes the
        single-entry case) each score is ``0.0`` rather than the NaN that
        dividing by a zero standard deviation would produce.
    """
    arr = np.asarray(values, dtype=float)
    if arr.size == 0:
        return []
    spread = arr.std()
    if spread == 0:
        # No variation at all: every entry sits exactly on the mean.
        return [0.0] * arr.size
    return np.abs((arr - arr.mean()) / spread).tolist()


# Locate the discrepancy tables. glob returns matches in arbitrary order, so the
# paths are sorted to keep the row order of the summary reproducible.
all_files = sorted(glob.glob('../Discrepancies/Discrepancy_*_*.csv'))

fields = []
regions = []
avg_discrep = []
sum_fc = []
sum_pc = []

for file in all_files:
    df = pd.read_csv(file)

    # Field and region labels are taken from the first row of each file
    # (positional lookup, independent of the index labels).
    fields.append(df['field'].iloc[0])
    regions.append(df['region'].iloc[0])

    # Mean magnitude of the discrepancy column, ignoring its sign.
    avg_discrep.append(df['Discrepancy'].abs().mean())

    # Per-file totals; they are standardized across all files below.
    sum_fc.append(df['Faculty Count'].sum())
    sum_pc.append(df['Publication Count'].sum())

sum_fc = np.array(sum_fc)
sum_pc = np.array(sum_pc)

# Despite the "avg" in their names, these hold the absolute z-score of each
# file's total relative to the totals of all files.
avg_faculty_counts = abs_zscore(sum_fc)
avg_publication_counts = abs_zscore(sum_pc)

In [54]:
# Combine the per-file lists into one summary table and replace every missing
# value with 0 in the same step -- presumably so the bubble sizes plotted from
# this table are always numeric (TODO confirm).
avg_data = pd.DataFrame(dict(
    Field=fields,
    Region=regions,
    AverageDiscrepencies=avg_discrep,
    stdAverageFacultyCount=avg_faculty_counts,
    stdAveragePublicationCount=avg_publication_counts,
)).fillna(0)

# 3D scatter of the summary table:
#   x    -> stdAverageFacultyCount
#   y    -> stdAveragePublicationCount
#   z    -> Field
#   colour -> Region
#   bubble size -> AverageDiscrepencies
fig = px.scatter_3d(
    avg_data,
    x='stdAverageFacultyCount',
    y='stdAveragePublicationCount',
    z='Field',
    size='AverageDiscrepencies',
    color='Region',
    # Optional fixed palette, currently disabled:
    # color_discrete_map={'EURO': 'red', 'WORLD': 'green', 'USA': 'blue', 'CA': 'purple'},
    labels={
        'stdAverageFacultyCount': 'ABS Standardized Average Faculty Count',
        'stdAveragePublicationCount': 'ABS Standardized Average Publication Count',
        'AverageDiscrepencies': 'Average Discrepancies',
    },
    title='3D Scatter Plot with Averages and Discrepancies',
    template='plotly_dark',
)

# Display the figure, then export it as a standalone HTML page.
fig.show()
fig.write_html('../images/field_study_dicrep.html')