In [1]:
import pandas as pd
import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from utils import sections

In [2]:
# Load the data
data = pd.read_csv("data/survey_output.csv")  # Update with the actual file path or data source


# Filter data by cohort
us_data = data[data['Which country are you located in?'] == 'United States']
other_data = data[data['Which country are you located in?'] != 'United States']


In [3]:
# Initialize a results DataFrame to store differences
results = []

# Loop through each question
for section, questions in sections.items():
    for question in questions:
        if question not in data.columns:
            continue  # Skip questions not in the dataset

        # Calculate frequencies and percentages for United States
        us_freq = us_data[question].value_counts(normalize=True).reset_index()
        us_freq.columns = ['Response', 'US_Percentage']
        us_freq['US_Percentage'] = (us_freq['US_Percentage'] * 100).round(2)

        # Calculate frequencies and percentages for Other countries
        other_freq = other_data[question].value_counts(normalize=True).reset_index()
        other_freq.columns = ['Response', 'Other_Percentage']
        other_freq['Other_Percentage'] = (other_freq['Other_Percentage'] * 100).round(2)

        # Merge the two frequency tables
        combined_freq = pd.merge(us_freq, other_freq, on='Response', how='outer').fillna(0)

        # Calculate the absolute difference in percentages
        combined_freq['Difference'] = (combined_freq['US_Percentage'] - combined_freq['Other_Percentage']).abs()

        # Add the question and section to the results
        combined_freq['Question'] = question
        combined_freq['Section'] = section

        # Append to the results list
        results.append(combined_freq)

# Concatenate all results into a single DataFrame
results_df = pd.concat(results, ignore_index=True)

# Sort by the greatest difference in percentages
results_df = results_df.sort_values(by='Difference', ascending=False)

# Display the questions with the greatest differences
results_df_top = results_df[['Section', 'Question', 'Response', 'US_Percentage', 'Other_Percentage', 'Difference']]


In [4]:

# Display the top results
print("Questions with the greatest differences between United States and Other countries:")
print(results_df_top.head(20))  # Show top 20 differences

Questions with the greatest differences between United States and Other countries:
                     Section  \
3               Demographics   
2               Demographics   
94     Policy and Adaptation   
89     Policy and Adaptation   
84     Policy and Adaptation   
86     Policy and Adaptation   
87     Policy and Adaptation   
91     Policy and Adaptation   
10              Demographics   
14              Demographics   
119  Value and Communication   
117  Value and Communication   
46           Program Details   
127          Brand Awareness   
108  Value and Communication   
103      Holistic Admissions   
97       Holistic Admissions   
96       Holistic Admissions   
115  Value and Communication   
68         AI and Admissions   

                                              Question  \
3                    Which country are you located in?   
2                    Which country are you located in?   
94   What solutions is your institution considering...   
89   What ar

In [5]:
import plotly.express as px

# Filter the top 20 rows for visualization
results_df_top_20 = results_df_top.head(20)

# Create an interactive bar chart using Plotly
fig = px.bar(
    results_df_top_20,
    x="Difference",
    y="Question",
    color="Response",
    orientation="h",
    text="Difference",
    hover_data=["US_Percentage", "Other_Percentage"],
    title="Top 20 Questions with Greatest Differences Between United States and Other Countries",
    labels={
        "Difference": "Absolute Difference (%)",
        "Question": "Survey Question",
        "Response": "Response"
    }
)

# Update layout for better readability
fig.update_traces(textposition="outside")
fig.update_layout(
    yaxis=dict(title="Survey Question"),
    xaxis=dict(title="Absolute Difference (%)"),
    title_x=0.5,
    height=800  # Adjust height for better visualization
)

# Show the interactive plot
fig.show()

In [6]:
# Export the results to an interactive HTML table
results_df_top.to_html("greatest_differences.html", index=False, escape=False)

print("Interactive HTML table saved as 'greatest_differences.html'")

Interactive HTML table saved as 'greatest_differences.html'


In [8]:
# Export the top 20 results to a CSV file
results_df_top.to_csv("data/greatest_differences.csv", index=False)

print("CSV file saved as 'greatest_differences.csv'")

CSV file saved as 'greatest_differences.csv'
