In [None]:

# imports
import sys
import utils
from dotenv import load_dotenv
import os
import pandas as pd
import numpy as np
import config
# Rebind `config` from the imported module to its `assessment_sources` mapping
config = config.assessment_sources

# Spreadsheet ids looked up from that mapping: one per assessment source,
# plus the workbook that holds the term-mapping table
(
    SPREADSHEET_ID_SAT,
    SPREADSHEET_ID_NAEP,
    SPREADSHEET_ID_LSAT,
    SPREADSHEET_ID_GRE,
    SPREADSHEET_ID_GMAT,
    SPREADSHEET_ID_Casper,
    SPREADSHEET_ID_AAMC,
    SPREADSHEET_ID_Mapping,
) = (
    config['spreadsheet_id_SAT'],
    config['spreadsheet_id_NAEP'],
    config['spreadsheet_id_LSAT'],
    config['spreadsheet_id_GRE'],
    config['spreadsheet_id_GMAT'],
    config['spreadsheet_id_Casper'],
    config['spreadsheet_id_AAMC'],
    config['spreadsheet_id_Mapping'],
)


In [None]:
# Obtain the Google Sheets handle from the project helper
# (presumably an authenticated API client -- see utils.connect_to_google)
service = utils.connect_to_google('sheets')

# Each assessment workbook is read from its 'Data' tab, in this fixed order
(df_sat, df_naep, df_lsat, df_gre, df_gmat, df_casper, df_aamc) = (
    utils.read_google_sheet(service, spreadsheet_id, 'Data')
    for spreadsheet_id in (
        SPREADSHEET_ID_SAT,
        SPREADSHEET_ID_NAEP,
        SPREADSHEET_ID_LSAT,
        SPREADSHEET_ID_GRE,
        SPREADSHEET_ID_GMAT,
        SPREADSHEET_ID_Casper,
        SPREADSHEET_ID_AAMC,
    )
)

# The mapping workbook is read from its 'Variable' tab instead
df_Mapping = utils.read_google_sheet(service, SPREADSHEET_ID_Mapping, 'Variable')

In [None]:
# Give df_naep an 'N' column filled with missing values
# (appended as the last column when it does not exist yet)
df_naep['N'] = np.nan

# Empty or whitespace-only cells of the mapping sheet become NaN
df_Mapping = df_Mapping.replace(to_replace=r'^\s*$', value=np.nan, regex=True)

# Registry of the assessment frames, keyed by the name of the variable holding each one
dfs_dict = dict(
    df_sat=df_sat,
    df_naep=df_naep,
    df_lsat=df_lsat,
    df_gre=df_gre,
    df_gmat=df_gmat,
    df_casper=df_casper,
    df_aamc=df_aamc,
)

In [None]:
# Create a mapping dictionary:
#   mappings[<data frame name>][<column name>] = {source term: common term}
mapping_table = df_Mapping
# Initialize a dictionary of dictionaries to store mappings
mappings = {}

# Iterate over each per-data-frame column of the mapping table.
# NOTE: assumes the first two columns are 'CommonTerm' and 'Column' (in either order),
# so everything from the third column on names a data frame -- verify against the sheet.
for col in mapping_table.columns[2:]:
    # Rows where this data frame has no source term must not contribute an entry:
    # blank cells were replaced by NaN, and a NaN key would make Series.map translate
    # genuinely missing data values into an arbitrary CommonTerm.
    has_source_term = mapping_table[col].notna()

    # Create an empty dictionary for each data frame
    mappings[col] = {}

    # Filter the mapping table for each column type (e.g., 'Race', 'Gender')
    for column in mapping_table['Column'].unique():
        filtered_table = mapping_table[(mapping_table['Column'] == column) & has_source_term]
        # Create a sub-dictionary for each column in each data frame
        # (it stays empty when this data frame has no terms for the column)
        mappings[col][column] = dict(zip(filtered_table[col], filtered_table['CommonTerm']))


In [None]:
# Translate source-specific terms to the common vocabulary, frame by frame.
# The frames are modified in place; values without a mapping entry are kept as they are.
for name, df in dfs_dict.items():
    for column in df.columns:
        if column not in mappings[name]:
            # No mapping defined for this column of this frame
            continue
        source_values = df[column]
        translated = source_values.map(mappings[name][column])
        df[column] = translated.fillna(source_values)

# Show each standardized frame, followed by a blank line
for name, df in dfs_dict.items():
    print(f"{name} standardized:", df, "", sep="\n")

In [None]:
# Stack every frame held in dfs_dict into one table, renumbering the rows from 0
combined_df = pd.concat(list(dfs_dict.values()), ignore_index=True)


In [None]:
# For every distinct 'Variable', collect the distinct 'Grouping' values seen with it
# (both in order of first appearance)
variable_grouping_dict = {
    variable: list(
        combined_df.loc[combined_df['Variable'] == variable, 'Grouping'].unique()
    )
    for variable in combined_df['Variable'].unique()
}

# Show the resulting lookup
print(variable_grouping_dict)

In [None]:
# Clean up data: give the combined table explicit dtypes

# Target dtype per column of combined_df
column_types = dict(
    Variable='category',
    Subject='category',
    Year='int64',
    Jurisdiction='category',
    Grouping='category',
    Mean='float64',
    SD='float64',
    N='float64',
)

# Cast all listed columns in one call (columns not listed keep their dtype)
combined_df = combined_df.astype(dtype=column_types)

# Show the resulting dtypes as a check
print(combined_df.dtypes)


# Cohen's d

In [None]:
import pandas as pd
import numpy as np

# Baseline category per demographic variable; every other group of that
# variable is compared against it when computing Cohen's d
reference_groups = {'Gender': 'Female', 'Race/Ethnicity': 'White'}

def cohen_d(mean1, mean2, sd1, sd2):
    """Return Cohen's d of group 2 relative to group 1.

    The standardizer is the root of the unweighted average of the two
    variances, i.e. sample sizes are not taken into account.

    Parameters
    ----------
    mean1, sd1 : mean and standard deviation of the first (baseline) group
    mean2, sd2 : mean and standard deviation of the second group

    Returns
    -------
    (mean2 - mean1) divided by sqrt((sd1**2 + sd2**2) / 2); positive when the
    second group's mean is the larger one.
    """
    variance_sum = sd1 ** 2 + sd2 ** 2
    pooled_sd = np.sqrt(variance_sum / 2)
    mean_difference = mean2 - mean1
    return mean_difference / pooled_sd

def calculate_cohens_d(combined_df, reference_groups):
    """Compute Cohen's d of every group against its variable's reference group.

    Rows are grouped by ('Variable', 'Subject', 'Year', 'Jurisdiction'). Within a
    group, each row whose 'Grouping' differs from the reference group is compared
    with the first row whose 'Grouping' equals the reference group, using the
    notebook's ``cohen_d`` helper on the 'Mean' and 'SD' columns.

    Parameters
    ----------
    combined_df : pandas.DataFrame
        Needs the columns 'Variable', 'Subject', 'Year', 'Jurisdiction',
        'Grouping', 'Mean' and 'SD'.
    reference_groups : dict
        Maps a 'Variable' value to the 'Grouping' value used as its baseline.
        Variables that are not keys of this dict are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per comparison with the columns 'Variable', 'Subject', 'Year',
        'Jurisdiction', 'Reference Group', 'Comparison Group' and "Cohen's d".
        These columns are present even when there is nothing to compare, so
        callers can always select them.
    """
    key_columns = ['Variable', 'Subject', 'Year', 'Jurisdiction']
    result_columns = key_columns + ['Reference Group', 'Comparison Group', "Cohen's d"]
    results = []

    # Iterate the groups directly instead of re-filtering the whole frame per key.
    # observed=True restricts categorical keys to combinations that actually occur.
    for group_key, subset_df in combined_df.groupby(key_columns, observed=True):
        variable, subject, year, jurisdiction = group_key

        # Only variables with a declared reference group are analysed
        if variable not in reference_groups:
            continue
        reference_group = reference_groups[variable]

        is_reference = subset_df['Grouping'] == reference_group
        if not is_reference.any():
            # No baseline row in this group, so nothing can be compared
            continue

        # When several baseline rows exist, the first one is used
        mean_ref = subset_df.loc[is_reference, 'Mean'].values[0]
        sd_ref = subset_df.loc[is_reference, 'SD'].values[0]

        # Calculate Cohen's d for each remaining group compared to the reference group
        for _, row in subset_df[~is_reference].iterrows():
            results.append({
                'Variable': variable,
                'Subject': subject,
                'Year': year,
                'Jurisdiction': jurisdiction,
                'Reference Group': reference_group,
                'Comparison Group': row['Grouping'],
                "Cohen's d": cohen_d(mean_ref, row['Mean'], sd_ref, row['SD']),
            })

    # Passing the column list keeps the schema stable for an empty result
    return pd.DataFrame(results, columns=result_columns)

# Effect sizes of every group against its reference group, for the combined table
cohens_d_df = calculate_cohens_d(combined_df=combined_df, reference_groups=reference_groups)
print(cohens_d_df)

In [72]:
# NOTE(review): in the visible notebook, merged_df is only assigned in a later cell, so
# running the cells top to bottom on a fresh kernel raises NameError here. The execution
# count of this cell (72) is also higher than those of the cells that follow (68-71),
# i.e. it was run out of order -- consider moving it below the merge.
merged_df.columns


Index(['Variable', 'Subject', 'Grade', 'Year', 'Jurisdiction', 'Grouping',
       'Mean', 'SD', 'N', 'Cohen's d'],
      dtype='object')

In [None]:
# Columns identifying one group row; they are shared by both tables once
# 'Comparison Group' is renamed to 'Grouping'
merge_keys = ['Variable', 'Subject', 'Year', 'Jurisdiction', 'Grouping']

# Align the effect-size table with combined_df: rename, then keep the keys plus the value
cohens_d_df = cohens_d_df.rename(columns={'Comparison Group': 'Grouping'})
cohens_d_df = cohens_d_df[merge_keys + ["Cohen's d"]]

# Left join, so every row of combined_df survives even without an effect size
merged_df = pd.merge(combined_df, cohens_d_df, how='left', on=merge_keys)

# Rows of a reference group get an effect size of 0
for variable, reference_group in reference_groups.items():
    is_reference_row = (
        (merged_df['Variable'] == variable)
        & (merged_df['Grouping'] == reference_group)
    )
    merged_df.loc[is_reference_row, "Cohen's d"] = 0

# Show the merged table as a check
print(merged_df)

In [68]:
# Persist the merged table as CSV in the data folder, leaving out the row index
merged_csv_path = 'data/merged_data.csv'
merged_df.to_csv(merged_csv_path, index=False)

In [69]:

# Row order of the effect-size table, listed assessment by assessment
subjects_ordered = [
    # NAEP
    'NAEP - Science - 4', 'NAEP - Reading - 4',
    'NAEP - Reading - 8', 'NAEP - Science - 8',
    'NAEP - Science - 12', 'NAEP - Reading - 12',
    # SAT
    'SAT - Total', 'SAT - Math', 'SAT - ERW',
    # Casper
    'Casper',
    # MCAT
    'MCAT Total', 'MCAT CPBS', 'MCAT CARS', 'MCAT BBLS', 'MCAT PSBB',
    # AAMC GPA
    'AAMC - GPA Total', 'AAMC - GPA Science', 'AAMC - GPA Non-Science',
    # GRE
    'GRE - Analytical Writing', 'GRE - Quantitative', 'GRE - Verbal',
    # GMAT
    'GMAT - Total Score',
    # LSAT
    'LSAT',
]

# Column order of the table: comparison groups first, the 'White' reference group last
race_order = [
    'Black', 'Hispanic', 'Asian',
    'American Indian/Alaska Native',
    'Native Hawaiian/Other Pacific Islander',
    'Another Race/Ethnicity',
    'Multiple Races/Ethnicities',
    'White',
]

In [70]:

# Keep only the national ('US') rows of the 'Race/Ethnicity' variable.
# The explicit .copy() makes filtered_df an independent frame, so the column
# assignment below cannot trigger SettingWithCopyWarning.
is_us_race_row = (
    (merged_df['Variable'] == 'Race/Ethnicity') &
    (merged_df['Jurisdiction'] == 'US')
)
filtered_df = merged_df[is_us_race_row].copy()

# Convert year into a category. Plain column assignment replaces the column;
# `.loc[:, 'Year'] = ...` writes into the existing column and keeps its old dtype
# on recent pandas versions, which made the conversion a no-op.
filtered_df['Year'] = filtered_df['Year'].astype('category')

# Creating a pivot table with 'Grouping' as columns,
# 'Subject' and 'Year' as rows, and 'Cohen\'s d' as values
pivot_df = filtered_df.pivot_table(
    index=['Subject', 'Year'],
    columns='Grouping',
    values="Cohen's d",
    aggfunc='first',  # This picks the first value if there are duplicates
    observed=True     # Only Subject/Year combinations that occur in the data
)

# Reindex the rows to match the specified order
# (subjects missing from subjects_ordered are dropped)
pivot_df = pivot_df.reindex(subjects_ordered, level=0)

# Reorder the columns to match the race order. reindex (rather than
# pivot_df[race_order]) yields an all-NaN column instead of a KeyError when a
# group has no data; pivot_table drops columns that are entirely NaN.
pivot_df = pivot_df.reindex(columns=race_order)

# Combine 'Subject' and 'Year' into a single label for the index
pivot_df.index = pivot_df.index.map(lambda x: f'{x[0]} ({x[1]})')

# Resetting the index for a cleaner look (the labels end up in a column named 'index')
pivot_df = pivot_df.reset_index()

# Display the result
pivot_df

Grouping,index,Black,Hispanic,Asian,American Indian/Alaska Native,Native Hawaiian/Other Pacific Islander,Another Race/Ethnicity,Multiple Races/Ethnicities,White
0,NAEP - Science - 4 (2019),-0.996127,-0.734515,0.146024,-0.645576,-0.611066,,-0.126202,0.0
1,NAEP - Reading - 4 (2019),-0.734293,-0.583584,0.248755,-0.69834,-0.51153,,-0.12484,0.0
2,NAEP - Reading - 8 (2019),-0.791885,-0.57078,0.315898,-0.678497,-0.540574,,-0.146389,0.0
3,NAEP - Science - 8 (2019),-1.003581,-0.744069,0.109537,-0.671662,-0.789898,,-0.193384,0.0
4,NAEP - Science - 12 (2019),-1.033937,-0.727622,0.136405,-0.562197,-0.68029,,-0.139099,0.0
5,NAEP - Reading - 12 (2019),-0.781345,-0.524361,0.106373,-0.583162,-0.442316,,0.002694,0.0
6,SAT - Total (2023),-0.78733,-0.628959,0.61991,-0.819005,-0.710407,,0.040724,0.0
7,SAT - Math (2023),-0.745902,-0.581967,0.770492,-0.729508,-0.655738,,0.02459,0.0
8,SAT - ERW (2023),-0.763636,-0.618182,0.390909,-0.836364,-0.7,,0.054545,0.0
9,Casper (2022),-0.705919,-0.412546,-0.021598,-0.435009,,-0.068176,,0.0


In [71]:

# Shade the table with the 'RdYlBu' colour map (any matplotlib colour map name works,
# e.g. 'coolwarm' or 'viridis').
# axis=None computes one colour scale over the whole table rather than per row or column.
styled_df = pivot_df.style.background_gradient(axis=None, cmap='RdYlBu')

# Render the styled table
styled_df

Grouping,index,Black,Hispanic,Asian,American Indian/Alaska Native,Native Hawaiian/Other Pacific Islander,Another Race/Ethnicity,Multiple Races/Ethnicities,White
0,NAEP - Science - 4 (2019),-0.996127,-0.734515,0.146024,-0.645576,-0.611066,,-0.126202,0.0
1,NAEP - Reading - 4 (2019),-0.734293,-0.583584,0.248755,-0.69834,-0.51153,,-0.12484,0.0
2,NAEP - Reading - 8 (2019),-0.791885,-0.57078,0.315898,-0.678497,-0.540574,,-0.146389,0.0
3,NAEP - Science - 8 (2019),-1.003581,-0.744069,0.109537,-0.671662,-0.789898,,-0.193384,0.0
4,NAEP - Science - 12 (2019),-1.033937,-0.727622,0.136405,-0.562197,-0.68029,,-0.139099,0.0
5,NAEP - Reading - 12 (2019),-0.781345,-0.524361,0.106373,-0.583162,-0.442316,,0.002694,0.0
6,SAT - Total (2023),-0.78733,-0.628959,0.61991,-0.819005,-0.710407,,0.040724,0.0
7,SAT - Math (2023),-0.745902,-0.581967,0.770492,-0.729508,-0.655738,,0.02459,0.0
8,SAT - ERW (2023),-0.763636,-0.618182,0.390909,-0.836364,-0.7,,0.054545,0.0
9,Casper (2022),-0.705919,-0.412546,-0.021598,-0.435009,,-0.068176,,0.0
