In [63]:
# Import the cleaned data set
import pandas as pd

# Read the cleaned results table from disk and preview its first five rows
file_path = '../data/cleaned_data.csv'
data = pd.read_csv(filepath_or_buffer=file_path)
data.head(n=5)

Unnamed: 0,id,stage,accuracy,completeness,innovation,difficulty,n_user_messages,n_internet_resources,time_to_complete_sec,cohort,assignment,llm_experience
0,0,ideation,7,2,1,6.0,0.0,23.0,2075.0,expert,llm_internet,Used a few times
1,0,acquisition,8,6,0,4.0,0.0,22.0,3175.0,expert,llm_internet,Used a few times
2,0,magnification,4,3,0,6.0,0.0,17.0,2158.0,expert,llm_internet,Used a few times
3,0,formulation,3,2,0,2.0,0.0,5.0,1151.0,expert,llm_internet,Used a few times
4,0,release,6,2,0,5.0,0.0,23.0,1816.0,expert,llm_internet,Used a few times


In [209]:
# Import the necessary libraries
from scipy.stats import ttest_ind, t
import matplotlib.pyplot as plt
import numpy as np

In [224]:
# Define statistical analysis helper functions
# These actually perform the statistical tests

def compare_performance(data, metric, group_by='assignment', compare_groups=('llm_internet', 'internet_only'), alpha=0.05):
    """
    Compare one metric between two groups using Welch's two-sample t-test.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the `metric` and `group_by` columns.
    metric : str
        Name of the column holding the values to compare.
    group_by : str, default 'assignment'
        Name of the column whose values identify the groups.
    compare_groups : sequence of two labels
        `compare_groups[0]` is reported under 'mean_with_llms' and
        `compare_groups[1]` under 'mean_without_llms'.
    alpha : float, default 0.05
        Significance threshold; the result is flagged significant when
        the p-value is strictly below it.

    Returns
    -------
    dict
        'mean_with_llms', 'mean_without_llms' : group means (NaN-skipping).
        't_stat' : ABSOLUTE value of the t-statistic, so it carries no
            direction; compare the two means to see which group is higher.
        'p_value' : two-sided p-value from the test.
        'significant' : plain bool, True when p_value < alpha.
        When either group has fewer than two non-missing observations the
        test is undefined: 't_stat' and 'p_value' are NaN and 'significant'
        is False.
    """
    first_label, second_label = compare_groups[0], compare_groups[1]

    # Split the data into the two groups being compared
    group1 = data.loc[data[group_by] == first_label, metric]
    group2 = data.loc[data[group_by] == second_label, metric]

    if group1.count() < 2 or group2.count() < 2:
        # Welch's test needs a variance estimate from each group; skip the call
        # (and its runtime warnings) when that is impossible.
        t_stat = p_value = float('nan')
    else:
        # equal_var=False selects Welch's t-test; NaNs are dropped, matching .mean()
        t_stat, p_value = ttest_ind(group1, group2, nan_policy='omit', equal_var=False)

    return {
        'mean_with_llms': group1.mean(),
        'mean_without_llms': group2.mean(),
        't_stat': abs(t_stat),
        'p_value': p_value,
        # NaN < alpha evaluates to False, so undefined tests are never flagged
        'significant': bool(p_value < alpha),
    }

In [214]:
# Define statistical analysis main functions
# TODO: understand how I would adapt this function to a different dataset

def compare_performance_across_multiple_segments(data, metrics, group_by='assignment', compare_groups=None, segment_dimensions=None):
    """
    Run `compare_performance` for every metric within every segment of the data.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing the metric columns, the `group_by` column and any
        segment columns.
    metrics : iterable of str
        Metric column names; each one is compared separately.
    group_by : str, default 'assignment'
        Column whose values identify the groups to compare.
    compare_groups : sequence of labels, optional
        Groups to compare; only the first two are used. When omitted, they
        are taken from the non-missing values of `data[group_by]` in order of
        first appearance in the FULL data set, so every segment is compared
        on the same pair in the same order.
    segment_dimensions : str or list of str, optional
        Column(s) to segment by. When omitted, a single comparison per metric
        is run on the whole data set.

    Returns
    -------
    pandas.DataFrame
        One row per (segment, metric): a 'Metric' column, the keys returned
        by `compare_performance`, then one column per segment dimension.
        Empty when fewer than two groups are available.
    """
    # Normalise the segmentation spec: None -> no segmentation, 'col' -> ['col']
    if not segment_dimensions:
        segment_dimensions = []
    elif isinstance(segment_dimensions, str):
        segment_dimensions = [segment_dimensions]
    else:
        segment_dimensions = list(segment_dimensions)

    # Enumerate the segments to analyse
    if len(segment_dimensions) > 1:
        # Sorted, observed combinations of the segment columns
        segment_combinations = (
            data.groupby(segment_dimensions).size().reset_index()[segment_dimensions].values.tolist()
        )
    elif segment_dimensions:
        # Single dimension: values in order of first appearance
        segment_combinations = [(value,) for value in data[segment_dimensions[0]].unique()]
    else:
        # One empty "segment" that applies no filter at all
        segment_combinations = [()]

    # Resolve the groups once, up front. Detecting them inside the loop would
    # lock in whatever the first segment happens to contain (possibly a single
    # group, which would silently drop every row of the result).
    if compare_groups is None:
        compare_groups = data[group_by].dropna().unique()
    group_pair = tuple(compare_groups[:2])

    results = []
    if len(group_pair) < 2:
        return pd.DataFrame(results)

    for segment in segment_combinations:
        # Narrow the data down to the current combination of segment values
        segment_data = data
        for dim, val in zip(segment_dimensions, segment):
            segment_data = segment_data[segment_data[dim] == val]

        for metric in metrics:
            result_entry = {
                'Metric': metric,
                **compare_performance(segment_data, metric, group_by, group_pair),
            }
            # Record which segment this row describes
            result_entry.update(zip(segment_dimensions, segment))
            results.append(result_entry)

    return pd.DataFrame(results)


In [212]:
data.shape

# Why the result tables have so few rows: compare_performance_across_multiple_segments
# appends one row per (segment, metric) pair, not one per observation. With one metric
# the row count equals the number of segment combinations (e.g. cohort x stage),
# regardless of how many rows `data` has.

(500, 12)

# Accuracy

In [225]:
# Compare 'accuracy' between the 'assignment' groups within each (cohort, stage) segment
segment_dimensions = ['cohort', 'stage']
metrics_to_compare = ['accuracy']

accuracy_comparison_results = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)


In [226]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = accuracy_comparison_results.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,stage
0,accuracy,4.76,3.52,1.484205,0.144292,False,expert,acquisition
1,accuracy,5.16,4.08,2.21069,0.031914,True,expert,formulation
2,accuracy,7.32,6.6,0.790747,0.43305,False,expert,ideation
3,accuracy,4.64,3.52,2.547209,0.014603,True,expert,magnification
4,accuracy,4.08,3.4,1.278099,0.208393,False,expert,release
5,accuracy,2.16,3.24,1.775829,0.083172,False,student,acquisition
6,accuracy,4.0,3.12,1.514243,0.136522,False,student,formulation
7,accuracy,7.0,6.76,0.255223,0.799642,False,student,ideation
8,accuracy,3.48,2.52,1.740761,0.088971,False,student,magnification
9,accuracy,3.36,3.12,0.479361,0.633991,False,student,release


In [240]:
# Compare 'accuracy' between the 'assignment' groups within each (cohort, llm_experience) segment
segment_dimensions = ['cohort', 'llm_experience']
metrics_to_compare = ['accuracy']

accuracy_comparison_results_exp = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)

In [241]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = accuracy_comparison_results_exp.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,llm_experience
0,accuracy,5.033333,4.2,1.701773,0.091297,False,expert,Never used
1,accuracy,5.533333,3.6,1.53804,0.180565,False,expert,Use at least once every few weeks
2,accuracy,5.171429,4.32,1.612998,0.111146,False,expert,Used a few times
3,accuracy,4.2,,,,False,student,Never used
4,accuracy,4.0,3.4,1.177558,0.241264,False,student,Use almost every day
5,accuracy,4.125,4.333333,0.325405,0.745932,False,student,Use at least once every few weeks
6,accuracy,3.7,3.933333,0.280342,0.780775,False,student,Used a few times


#### Interpreting Our Findings
To interpret the implications of the t-stats (which have no meaning in and of themselves), we need to look at the distributions or grab the p-value. If the p-value is high (>0.05), then regardless of what the t-value suggests, we do not have statistically significant results.

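The t-statistic reported here is an absolute value, so it conveys only the size of the difference, not its direction. A higher t-statistic with a low p-value indicates a statistically significant difference between the two groups; compare `mean_with_llms` and `mean_without_llms` to see which group has the higher mean.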

In our case:
- Expert accuracy in the formulation and magnification stages was positively impacted
- No other statistically significant findings related to stage
- No statistically significant findings related to previous LLM experience

# Completeness

In [227]:
# Compare 'completeness' between the 'assignment' groups within each (cohort, stage) segment
segment_dimensions = ['cohort', 'stage']
metrics_to_compare = ['completeness']

completeness_comparison_results = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)


In [228]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = completeness_comparison_results.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,stage
0,completeness,5.64,4.52,1.688464,0.097934,False,expert,acquisition
1,completeness,3.16,2.2,2.673552,0.010771,True,expert,formulation
2,completeness,5.32,4.64,1.204594,0.234268,False,expert,ideation
3,completeness,3.36,2.24,2.009851,0.050783,False,expert,magnification
4,completeness,2.36,2.0,1.160285,0.25168,False,expert,release
5,completeness,4.04,3.64,0.741249,0.462156,False,student,acquisition
6,completeness,2.48,1.72,1.81433,0.076748,False,student,formulation
7,completeness,4.64,4.36,0.59805,0.552784,False,student,ideation
8,completeness,1.6,1.44,0.354594,0.724498,False,student,magnification
9,completeness,2.2,1.76,1.26526,0.21252,False,student,release


In [242]:
# Compare 'completeness' between the 'assignment' groups within each (cohort, llm_experience) segment
segment_dimensions = ['cohort', 'llm_experience']
metrics_to_compare = ['completeness']

completeness_comparison_results_exp = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)

In [243]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = completeness_comparison_results_exp.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,llm_experience
0,completeness,4.116667,3.314286,1.999045,0.047913,True,expert,Never used
1,completeness,3.7,2.4,1.925668,0.079448,False,expert,Use at least once every few weeks
2,completeness,3.942857,2.92,2.431489,0.017253,True,expert,Used a few times
3,completeness,2.2,,,,False,student,Never used
4,completeness,2.766667,2.538462,0.666258,0.506494,False,student,Use almost every day
5,completeness,3.45,2.3,2.629561,0.010566,True,student,Use at least once every few weeks
6,completeness,2.95,2.966667,0.028277,0.977565,False,student,Used a few times


#### Interpreting Our Findings
Our findings:
- Expert completeness in the formulation stage was positively impacted
- No other statistically significant findings related to stage
- For experts who had never used an LLM, or had used one only a few times, completeness was positively impacted
- For students who use an LLM at least once every few weeks, completeness was positively impacted

# Innovation 

In [229]:
# Compare 'innovation' between the 'assignment' groups within each (cohort, stage) segment
segment_dimensions = ['cohort', 'stage']
metrics_to_compare = ['innovation']

innovation_comparison_results = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)


In [230]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = innovation_comparison_results.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,stage
0,innovation,0.68,0.44,0.911465,0.366609,False,expert,acquisition
1,innovation,1.6,1.16,0.900149,0.373948,False,expert,formulation
2,innovation,1.6,1.52,0.237217,0.813498,False,expert,ideation
3,innovation,0.56,0.12,1.942017,0.060911,False,expert,magnification
4,innovation,0.4,0.32,0.498703,0.620464,False,expert,release
5,innovation,0.2,0.24,0.301511,0.764351,False,student,acquisition
6,innovation,0.72,0.72,0.0,1.0,False,student,formulation
7,innovation,1.92,1.32,1.538293,0.131583,False,student,ideation
8,innovation,0.2,0.52,1.380474,0.173836,False,student,magnification
9,innovation,0.36,0.12,1.66946,0.103677,False,student,release


In [244]:
# Compare 'innovation' between the 'assignment' groups within each (cohort, llm_experience) segment
segment_dimensions = ['cohort', 'llm_experience']
metrics_to_compare = ['innovation']

innovation_comparison_results_exp = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)

In [245]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = innovation_comparison_results_exp.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,llm_experience
0,innovation,1.033333,0.757143,1.150872,0.252626,False,expert,Never used
1,innovation,0.6,0.2,1.636131,0.137646,False,expert,Use at least once every few weeks
2,innovation,1.171429,0.7,1.70118,0.093861,False,expert,Used a few times
3,innovation,0.4,,,,False,student,Never used
4,innovation,0.7,0.630769,0.331986,0.74048,False,student,Use almost every day
5,innovation,0.55,0.533333,0.07846,0.937706,False,student,Use at least once every few weeks
6,innovation,0.95,0.533333,1.010728,0.322168,False,student,Used a few times


#### Interpreting Our Findings
Our findings:
- No statistically significant findings

# Difficulty

In [231]:
# Compare 'difficulty' between the 'assignment' groups within each (cohort, stage) segment
segment_dimensions = ['cohort', 'stage']
metrics_to_compare = ['difficulty']

difficulty_comparison_results = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)


In [232]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = difficulty_comparison_results.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,stage
0,difficulty,6.041667,6.96,1.259188,0.214461,False,expert,acquisition
1,difficulty,3.875,4.48,0.960675,0.341635,False,expert,formulation
2,difficulty,4.458333,4.32,0.194563,0.846597,False,expert,ideation
3,difficulty,4.041667,3.92,0.155278,0.877292,False,expert,magnification
4,difficulty,2.916667,2.52,0.903942,0.370877,False,expert,release
5,difficulty,6.96,7.04,0.123462,0.902292,False,student,acquisition
6,difficulty,4.52,5.64,1.505652,0.138714,False,student,formulation
7,difficulty,4.72,5.16,0.643634,0.522903,False,student,ideation
8,difficulty,3.6,5.08,2.23186,0.030439,True,student,magnification
9,difficulty,2.88,3.08,0.520814,0.604891,False,student,release


In [246]:
# Compare 'difficulty' between the 'assignment' groups within each (cohort, llm_experience) segment
segment_dimensions = ['cohort', 'llm_experience']
metrics_to_compare = ['difficulty']

difficulty_comparison_results_exp = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)

In [247]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = difficulty_comparison_results_exp.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,llm_experience
0,difficulty,4.266667,4.614286,0.745186,0.457527,False,expert,Never used
1,difficulty,4.366667,4.4,0.054777,0.956944,False,expert,Use at least once every few weeks
2,difficulty,4.166667,4.2,0.054978,0.956322,False,expert,Used a few times
3,difficulty,5.2,,,,False,student,Never used
4,difficulty,4.166667,5.138462,2.16787,0.032096,True,student,Use almost every day
5,difficulty,4.375,5.633333,2.097754,0.039667,True,student,Use at least once every few weeks
6,difficulty,5.8,4.9,1.142116,0.25981,False,student,Used a few times


#### Interpreting Our Findings
Our findings:
- Student perception of difficulty in the magnification stage was reduced
- No other statistically significant findings with respect to stage
- Student perception of difficulty was reduced for students who use LLMs almost every day or at least once every few weeks -- this is interesting!

# Time Taken

In [233]:
# Compare 'time_to_complete_sec' between the 'assignment' groups within each (cohort, stage) segment
segment_dimensions = ['cohort', 'stage']
metrics_to_compare = ['time_to_complete_sec']

time_to_complete_comparison_results = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)

In [234]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = time_to_complete_comparison_results.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,stage
0,time_to_complete_sec,2835.48,2930.375,0.24806,0.805183,False,expert,acquisition
1,time_to_complete_sec,1702.75,2026.88,0.916923,0.363868,False,expert,formulation
2,time_to_complete_sec,2378.56,2035.88,1.104617,0.275142,False,expert,ideation
3,time_to_complete_sec,2175.16,2206.04,0.079191,0.937225,False,expert,magnification
4,time_to_complete_sec,1911.12,1583.8,1.204146,0.234532,False,expert,release
5,time_to_complete_sec,2828.36,3555.76,1.782344,0.08252,False,student,acquisition
6,time_to_complete_sec,1888.5,1782.8,0.439677,0.662201,False,student,formulation
7,time_to_complete_sec,2521.64,1798.8,2.651962,0.011054,True,student,ideation
8,time_to_complete_sec,2199.24,2431.56,0.587298,0.559845,False,student,magnification
9,time_to_complete_sec,1401.56,1606.36,0.681323,0.500206,False,student,release


In [248]:
# Compare 'time_to_complete_sec' between the 'assignment' groups within each (cohort, llm_experience) segment
segment_dimensions = ['cohort', 'llm_experience']
metrics_to_compare = ['time_to_complete_sec']

time_comparison_results_exp = compare_performance_across_multiple_segments(
    data,
    metrics_to_compare,
    segment_dimensions=segment_dimensions,
    group_by='assignment',
)

In [249]:
# Shade the 'significant' cell light blue on rows where the flag is truthy
styled_table = time_comparison_results_exp.style.apply(
    lambda flags: ['background-color: lightblue' if flag else '' for flag in flags],
    subset=['significant'],
)
styled_table


Unnamed: 0,Metric,mean_with_llms,mean_without_llms,t_stat,p_value,significant,cohort,llm_experience
0,time_to_complete_sec,2404.169492,2187.623188,0.976159,0.330884,False,expert,Never used
1,time_to_complete_sec,1874.133333,1165.4,1.702154,0.116217,False,expert,Use at least once every few weeks
2,time_to_complete_sec,2151.542857,2197.42,0.18475,0.853878,False,expert,Used a few times
3,time_to_complete_sec,2458.0,,,,False,student,Never used
4,time_to_complete_sec,1949.338983,2196.4,1.10259,0.272461,False,student,Use almost every day
5,time_to_complete_sec,2373.55,2044.866667,1.02017,0.311932,False,student,Use at least once every few weeks
6,time_to_complete_sec,2342.55,2509.0,0.426564,0.671709,False,student,Used a few times


#### Interpreting Our Findings
Our findings:
- Student time to complete in the ideation stage was higher with LLM access than without (unexpected -- worth investigating)
- No other statistically significant findings with respect to stage or LLM experience