# DiaGuide LLM Analysis

## Comparative Evaluation of Large Language Models and Healthcare Professionals in Diabetes Guidance

This notebook reproduces the results and figures from the study comparing GPT-4o responses with healthcare professional responses to diabetes-related questions.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from scipy.stats import chi2_contingency, mannwhitneyu, kruskal
import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting defaults: whitegrid theme with a colour-blind-safe palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('colorblind')

# Figure size and font sizes shared by every figure below
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 12
plt.rcParams['axes.titlesize'] = 14
plt.rcParams['axes.labelsize'] = 12


## 1. Load the dataset


In [3]:
# Read the ratings export into the notebook-wide frame `df`.
# The path is relative, so the CSV must sit in the notebook's working directory.
df = pd.read_csv('ratings_v310125.csv')
print(f"Dataset shape: {df.shape}")

# Summarise column names, non-null counts and dtypes
df.info()


Dataset shape: (1810, 22)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1810 entries, 0 to 1809
Data columns (total 22 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   id                            1810 non-null   int64  
 1   session_id                    1810 non-null   float64
 2   answer_id                     1810 non-null   int64  
 3   question_id                   1810 non-null   int64  
 4   knowledge                     1810 non-null   int64  
 5   helpfulness                   1810 non-null   int64  
 6   empathy                       1810 non-null   int64  
 7   response_time                 1810 non-null   int64  
 8   created_at                    1810 non-null   object 
 9   user_id                       1810 non-null   int64  
 10  age_group                     1810 non-null   object 
 11  gender                        1810 non-null   object 
 12  has_diabetes                  1810 n

## 2. Data Preprocessing and Cleaning


In [4]:
# Report only the columns that actually contain nulls
missing_values = df.isna().sum()
print("\nMissing values per column:")
print(missing_values.loc[missing_values.gt(0)])

# Cast the two participant flags to real booleans (mutates `df` in place).
# NOTE(review): astype(bool) maps every non-empty string to True — this assumes the
# flags are stored as 0/1 or True/False in the CSV; confirm against the raw file.
for_hcp_flag = df['is_healthcare_personnel']
df['is_healthcare_personnel'] = for_hcp_flag.astype(bool)
del for_hcp_flag
df['previous_participation'] = df['previous_participation'].astype(bool)

# Summary statistics of the numeric columns, shown as the cell's output
print("\nBasic statistics:")
df.describe()



Missing values per column:
healthcare_professional_type    1287
end_time                         357
dtype: int64

Basic statistics:


Unnamed: 0,id,session_id,answer_id,question_id,knowledge,helpfulness,empathy,response_time,user_id
count,1810.0,1810.0,1810.0,1810.0,1810.0,1810.0,1810.0,1810.0,1810.0
mean,910.18674,211.045304,116.976796,58.742541,3.785635,3.702762,3.576796,69588.76,211.045304
std,527.364021,116.756569,66.261643,33.122201,1.019117,1.020543,1.07173,321618.1,116.756569
min,1.0,2.0,1.0,1.0,1.0,1.0,1.0,4106.0,2.0
25%,453.25,114.0,61.0,31.0,3.0,3.0,3.0,24912.25,114.0
50%,910.5,212.0,132.0,66.0,4.0,4.0,4.0,40443.5,212.0
75%,1366.75,302.0,174.0,87.0,5.0,4.0,4.0,64631.75,302.0
max,1855.0,451.0,228.0,114.0,5.0,5.0,5.0,7953561.0,451.0


## 3. Exploratory Data Analysis

### 3.1 Dataset Overview


In [5]:
# Distinct raters, questions and answers present in the ratings table
unique_participants, unique_questions, unique_answers = (
    df[column].nunique() for column in ('user_id', 'question_id', 'answer_id')
)

print(f"\nUnique participants: {unique_participants}")
print(f"Unique questions: {unique_questions}")
print(f"Unique answers: {unique_answers}")

# Lower-case the source labels so 'LLM' / 'llm' etc. are counted together,
# then count how many ratings were given to each kind of answer
df['source'] = df['source'].str.lower()
llm_responses = int(df['source'].eq('llm').sum())
human_responses = int(df['source'].eq('human').sum())

print(f"\nLLM responses: {llm_responses}")
print(f"Human responses: {human_responses}")
print(f"Total ratings: {df.shape[0]}")

# Number of ratings each participant contributed, and its centre
ratings_per_user = df.groupby('user_id').size()
avg_ratings_per_user, median_ratings_per_user = (
    ratings_per_user.mean(),
    ratings_per_user.median(),
)

print(f"\nAverage ratings per user: {avg_ratings_per_user:.1f}")
print(f"Median ratings per user: {median_ratings_per_user}")



Unique participants: 273
Unique questions: 113
Unique answers: 203

LLM responses: 890
Human responses: 920
Total ratings: 1810

Average ratings per user: 6.6
Median ratings per user: 10.0


### 3.2 Participant Demographics


In [6]:
def generate_demographic_table(data=None):
    """Summarise participants and ratings per demographic subgroup.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Ratings table with one row per rating. It must contain the columns
        ``user_id``, ``has_diabetes`` (``'yes'`` / ``'no'``),
        ``is_healthcare_personnel`` (bool or 0/1), ``age_group``, ``gender``,
        ``education_level`` and ``healthcare_professional_type``.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        One row per subgroup with the columns ``Characteristic``, ``Subgroup``,
        ``Frequency (N)`` (number of distinct ``user_id`` values) and
        ``Ratings`` (number of rows). Rows appear in this order: the four
        diabetes/healthcare combinations, then age group, gender, education
        level and healthcare professional type (each sorted by subgroup label).
    """
    if data is None:
        data = df  # fall back to the notebook-wide ratings frame

    def _summarise(characteristic, subgroup, subset):
        # One output row: distinct raters and number of ratings in `subset`
        return {
            'Characteristic': characteristic,
            'Subgroup': subgroup,
            'Frequency (N)': subset['user_id'].nunique(),
            'Ratings': subset.shape[0],
        }

    # Explicit comparisons give a boolean mask whether the flag column is stored
    # as bool or as 0/1 integers, so the function does not depend on an earlier
    # cell having cast the column.
    is_hcp = data['is_healthcare_personnel'].eq(True)
    not_hcp = data['is_healthcare_personnel'].eq(False)
    diabetes = data['has_diabetes'] == 'yes'
    no_diabetes = data['has_diabetes'] == 'no'

    combined_groups = {
        'Diabetes and Healthcare Professional': data[diabetes & is_hcp],
        'Only Diabetes': data[diabetes & not_hcp],
        'Only Healthcare Professional': data[no_diabetes & is_hcp],
        'Neither': data[no_diabetes & not_hcp],
    }
    records = [
        _summarise('Participant and Groups', group_name, group_df)
        for group_name, group_df in combined_groups.items()
    ]

    # (label, frame to split, column to split on); the professional type is only
    # tabulated for ratings given by healthcare personnel.
    grouped_sections = [
        ('Age Group', data, 'age_group'),
        ('Gender', data, 'gender'),
        ('Education Level', data, 'education_level'),
        ('Healthcare Professional Type', data[is_hcp], 'healthcare_professional_type'),
    ]
    for characteristic, frame, column in grouped_sections:
        # groupby drops missing keys by default, so NaN subgroups never appear
        for subgroup, subset in frame.groupby(column):
            records.append(_summarise(characteristic, subgroup, subset))

    return pd.DataFrame(records)

# Build the demographic summary and show it as this cell's rich output
demographic_table = generate_demographic_table()
demographic_table


Unnamed: 0,Characteristic,Subgroup,Frequency (N),Ratings
0,Participant and Groups,Diabetes and Healthcare Professional,48,322
1,Participant and Groups,Only Diabetes,175,1181
2,Participant and Groups,Only Healthcare Professional,33,211
3,Participant and Groups,Neither,17,96
4,Age Group,0-19,4,14
5,Age Group,20-29,41,238
6,Age Group,30-39,44,264
7,Age Group,40-49,74,539
8,Age Group,50-59,70,510
9,Age Group,60+,40,245


### 3.3 Descriptive Statistics


In [7]:
def descriptive_stats_by_source(data=None):
    """Tabulate rating statistics per metric and answer source.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Ratings table with a ``source`` column holding ``'llm'`` / ``'human'``
        and the numeric columns ``knowledge``, ``helpfulness`` and ``empathy``.
        Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame
        Six rows (three metrics x two sources, LLM before HUMAN) with the
        columns ``Metric``, ``Source``, ``n``, ``Mean (SD)``, ``95% CI`` and
        ``Median``. ``Mean (SD)`` and ``95% CI`` are strings formatted to two
        decimals; the interval is ``mean +/- 1.96 * SD / sqrt(n)``.
    """
    if data is None:
        data = df  # fall back to the notebook-wide ratings frame

    metrics = ['knowledge', 'helpfulness', 'empathy']
    z_95 = 1.96  # normal-approximation critical value for a two-sided 95% interval

    records = []
    for metric in metrics:
        for source in ['llm', 'human']:
            values = data.loc[data['source'] == source, metric]

            # Compute each statistic once and reuse it for both formatted columns
            n = len(values)
            mean = values.mean()
            sd = values.std()
            half_width = z_95 * (sd / np.sqrt(n))

            records.append({
                'Metric': metric.capitalize(),
                'Source': source.upper(),
                'n': n,
                'Mean (SD)': f"{mean:.2f} ({sd:.2f})",
                '95% CI': f"[{mean - half_width:.2f}, {mean + half_width:.2f}]",
                'Median': values.median(),
            })

    # Build the final table directly instead of adding a column to a slice of an
    # intermediate frame, which triggers pandas' chained-assignment warning.
    return pd.DataFrame(
        records,
        columns=['Metric', 'Source', 'n', 'Mean (SD)', '95% CI', 'Median'],
    )

# Build the per-source descriptive statistics and show them as this cell's rich output
desc_stats_table = descriptive_stats_by_source()
desc_stats_table


Unnamed: 0,Metric,Source,n,Mean (SD),95% CI,Median
0,Knowledge,LLM,890,3.88 (0.99),"[3.82, 3.95]",4.0
1,Knowledge,HUMAN,920,3.69 (1.04),"[3.63, 3.76]",4.0
2,Helpfulness,LLM,890,3.82 (0.97),"[3.76, 3.89]",4.0
3,Helpfulness,HUMAN,920,3.59 (1.06),"[3.52, 3.66]",4.0
4,Empathy,LLM,890,3.70 (1.04),"[3.63, 3.77]",4.0
5,Empathy,HUMAN,920,3.46 (1.09),"[3.39, 3.53]",4.0
