In [1]:
import pandas as pd
import numpy as np

# Data setup
judges = ['A', 'B', 'C', 'D', 'E']
criteria = ['Report/MA Cited', 'Deliverable Completion', 'PDAC Trial Impact', 'Funding Probability']

# Scoring data [Judge][Proposal][Criteria]
scores = {
    'A': {
        'A': [9.5, 9.5, 9.0, 8.5], 'B': [9.0, 9.0, 8.5, 9.0], 'C': [7.5, 8.0, 7.5, 7.0],
        'D': [8.5, 9.0, 9.5, 8.0], 'E': [6.0, 6.5, 6.0, 5.5]
    },
    'B': {
        'A': [8.5, 9.5, 8.5, 8.0], 'B': [9.5, 9.0, 9.5, 8.5], 'C': [7.0, 7.5, 7.0, 6.5],
        'D': [8.0, 8.5, 8.0, 7.5], 'E': [5.5, 6.0, 5.5, 5.0]
    },
    'C': {
        'A': [9.5, 10.0, 9.0, 9.5], 'B': [9.0, 9.0, 8.5, 8.0], 'C': [7.0, 8.5, 6.0, 5.5],
        'D': [9.8, 9.5, 10.0, 9.8], 'E': [5.0, 6.0, 5.0, 5.0]
    },
    'D': {
        'A': [9.5, 9.5, 9.0, 8.8], 'B': [8.4, 8.3, 8.0, 7.9], 'C': [7.3, 7.6, 7.1, 6.7],
        'D': [9.0, 9.0, 9.2, 8.4], 'E': [6.1, 6.4, 6.0, 5.6]
    },
    'E': {
        'A': [9.0, 9.5, 9.0, 8.5], 'B': [9.0, 9.0, 9.0, 9.5], 'C': [8.5, 8.5, 8.5, 8.0],
        'D': [8.0, 8.0, 8.0, 9.0], 'E': [7.0, 7.0, 7.0, 7.0]
    }
}

# Calculate self-scoring bias
bias_data = []
for judge in judges:
    self_score = np.mean(scores[judge][judge])
    other_scores = [np.mean(scores[judge][prop]) for prop in judges if prop != judge]
    avg_other_score = np.mean(other_scores)
    bias = self_score - avg_other_score

    bias_data.append({
        'Judge': f'{judge} ({["ChatGPT o3-pro", "Claude Opus 4", "Gemini 2.5 Pro", "ChatGPT o3", "Grok 3 Think"][ord(judge)-ord("A")]})',
        'Self Score': round(self_score, 2),
        'Avg Others Score': round(avg_other_score, 2),
        'Bias Score': round(bias, 2),
        'Bias Type': 'Lenient' if bias > 0.5 else 'Harsh' if bias < -0.5 else 'Neutral'
    })

df_bias = pd.DataFrame(bias_data)
print("1. Self-Scoring Bias Analysis Table")
print("="*50)
print(df_bias.to_string(index=False))
print(f"\nAverage Bias: {df_bias['Bias Score'].mean():.2f}")

1. Self-Scoring Bias Analysis Table
             Judge  Self Score  Avg Others Score  Bias Score Bias Type
A (ChatGPT o3-pro)        9.12              7.78        1.34   Lenient
 B (Claude Opus 4)        9.12              7.28        1.84   Lenient
C (Gemini 2.5 Pro)        6.75              8.29       -1.54     Harsh
    D (ChatGPT o3)        8.90              7.64        1.26   Lenient
  E (Grok 3 Think)        7.00              8.69       -1.69     Harsh

Average Bias: 0.24


In [2]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

# Calculate overall scores for each judge-proposal combination
judge_scores = {}
for judge in judges:
    judge_scores[judge] = []
    for prop in ['A', 'B', 'C', 'D', 'E']:
        judge_scores[judge].append(np.mean(scores[judge][prop]))

# Create correlation matrix
correlation_matrix = np.zeros((5, 5))
p_values = np.zeros((5, 5))

for i, judge1 in enumerate(judges):
    for j, judge2 in enumerate(judges):
        if i != j:
            corr, p_val = pearsonr(judge_scores[judge1], judge_scores[judge2])
            correlation_matrix[i][j] = corr
            p_values[i][j] = p_val
        else:
            correlation_matrix[i][j] = 1.0

# Create DataFrame
judge_names = ["ChatGPT o3-pro", "Claude Opus 4", "Gemini 2.5 Pro", "ChatGPT o3", "Grok 3 Think"]
df_corr = pd.DataFrame(correlation_matrix,
                      index=[f'{judges[i]} ({judge_names[i]})' for i in range(5)],
                      columns=[f'{judges[i]}' for i in range(5)])

print("\n2. Inter-Rater Correlation Matrix")
print("="*50)
print(df_corr.round(3).to_string())
print(f"\nAverage Inter-Rater Correlation: {np.mean(correlation_matrix[correlation_matrix != 1.0]):.3f}")


2. Inter-Rater Correlation Matrix
                        A      B      C      D      E
A (ChatGPT o3-pro)  1.000  0.967  0.961  0.961  0.911
B (Claude Opus 4)   0.967  1.000  0.875  0.864  0.951
C (Gemini 2.5 Pro)  0.961  0.875  1.000  0.988  0.760
D (ChatGPT o3)      0.961  0.864  0.988  1.000  0.784
E (Grok 3 Think)    0.911  0.951  0.760  0.784  1.000

Average Inter-Rater Correlation: 0.902


In [3]:
import pandas as pd
import numpy as np
from scipy.stats import kendalltau

# Calculate rankings for each judge
rankings = {}
for judge in judges:
    judge_totals = [(prop, np.mean(scores[judge][prop])) for prop in ['A', 'B', 'C', 'D', 'E']]
    judge_totals.sort(key=lambda x: x[1], reverse=True)
    rankings[judge] = [prop for prop, _ in judge_totals]

# Calculate Kendall's tau for consensus
consensus_data = []
judge_pairs = [(i, j) for i in range(5) for j in range(i+1, 5)]

for i, j in judge_pairs:
    judge1, judge2 = judges[i], judges[j]
    tau, p_value = kendalltau([rankings[judge1].index(prop) for prop in ['A', 'B', 'C', 'D', 'E']],
                             [rankings[judge2].index(prop) for prop in ['A', 'B', 'C', 'D', 'E']])
    consensus_data.append({
        'Judge Pair': f'{judge1} vs {judge2}',
        'Kendall Tau': round(tau, 3),
        'P-value': round(p_value, 3),
        'Agreement': 'Strong' if abs(tau) > 0.7 else 'Moderate' if abs(tau) > 0.4 else 'Weak'
    })

# Create ranking table
ranking_df = pd.DataFrame(rankings)
ranking_df.index = ['1st', '2nd', '3rd', '4th', '5th']

print("\n3. Ranking Consensus Table")
print("="*50)
print("Rankings by Judge:")
print(ranking_df.to_string())
print("\nPairwise Agreement:")
print(pd.DataFrame(consensus_data).to_string(index=False))


3. Ranking Consensus Table
Rankings by Judge:
     A  B  C  D  E
1st  A  B  D  A  B
2nd  B  A  A  D  A
3rd  D  D  B  B  C
4th  C  C  C  C  D
5th  E  E  E  E  E

Pairwise Agreement:
Judge Pair  Kendall Tau  P-value Agreement
    A vs B          0.8    0.083    Strong
    A vs C          0.6    0.233  Moderate
    A vs D          0.8    0.083    Strong
    A vs E          0.6    0.233  Moderate
    B vs C          0.4    0.483      Weak
    B vs D          0.6    0.233  Moderate
    B vs E          0.8    0.083    Strong
    C vs D          0.8    0.083    Strong
    C vs E          0.2    0.817      Weak
    D vs E          0.4    0.483      Weak


In [4]:
import pandas as pd
import numpy as np

# Calculate severity/leniency metrics
severity_data = []
for judge in judges:
    all_scores = []
    for prop in ['A', 'B', 'C', 'D', 'E']:
        all_scores.extend(scores[judge][prop])

    avg_score = np.mean(all_scores)
    std_score = np.std(all_scores)
    min_score = np.min(all_scores)
    max_score = np.max(all_scores)
    range_score = max_score - min_score

    severity_data.append({
        'Judge': f'{judge} ({judge_names[ord(judge)-ord("A")]})',
        'Average Score': round(avg_score, 2),
        'Std Deviation': round(std_score, 2),
        'Score Range': round(range_score, 2),
        'Min Score': round(min_score, 1),
        'Max Score': round(max_score, 1),
        'Tendency': 'Lenient' if avg_score > 8.0 else 'Harsh' if avg_score < 7.5 else 'Moderate'
    })

df_severity = pd.DataFrame(severity_data)
print("\n4. Judge Severity/Leniency Index")
print("="*50)
print(df_severity.to_string(index=False))


4. Judge Severity/Leniency Index
             Judge  Average Score  Std Deviation  Score Range  Min Score  Max Score Tendency
A (ChatGPT o3-pro)           8.05           1.23          4.0        5.5        9.5  Lenient
 B (Claude Opus 4)           7.65           1.35          4.5        5.0        9.5 Moderate
C (Gemini 2.5 Pro)           7.98           1.83          5.0        5.0       10.0 Moderate
    D (ChatGPT o3)           7.89           1.20          3.9        5.6        9.5 Moderate
  E (Grok 3 Think)           8.35           0.81          2.5        7.0        9.5  Lenient


In [5]:
import pandas as pd
import numpy as np

# Calculate discrimination power for each criterion
discrimination_data = []

for c_idx, criterion in enumerate(criteria):
    criterion_scores = []
    for judge in judges:
        for prop in ['A', 'B', 'C', 'D', 'E']:
            criterion_scores.append(scores[judge][prop][c_idx])

    variance = np.var(criterion_scores)
    std_dev = np.std(criterion_scores)
    score_range = max(criterion_scores) - min(criterion_scores)
    mean_score = np.mean(criterion_scores)

    discrimination_data.append({
        'Criterion': criterion,
        'Mean Score': round(mean_score, 2),
        'Std Deviation': round(std_dev, 2),
        'Variance': round(variance, 2),
        'Score Range': round(score_range, 2),
        'Discrimination Power': 'High' if std_dev > 1.2 else 'Medium' if std_dev > 0.8 else 'Low'
    })

df_discrimination = pd.DataFrame(discrimination_data)
print("\n5. Criteria Discrimination Power Table")
print("="*50)
print(df_discrimination.to_string(index=False))


5. Criteria Discrimination Power Table
             Criterion  Mean Score  Std Deviation  Variance  Score Range Discrimination Power
       Report/MA Cited        8.04           1.34      1.79          4.8                 High
Deliverable Completion        8.33           1.16      1.34          4.0               Medium
     PDAC Trial Impact        7.93           1.36      1.85          5.0                 High
   Funding Probability        7.63           1.42      2.01          4.8                 High


In [6]:
import pandas as pd
import numpy as np

# Calculate summary statistics for each proposal
summary_data = []

for prop in ['A', 'B', 'C', 'D', 'E']:
    all_scores = []
    criterion_averages = []

    for c_idx in range(4):
        criterion_scores = [scores[judge][prop][c_idx] for judge in judges]
        criterion_averages.append(np.mean(criterion_scores))
        all_scores.extend(criterion_scores)

    overall_avg = np.mean(all_scores)
    overall_std = np.std(all_scores)

    summary_data.append({
        'Proposal': f'{prop} ({["ChatGPT o3-pro", "Claude Opus 4", "Gemini 2.5 Pro", "ChatGPT o3", "Grok 3 Think"][ord(prop)-ord("A")]})',
        'Overall Average': round(overall_avg, 2),
        'Overall Std Dev': round(overall_std, 2),
        'Report/MA Cited': round(criterion_averages[0], 2),
        'Deliverable Completion': round(criterion_averages[1], 2),
        'PDAC Trial Impact': round(criterion_averages[2], 2),
        'Funding Probability': round(criterion_averages[3], 2),
        'Best Criterion': criteria[np.argmax(criterion_averages)],
        'Weakest Criterion': criteria[np.argmin(criterion_averages)]
    })

df_summary = pd.DataFrame(summary_data)
print("\n6. Proposal Performance Summary Matrix")
print("="*50)
print(df_summary.to_string(index=False))


6. Proposal Performance Summary Matrix
          Proposal  Overall Average  Overall Std Dev  Report/MA Cited  Deliverable Completion  PDAC Trial Impact  Funding Probability         Best Criterion   Weakest Criterion
A (ChatGPT o3-pro)             9.09             0.49             9.20                    9.60               8.90                 8.66 Deliverable Completion Funding Probability
 B (Claude Opus 4)             8.78             0.48             8.98                    8.86               8.70                 8.58        Report/MA Cited Funding Probability
C (Gemini 2.5 Pro)             7.36             0.81             7.46                    8.02               7.22                 6.74 Deliverable Completion Funding Probability
    D (ChatGPT o3)             8.74             0.73             8.66                    8.80               8.94                 8.54      PDAC Trial Impact Funding Probability
  E (Grok 3 Think)             5.96             0.68             5.92      

In [7]:
import pandas as pd
import numpy as np
from itertools import combinations

# Calculate pairwise comparisons
comparison_data = []
proposals = ['A', 'B', 'C', 'D', 'E']

for prop1, prop2 in combinations(proposals, 2):
    wins_prop1 = 0
    wins_prop2 = 0
    ties = 0

    for judge in judges:
        score1 = np.mean(scores[judge][prop1])
        score2 = np.mean(scores[judge][prop2])

        if score1 > score2:
            wins_prop1 += 1
        elif score2 > score1:
            wins_prop2 += 1
        else:
            ties += 1

    # Calculate average score difference
    avg_diff = np.mean([np.mean(scores[judge][prop1]) - np.mean(scores[judge][prop2]) for judge in judges])

    comparison_data.append({
        'Comparison': f'{prop1} vs {prop2}',
        f'{prop1} Wins': wins_prop1,
        f'{prop2} Wins': wins_prop2,
        'Ties': ties,
        'Winner': prop1 if wins_prop1 > wins_prop2 else prop2 if wins_prop2 > wins_prop1 else 'Tie',
        'Avg Score Difference': round(abs(avg_diff), 2),
        'Consensus': 'Strong' if max(wins_prop1, wins_prop2) >= 4 else 'Moderate' if max(wins_prop1, wins_prop2) >= 3 else 'Weak'
    })

df_pairwise = pd.DataFrame(comparison_data)
print("\n7. Pairwise Proposal Comparison Table")
print("="*50)
print(df_pairwise.to_string(index=False))


7. Pairwise Proposal Comparison Table
Comparison  A Wins  B Wins  Ties Winner  Avg Score Difference Consensus  C Wins  D Wins  E Wins
    A vs B     3.0     2.0     0      A                  0.31  Moderate     NaN     NaN     NaN
    A vs C     5.0     NaN     0      A                  1.73    Strong     0.0     NaN     NaN
    A vs D     4.0     NaN     0      A                  0.35    Strong     NaN     1.0     NaN
    A vs E     5.0     NaN     0      A                  3.14    Strong     NaN     NaN     0.0
    B vs C     NaN     5.0     0      B                  1.42    Strong     0.0     NaN     NaN
    B vs D     NaN     3.0     0      B                  0.04  Moderate     NaN     2.0     NaN
    B vs E     NaN     5.0     0      B                  2.82    Strong     NaN     NaN     0.0
    C vs D     NaN     NaN     0      D                  1.38    Strong     1.0     4.0     NaN
    C vs E     NaN     NaN     0      C                  1.40    Strong     5.0     NaN     0.0
 

In [8]:
import pandas as pd
import numpy as np

# Collect all scores and create frequency distribution
all_scores = []
for judge in judges:
    for prop in ['A', 'B', 'C', 'D', 'E']:
        all_scores.extend(scores[judge][prop])

# Create score bins
score_bins = [(5.0, 6.0), (6.0, 7.0), (7.0, 8.0), (8.0, 9.0), (9.0, 10.0), (10.0, 10.1)]
bin_labels = ['5.0-5.9', '6.0-6.9', '7.0-7.9', '8.0-8.9', '9.0-9.9', '10.0']

distribution_data = []
for i, (lower, upper) in enumerate(score_bins):
    count = sum(1 for score in all_scores if lower <= score < upper)
    percentage = (count / len(all_scores)) * 100

    distribution_data.append({
        'Score Range': bin_labels[i],
        'Frequency': count,
        'Percentage': round(percentage, 1),
        'Cumulative %': round(sum(d['Percentage'] for d in distribution_data) + percentage, 1)
    })

df_distribution = pd.DataFrame(distribution_data)
print("\n8. Score Distribution Frequency Table")
print("="*50)
print(df_distribution.to_string(index=False))
print(f"\nTotal Scores: {len(all_scores)}")
print(f"Mean Score: {np.mean(all_scores):.2f}")
print(f"Median Score: {np.median(all_scores):.2f}")
print(f"Standard Deviation: {np.std(all_scores):.2f}")


8. Score Distribution Frequency Table
Score Range  Frequency  Percentage  Cumulative %
    5.0-5.9          9         9.0           9.0
    6.0-6.9         11        11.0          20.0
    7.0-7.9         16        16.0          36.0
    8.0-8.9         28        28.0          64.0
    9.0-9.9         34        34.0          98.0
       10.0          2         2.0         100.0

Total Scores: 100
Mean Score: 7.98
Median Score: 8.45
Standard Deviation: 1.35


In [9]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr

# Prepare criteria scores
criteria_scores = {criterion: [] for criterion in criteria}

for judge in judges:
    for prop in ['A', 'B', 'C', 'D', 'E']:
        for c_idx, criterion in enumerate(criteria):
            criteria_scores[criterion].append(scores[judge][prop][c_idx])

# Calculate correlation matrix between criteria
correlation_data = []
for i, crit1 in enumerate(criteria):
    for j, crit2 in enumerate(criteria):
        if i <= j:  # Only upper triangle and diagonal
            if i == j:
                correlation_data.append({
                    'Criterion 1': crit1,
                    'Criterion 2': crit2,
                    'Correlation': 1.000,
                    'P-value': 0.000,
                    'Relationship': 'Perfect'
                })
            else:
                corr, p_val = pearsonr(criteria_scores[crit1], criteria_scores[crit2])
                relationship = 'Strong' if abs(corr) > 0.7 else 'Moderate' if abs(corr) > 0.4 else 'Weak'

                correlation_data.append({
                    'Criterion 1': crit1,
                    'Criterion 2': crit2,
                    'Correlation': round(corr, 3),
                    'P-value': round(p_val, 3),
                    'Relationship': relationship
                })

df_cross_corr = pd.DataFrame(correlation_data)
print("\n9. Cross-Criteria Correlation Analysis")
print("="*50)
print(df_cross_corr.to_string(index=False))


9. Cross-Criteria Correlation Analysis
           Criterion 1            Criterion 2  Correlation  P-value Relationship
       Report/MA Cited        Report/MA Cited        1.000      0.0      Perfect
       Report/MA Cited Deliverable Completion        0.950      0.0       Strong
       Report/MA Cited      PDAC Trial Impact        0.965      0.0       Strong
       Report/MA Cited    Funding Probability        0.937      0.0       Strong
Deliverable Completion Deliverable Completion        1.000      0.0      Perfect
Deliverable Completion      PDAC Trial Impact        0.898      0.0       Strong
Deliverable Completion    Funding Probability        0.857      0.0       Strong
     PDAC Trial Impact      PDAC Trial Impact        1.000      0.0      Perfect
     PDAC Trial Impact    Funding Probability        0.934      0.0       Strong
   Funding Probability    Funding Probability        1.000      0.0      Perfect


In [10]:
import pandas as pd
import numpy as np
from scipy.stats import f_oneway, ttest_ind

# ANOVA test for proposal differences
proposal_scores = {}
for prop in ['A', 'B', 'C', 'D', 'E']:
    proposal_scores[prop] = []
    for judge in judges:
        proposal_scores[prop].append(np.mean(scores[judge][prop]))

# One-way ANOVA
f_stat, p_value_anova = f_oneway(*[proposal_scores[prop] for prop in ['A', 'B', 'C', 'D', 'E']])

# Pairwise t-tests between top proposals
significance_data = []

# ANOVA result
significance_data.append({
    'Test': 'One-way ANOVA (All Proposals)',
    'Test Statistic': round(f_stat, 3),
    'P-value': round(p_value_anova, 4),
    'Significant': 'Yes' if p_value_anova < 0.05 else 'No',
    'Interpretation': 'Significant differences exist between proposals' if p_value_anova < 0.05 else 'No significant differences'
})

# Top proposals comparison
top_proposals = ['A', 'B', 'D']  # Based on general performance
for i, prop1 in enumerate(top_proposals):
    for prop2 in top_proposals[i+1:]:
        t_stat, p_val = ttest_ind(proposal_scores[prop1], proposal_scores[prop2])
        significance_data.append({
            'Test': f'{prop1} vs {prop2} (t-test)',
            'Test Statistic': round(t_stat, 3),
            'P-value': round(p_val, 4),
            'Significant': 'Yes' if p_val < 0.05 else 'No',
            'Interpretation': f'{"Significant" if p_val < 0.05 else "No significant"} difference between {prop1} and {prop2}'
        })

df_significance = pd.DataFrame(significance_data)
print("\n10. Statistical Significance Testing Table")
print("="*50)
print(df_significance.to_string(index=False))


10. Statistical Significance Testing Table
                         Test  Test Statistic  P-value Significant                                  Interpretation
One-way ANOVA (All Proposals)          27.285   0.0000         Yes Significant differences exist between proposals
              A vs B (t-test)           1.338   0.2177          No       No significant difference between A and B
              A vs D (t-test)           1.049   0.3248          No       No significant difference between A and D
              B vs D (t-test)           0.126   0.9029          No       No significant difference between B and D


In [11]:
import pandas as pd
import numpy as np

# Calculate averages for each proposal across all judges for each criterion
average_data = []

for prop in ['A', 'B', 'C', 'D', 'E']:
    row_data = {'Proposal': f'{prop} ({judge_names[ord(prop)-ord("A")]})'}

    for c_idx, criterion in enumerate(criteria):
        criterion_scores = [scores[judge][prop][c_idx] for judge in judges]
        avg_score = np.mean(criterion_scores)
        row_data[criterion] = round(avg_score, 2)

    # Calculate overall average
    all_scores = []
    for judge in judges:
        all_scores.extend(scores[judge][prop])
    row_data['Overall Average'] = round(np.mean(all_scores), 2)

    average_data.append(row_data)

df_averages = pd.DataFrame(average_data)
print("\n11. Average Scores Table (All Judges)")
print("="*50)
print(df_averages.to_string(index=False))

# Add ranking based on overall average
df_averages_sorted = df_averages.sort_values('Overall Average', ascending=False)
df_averages_sorted['Rank'] = range(1, len(df_averages_sorted) + 1)
print("\n\nRanked by Overall Average:")
print(df_averages_sorted[['Rank', 'Proposal', 'Overall Average']].to_string(index=False))


11. Average Scores Table (All Judges)
          Proposal  Report/MA Cited  Deliverable Completion  PDAC Trial Impact  Funding Probability  Overall Average
A (ChatGPT o3-pro)             9.20                    9.60               8.90                 8.66             9.09
 B (Claude Opus 4)             8.98                    8.86               8.70                 8.58             8.78
C (Gemini 2.5 Pro)             7.46                    8.02               7.22                 6.74             7.36
    D (ChatGPT o3)             8.66                    8.80               8.94                 8.54             8.74
  E (Grok 3 Think)             5.92                    6.38               5.90                 5.62             5.96


Ranked by Overall Average:
 Rank           Proposal  Overall Average
    1 A (ChatGPT o3-pro)             9.09
    2  B (Claude Opus 4)             8.78
    3     D (ChatGPT o3)             8.74
    4 C (Gemini 2.5 Pro)             7.36
    5   E (Grok 3 Th