In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
import pandas as pd
import os
import torch
import numpy as np
import scipy.stats as st
import statsmodels.api as sm 
import math

# Initial annotations frame (note: `df` is re-bound by a later cell).
df = pd.read_csv('data/mathwell_annotations_final.csv')
model_path = "meta-llama/Llama-2-70b-hf"   # Model identifier passed to AutoTokenizer.from_pretrained

env_file_path = 'data/env.txt'

# Read KEY=VALUE pairs from the env file and export them as environment variables.
with open(env_file_path, 'r') as file:
    for line in file:
        line = line.strip()
        # Blank lines and '#' comment lines carry no assignment; skip them
        # instead of failing on the tuple unpack below.
        if not line or line.startswith('#'):
            continue
        # Split on the first '=' only, so values that themselves contain '='
        # are kept intact rather than breaking the unpack.
        key, value = line.split('=', 1)
        # Tolerate "key = value" formatting by trimming surrounding whitespace.
        os.environ[key.strip()] = value.strip()
# The access token is expected under the 'huggingface_token' key of the env file.
token = os.environ['huggingface_token']
tokenizer = AutoTokenizer.from_pretrained(model_path, token=token)

In [16]:
# Load the evaluation annotations. This re-binds `df`, replacing the frame that
# the first cell read from 'data/mathwell_annotations_final.csv'.
df = pd.read_csv('data/evaluation_annotations.csv')
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,solvability,accuracy,appropriateness,topic,addition,subtraction,multiplication,division,fractions,decimals,...,solution,answer,gpt35,gpt4,llama,llema,mammoth,mathwell,fkgl,ndc
0,1,1.0,1.0,1,1.0,1.0,0.0,0.0,0.0,0.0,...,def solution():\n #Steph Curry has 3000 poi...,0,0,0,1,0,0,0,0.8,8.73
1,1,1.0,1.0,1,0.0,0.0,1.0,0.0,0.0,0.0,...,def solution():\n #Number of players\n p...,30,0,0,0,0,1,0,1.0,8.10
2,1,0.0,1.0,1,0.0,0.0,0.0,1.0,0.0,1.0,...,def solution():\n #A commercial airplane ta...,0.6,0,0,1,0,0,0,4.7,8.84
3,0,,,1,,,,,,,...,def solution():\n #Raphael is 6 inches tall...,30,0,0,0,1,0,0,5.2,7.77
4,1,1.0,1.0,1,0.0,1.0,0.0,0.0,0.0,0.0,...,def solution():\n #Spiderman needs to load ...,180,0,0,0,1,0,0,5.0,8.67
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0,,,1,,,,,,,...,def solution():\n # Ash starts with 50 Poké...,44,0,1,0,0,0,0,3.8,9.06
1496,0,,,1,,,,,,,...,def solution():\n # Jenny has 12 cats\n ...,4,0,1,0,0,0,0,4.5,7.14
1497,0,,,1,,,,,,,...,def solution():\n # Max finds 3 sticks ever...,5,0,1,0,0,0,0,3.7,9.66
1498,0,,,1,,,,,,,...,def solution():\n # Each pyramid requires 3...,4,0,1,0,0,0,0,6.5,9.49


In [17]:
def check_mathwell(df, colname):
    """Return the tokenized length of every entry in ``df[colname]``.

    Each string value is encoded with the module-level ``tokenizer`` and the
    size of the second axis of the encoded result is recorded as its length.

    Args:
        df: DataFrame containing the text column.
        colname: Label of the column whose entries are measured.

    Returns:
        np.ndarray with one entry per row, in row order. Entries that are not
        strings (e.g. NaN / None), or that the tokenizer rejects with a
        TypeError / ValueError, are recorded as ``np.nan`` (which makes the
        array float-typed); otherwise the entries are the integer lengths.
    """
    question_lengths = []
    # Iterate the column directly; building a full row with df.iloc[i] for
    # every element is unnecessary work.
    for output in df[colname]:
        # Default to "missing" so a failed encode can never reuse the length
        # computed for a previous row.
        length = np.nan
        if isinstance(output, str):
            try:
                inputs = tokenizer.encode(output, return_tensors="pt")
                length = inputs.shape[1]
            except (TypeError, ValueError):
                # Best effort: keep going, but mark this row as missing.
                length = np.nan
        question_lengths.append(length)
    return np.array(question_lengths)

In [18]:
# Add one length column per text column, using check_mathwell for the counts.
for text_col, length_col in (('question', 'question_length'),
                             ('solution', 'solution_length')):
    df[length_col] = check_mathwell(df, text_col)

In [19]:
# Persist the frame (now including the length columns) without the index.
# NOTE(review): this writes to the same path the frame was loaded from, so the
# input file is overwritten in place.
df.to_csv('data/evaluation_annotations.csv', index = False)
# Reload from disk so `df` reflects exactly what was written.
df = pd.read_csv('data/evaluation_annotations.csv')
# Bare last expression: renders the frame as the cell output.
df

Unnamed: 0,solvability,accuracy,appropriateness,topic,addition,subtraction,multiplication,division,fractions,decimals,...,gpt35,gpt4,llama,llema,mammoth,mathwell,fkgl,ndc,question_length,solution_length
0,1,1.0,1.0,1,1.0,1.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0.8,8.73,62,130
1,1,1.0,1.0,1,0.0,0.0,1.0,0.0,0.0,0.0,...,0,0,0,0,1,0,1.0,8.10,27,57
2,1,0.0,1.0,1,0.0,0.0,0.0,1.0,0.0,1.0,...,0,0,1,0,0,0,4.7,8.84,54,95
3,0,,,1,,,,,,,...,0,0,0,1,0,0,5.2,7.77,80,139
4,1,1.0,1.0,1,0.0,1.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,5.0,8.67,72,107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1495,0,,,1,,,,,,,...,0,1,0,0,0,0,3.8,9.06,83,223
1496,0,,,1,,,,,,,...,0,1,0,0,0,0,4.5,7.14,65,145
1497,0,,,1,,,,,,,...,0,1,0,0,0,0,3.7,9.66,60,116
1498,0,,,1,,,,,,,...,0,1,0,0,0,0,6.5,9.49,67,172


In [23]:
# Show every column label as a plain Python list.
list(df.columns)

['solvability',
 'accuracy',
 'appropriateness',
 'topic',
 'addition',
 'subtraction',
 'multiplication',
 'division',
 'fractions',
 'decimals',
 'no_ops',
 'total_ops',
 'good',
 'question',
 'solution',
 'answer',
 'gpt35',
 'gpt4',
 'llama',
 'llema',
 'mammoth',
 'mathwell',
 'fkgl',
 'ndc',
 'question_length',
 'solution_length']

In [47]:
def recode(df):
    """Return the recoded ``total_ops`` value for a single record.

    Despite its name, ``df`` is one record (it is indexed by the keys
    ``'no_ops'`` and ``'total_ops'``), not a whole frame.

    Args:
        df: Mapping-like record exposing ``'no_ops'`` and ``'total_ops'``.

    Returns:
        0 when ``df['no_ops']`` equals 1, otherwise ``df['total_ops']`` unchanged.
    """
    return 0 if df['no_ops'] == 1 else df['total_ops']
# Replace total_ops with recode's result, evaluated once per row.
df['total_ops'] = df.apply(func=recode, axis='columns')

In [53]:
# Logit of accuracy on the text metrics, the per-operation flags and the
# model-source flags, fitted on rows with solvability == 1.
# 'mathwell' is not among the source flags (presumably the reference category).
predictors = [
    'fkgl', 'ndc', 'question_length', 'solution_length',
    'addition', 'subtraction', 'multiplication', 'division', 'fractions', 'decimals',
    'llema', 'llama', 'mammoth', 'gpt35', 'gpt4',
]
reg_df = df.loc[df['solvability'] == 1]
x = sm.add_constant(reg_df[predictors])  # prepend the intercept column
y = reg_df['accuracy']
log_reg = sm.Logit(y, x).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.264355
         Iterations 7


0,1,2,3
Dep. Variable:,accuracy,No. Observations:,1229.0
Model:,Logit,Df Residuals:,1213.0
Method:,MLE,Df Model:,15.0
Date:,"Fri, 24 May 2024",Pseudo R-squ.:,0.1592
Time:,10:01:47,Log-Likelihood:,-324.89
converged:,True,LL-Null:,-386.4
Covariance Type:,nonrobust,LLR p-value:,4.893e-19

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,4.1526,0.889,4.669,0.000,2.409,5.896
fkgl,-0.0017,0.057,-0.030,0.976,-0.113,0.110
ndc,0.0233,0.086,0.272,0.786,-0.145,0.191
question_length,-0.0042,0.005,-0.905,0.365,-0.013,0.005
solution_length,-0.0103,0.003,-4.006,0.000,-0.015,-0.005
addition,0.4493,0.249,1.803,0.071,-0.039,0.938
subtraction,0.4759,0.265,1.797,0.072,-0.043,0.995
multiplication,0.1092,0.253,0.431,0.666,-0.387,0.605
division,-0.2423,0.282,-0.858,0.391,-0.796,0.311


In [58]:
# Logit of accuracy on the text metrics and the per-operation flags only
# (no model-source flags), fitted on rows with solvability == 1.
predictors = [
    'fkgl', 'ndc', 'question_length', 'solution_length',
    'addition', 'subtraction', 'multiplication', 'division', 'fractions', 'decimals',
]
reg_df = df.loc[df['solvability'] == 1]
x = sm.add_constant(reg_df[predictors])  # prepend the intercept column
y = reg_df['accuracy']
log_reg = sm.Logit(y, x).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.301228
         Iterations 7


0,1,2,3
Dep. Variable:,accuracy,No. Observations:,1229.0
Model:,Logit,Df Residuals:,1218.0
Method:,MLE,Df Model:,10.0
Date:,"Fri, 24 May 2024",Pseudo R-squ.:,0.04191
Time:,10:07:59,Log-Likelihood:,-370.21
converged:,True,LL-Null:,-386.4
Covariance Type:,nonrobust,LLR p-value:,0.000345

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.5607,0.758,3.379,0.001,1.075,4.046
fkgl,0.0493,0.051,0.958,0.338,-0.052,0.150
ndc,-0.0275,0.081,-0.339,0.734,-0.186,0.131
question_length,-0.0065,0.004,-1.493,0.135,-0.015,0.002
solution_length,-0.0050,0.002,-2.343,0.019,-0.009,-0.001
addition,0.6062,0.236,2.569,0.010,0.144,1.069
subtraction,0.8166,0.253,3.230,0.001,0.321,1.312
multiplication,0.5294,0.240,2.206,0.027,0.059,1.000
division,0.1781,0.269,0.663,0.507,-0.349,0.705


In [50]:
# Logit of accuracy on the text metrics plus the recoded total_ops count,
# fitted on rows with solvability == 1.
predictors = ['fkgl', 'ndc', 'question_length', 'solution_length', 'total_ops']
reg_df = df.loc[df['solvability'] == 1]
x = sm.add_constant(reg_df[predictors])  # prepend the intercept column
y = reg_df['accuracy']
log_reg = sm.Logit(y, x).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.312156
         Iterations 6


0,1,2,3
Dep. Variable:,accuracy,No. Observations:,1229.0
Model:,Logit,Df Residuals:,1223.0
Method:,MLE,Df Model:,5.0
Date:,"Fri, 24 May 2024",Pseudo R-squ.:,0.007152
Time:,09:56:40,Log-Likelihood:,-383.64
converged:,True,LL-Null:,-386.4
Covariance Type:,nonrobust,LLR p-value:,0.355

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,2.8487,0.729,3.909,0.000,1.420,4.277
fkgl,0.0270,0.049,0.550,0.582,-0.069,0.123
ndc,-0.0512,0.077,-0.664,0.507,-0.202,0.100
question_length,-0.0049,0.004,-1.151,0.250,-0.013,0.003
solution_length,-0.0027,0.002,-1.226,0.220,-0.007,0.002
total_ops,0.2142,0.133,1.610,0.107,-0.047,0.475


In [51]:
# Logit of accuracy on the text metrics, total_ops and the model-source flags,
# fitted on rows with solvability == 1.
# 'mathwell' is not among the source flags (presumably the reference category).
predictors = [
    'fkgl', 'ndc', 'question_length', 'solution_length', 'total_ops',
    'llema', 'llama', 'mammoth', 'gpt35', 'gpt4',
]
reg_df = df.loc[df['solvability'] == 1]
x = sm.add_constant(reg_df[predictors])  # prepend the intercept column
y = reg_df['accuracy']
log_reg = sm.Logit(y, x).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.270017
         Iterations 7


0,1,2,3
Dep. Variable:,accuracy,No. Observations:,1229.0
Model:,Logit,Df Residuals:,1218.0
Method:,MLE,Df Model:,10.0
Date:,"Fri, 24 May 2024",Pseudo R-squ.:,0.1412
Time:,10:00:50,Log-Likelihood:,-331.85
converged:,True,LL-Null:,-386.4
Covariance Type:,nonrobust,LLR p-value:,8.077e-19

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,4.4999,0.853,5.274,0.000,2.828,6.172
fkgl,-0.0227,0.055,-0.409,0.682,-0.131,0.086
ndc,0.0066,0.082,0.080,0.936,-0.154,0.167
question_length,-0.0023,0.005,-0.489,0.625,-0.012,0.007
solution_length,-0.0087,0.003,-3.477,0.001,-0.014,-0.004
total_ops,0.0687,0.128,0.537,0.591,-0.182,0.319
llema,-2.9736,0.437,-6.804,0.000,-3.830,-2.117
llama,-1.2953,0.447,-2.901,0.004,-2.170,-0.420
mammoth,-0.6216,0.499,-1.246,0.213,-1.600,0.357


In [54]:
# Logit of accuracy on the text metrics and the model-source flags
# (no operation terms), fitted on rows with solvability == 1.
# 'mathwell' is not among the source flags (presumably the reference category).
predictors = [
    'fkgl', 'ndc', 'question_length', 'solution_length',
    'llema', 'llama', 'mammoth', 'gpt35', 'gpt4',
]
reg_df = df.loc[df['solvability'] == 1]
x = sm.add_constant(reg_df[predictors])  # prepend the intercept column
y = reg_df['accuracy']
log_reg = sm.Logit(y, x).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.270135
         Iterations 7


0,1,2,3
Dep. Variable:,accuracy,No. Observations:,1229.0
Model:,Logit,Df Residuals:,1219.0
Method:,MLE,Df Model:,9.0
Date:,"Fri, 24 May 2024",Pseudo R-squ.:,0.1408
Time:,10:02:34,Log-Likelihood:,-332.0
converged:,True,LL-Null:,-386.4
Covariance Type:,nonrobust,LLR p-value:,2.562e-19

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,4.6023,0.834,5.516,0.000,2.967,6.238
fkgl,-0.0252,0.055,-0.456,0.649,-0.133,0.083
ndc,0.0046,0.082,0.056,0.955,-0.156,0.165
question_length,-0.0022,0.005,-0.459,0.646,-0.012,0.007
solution_length,-0.0084,0.002,-3.454,0.001,-0.013,-0.004
llema,-3.0006,0.434,-6.910,0.000,-3.852,-2.150
llama,-1.3033,0.446,-2.920,0.003,-2.178,-0.429
mammoth,-0.6355,0.498,-1.275,0.202,-1.612,0.341
gpt35,-1.2176,0.454,-2.683,0.007,-2.107,-0.328


In [55]:
# Baseline logit of accuracy on the four text metrics alone,
# fitted on rows with solvability == 1.
predictors = ['fkgl', 'ndc', 'question_length', 'solution_length']
reg_df = df.loc[df['solvability'] == 1]
x = sm.add_constant(reg_df[predictors])  # prepend the intercept column
y = reg_df['accuracy']
log_reg = sm.Logit(y, x).fit()
log_reg.summary()

Optimization terminated successfully.
         Current function value: 0.313240
         Iterations 6


0,1,2,3
Dep. Variable:,accuracy,No. Observations:,1229.0
Model:,Logit,Df Residuals:,1224.0
Method:,MLE,Df Model:,4.0
Date:,"Fri, 24 May 2024",Pseudo R-squ.:,0.003707
Time:,10:04:03,Log-Likelihood:,-384.97
converged:,True,LL-Null:,-386.4
Covariance Type:,nonrobust,LLR p-value:,0.5807

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,3.1209,0.717,4.354,0.000,1.716,4.526
fkgl,0.0180,0.049,0.368,0.713,-0.078,0.114
ndc,-0.0576,0.077,-0.745,0.456,-0.209,0.094
question_length,-0.0045,0.004,-1.054,0.292,-0.013,0.004
solution_length,-0.0016,0.002,-0.716,0.474,-0.006,0.003
