In [None]:
## Logistic Regression on fitting GPT-4 Text-CoT

## Input: Train and test table statistics created

In [59]:
# Function that Z-scores datasets
scale_df <- function(df) {
    new_df <- data.frame(scale(df[2:6]))
    new_df$index <- as.factor(df$index)
    new_df$correct <- df$correct
    return(new_df)
}

In [19]:
all_df <- read.table(file = 'text_cot_train_table.tsv', sep = '\t', header = TRUE)
all_df$input_nchars <- NULL
all_df$input_ntokens <- NULL

In [31]:
test_df <- read.table(file = "text_cot_test_table.tsv", sep = '\t', header = TRUE)
test_df$input_ntokens <- NULL
bin_info = test_df$bin
test_df$bin <- NULL

In [20]:
head(all_df)

Unnamed: 0_level_0,index,input_logprob,output_logprob,shift_level,shift_freq,shift,correct
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>
1,0,-34.43444,-14.99727,1,59,1,1
2,1,-35.56096,-14.99698,1,59,1,1
3,2,-40.20804,-14.99656,1,59,1,1
4,3,-37.75704,-15.0077,1,59,1,1
5,4,-37.78588,-14.99173,1,59,1,1
6,5,-36.97999,-14.9895,1,59,1,1


In [22]:
shift_level <- all_df$shift_level
all_df$shift_level <- NULL

In [23]:
nrow(all_df)

In [112]:
all_df <- scale_df(all_df)

In [24]:
# Take a look at the data frame we have loaded
head(all_df)

Unnamed: 0_level_0,index,input_logprob,output_logprob,shift_freq,shift,correct
Unnamed: 0_level_1,<int>,<dbl>,<dbl>,<int>,<int>,<int>
1,0,-34.43444,-14.99727,59,1,1
2,1,-35.56096,-14.99698,59,1,1
3,2,-40.20804,-14.99656,59,1,1
4,3,-37.75704,-15.0077,59,1,1
5,4,-37.78588,-14.99173,59,1,1
6,5,-36.97999,-14.9895,59,1,1


In [25]:
# This creates the model
# We are trying to predict "correct" from "input_logprob", "output_logprob", 
# "input_ntokens", "output_ntokens", and "input_nchars"
# "correct" is a binary variable. That is why we are using *logistic* regression, 
# as indicated by "family = binomial
model <- glm(correct ~  input_logprob + output_logprob + shift + shift_freq, 
               data=all_df, family=binomial)

In [26]:
# Gives us the summary. We can see that output_logprob
# has a significant effect (p-vale < 0.05), but the other factors
# do not show a significant effect.
summary(model)


Call:
glm(formula = correct ~ input_logprob + output_logprob + shift + 
    shift_freq, family = binomial, data = all_df)

Coefficients:
                 Estimate Std. Error z value Pr(>|z|)    
(Intercept)     6.637e-01  2.279e-01   2.913  0.00359 ** 
input_logprob  -1.867e-02  5.746e-03  -3.250  0.00115 ** 
output_logprob  4.175e-02  1.887e-03  22.123  < 2e-16 ***
shift          -1.177e-01  5.762e-03 -20.432  < 2e-16 ***
shift_freq      2.056e-03  9.135e-05  22.507  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 16540  on 12499  degrees of freedom
Residual deviance: 15270  on 12495  degrees of freedom
AIC: 15280

Number of Fisher Scoring iterations: 4


In [27]:
y_pred = predict(model, all_df, type = "response")

In [34]:
test_df$correct <- y_pred
test_df$bin <- bin_info

In [12]:
y_true <- all_df$correct

In [35]:
show(test_df)

     index input_logprob output_logprob shift_level shift_freq correct  bin
1        0     -37.07418      -15.11689           1         59       1 bin1
2        1     -36.77916      -14.88298           1         59       1 bin1
3        2     -42.75416      -15.11914           1         59       1 bin1
4        3     -39.20563      -14.87983           1         59       1 bin1
5        4     -38.72959      -14.87982           1         59       1 bin1
6        5     -38.65123      -14.87975           1         59       1 bin1
7        6     -36.10777      -14.87868           1         59       1 bin1
8        7     -33.50487      -14.87630           1         59       1 bin1
9        8     -36.33075      -14.87518           1         59       1 bin1
10       9     -36.87409      -15.12571           1         59       1 bin1
11      10     -38.47073      -15.12681           1         59       1 bin1
12      11     -35.92784      -14.87225           1         59       1 bin1
13      12  

In [36]:
write.table(test_df, file = "text_cot_test_results.tsv", sep = "\t", row.names = FALSE)

In [28]:
all_df$pred <- y_pred
all_df$shift_level <- shift_level
write.table(all_df, file = "text_cot_train_results.tsv", sep = "\t", row.names = FALSE)