In [1]:
import sys
if 'google.colab' in sys.modules:  # If in Google Colab environment
    # Installing requisite packages
    !pip install datasets transformers evaluate accelerate -U

    # Mount google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Change working directory to choice
    %cd /content/drive/MyDrive/LLM4BeSci/reading

## Processing data

In [2]:
import pandas as pd
from evaluate import load

In [9]:
# Subsampling settings, named so the sample size and seed are easy to find and change
N_SAMPLES = 1000  # number of rows kept, for easy computation
SEED = 42         # fixed seed so the same rows are drawn on every run

# Load only the columns used in this analysis, then draw a reproducible
# random sample of N_SAMPLES rows and renumber the index from 0
clear = (
    pd.read_excel(
        'clear.xlsx',
        usecols=['Excerpt', 'BT_easiness', 'Flesch-Reading-Ease'],
    )
    .sample(N_SAMPLES, random_state=SEED)
    .reset_index(drop=True)
)
clear

Unnamed: 0,Excerpt,BT_easiness,Flesch-Reading-Ease
0,An honest and poor old woman was washing cloth...,-0.052742,76.81
1,Our plate illustrates the residence of Mr. J. ...,-2.978524,62.22
2,Just as wildebeest are the main grazers of the...,-2.459246,46.35
3,"The day had become misty and overcast. Heavy, ...",-0.909047,72.72
4,"A solvent is a substance, that becomes a solut...",-1.758207,47.58
...,...,...,...
995,The phonograph was developed as a result of Th...,-1.482660,48.79
996,They now hurried on to the boat as if anticipa...,-1.287723,68.99
997,"So Mandy on the door-step, and Bub on the floo...",-0.994653,92.59
998,When scientists talk about working memory they...,-0.368359,52.87


## Compute perplexity

In [4]:
# Load perplexity metric
perplexity = load("perplexity", module_type="metric")

# Compute one perplexity value per excerpt with GPT-2 and store it as a new column
clear['perplexity'] = perplexity.compute(
    model_id='openai-community/gpt2',
    # Pass a plain list of strings, the input type the metric documents,
    # rather than a pandas Series
    predictions=clear['Excerpt'].tolist(),
    batch_size=16,
    # device defaults to 'cuda' if available, otherwise 'cpu'
)['perplexities']

clear

Downloading tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Using pad_token, but it is not set yet.


  0%|          | 0/63 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Correlations and linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [None]:
# Single estimator instance used by every call to print_cv_stats
regr = LinearRegression()

def print_cv_stats(X, y, cv=10):
    """Print the mean and standard deviation of cross-validated r2 scores.

    Evaluates the module-level ``regr`` estimator with ``cross_val_score`` and
    prints a one-line summary of the per-fold scores.

    Args:
        X: Predictors passed to ``cross_val_score`` (the callers in this
            notebook pass a DataFrame of one or more columns).
        y: Target values passed to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 10, i.e. 10-fold cross-validation.
    """
    # No `scoring` argument is given, so the estimator's own score is used
    scores = cross_val_score(regr, X, y, cv=cv)
    mean, sd = scores.mean(), scores.std()
    print(f"Mean r2: {mean:.2f}, SD: {sd:.2f}")

# Cross-validated fit using perplexity as the only predictor of BT_easiness
print_cv_stats(
    X=clear[['perplexity']],
    y=clear['BT_easiness'],
)

In [None]:
# Cross-validated fit using Flesch-Reading-Ease as the only predictor of BT_easiness
print_cv_stats(
    X=clear[['Flesch-Reading-Ease']],
    y=clear['BT_easiness'],
)

In [None]:
# Cross-validated fit using both Flesch-Reading-Ease and perplexity as predictors of BT_easiness
print_cv_stats(
    X=clear[['Flesch-Reading-Ease', 'perplexity']],
    y=clear['BT_easiness'],
)