In [None]:
import sys
if 'google.colab' in sys.modules:  # If in Google Colab environment
    # Installing requisite packages
    !pip install datasets transformers evaluate accelerate -U

    # Mount google drive to enable access to data files
    from google.colab import drive
    drive.mount('/content/drive')

    # Change working directory to choice
    %cd /content/drive/MyDrive/LLM4BeSci/reading

## Processing data

In [None]:
import pandas as pd
from evaluate import load

In [None]:
# Read only the columns used below, then keep a reproducible random
# subsample of 1000 rows (fixed seed) so later computation stays cheap
clear = (
    pd.read_excel(
        'clear.xlsx',
        usecols=['Excerpt', 'BT_easiness', 'Flesch-Reading-Ease'],
    )
    .sample(n=1000, random_state=42)
    .reset_index(drop=True)
)
clear

## Compute perplexity

In [None]:
# Fetch the perplexity metric through `evaluate.load`
perplexity = load("perplexity", module_type="metric")

# Run the metric over the excerpts with GPT-2 as the scoring model
gpt2_scores = perplexity.compute(
    predictions=clear['Excerpt'],
    model_id='openai-community/gpt2',
    batch_size=16,
    # device defaults to 'cuda' if available, otherwise 'cpu'
)

# Store the values found under 'perplexities' as a new column
clear['perplexity'] = gpt2_scores['perplexities']

clear

## Correlations and linear regression

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

In [None]:
# Module-level linear regression estimator; print_cv_stats reads it as a global
regr = LinearRegression()

def print_cv_stats(X, y, cv: int = 10) -> None:
    """Print the mean and standard deviation of cross-validated r2 scores.

    The module-level estimator ``regr`` is evaluated with
    ``cross_val_score`` and the per-fold scores are summarised on one line.

    Args:
        X: Predictor data passed straight through to ``cross_val_score``
            (the calls in this notebook pass a DataFrame of feature columns).
        y: Target values passed straight through to ``cross_val_score``.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 10, matching the original 10-fold behaviour.

    Returns:
        None. The summary is printed, formatted to two decimal places.
    """
    # `regr` is looked up at call time, so it must be defined before calling
    scores = cross_val_score(regr, X, y, cv=cv)
    mean, sd = scores.mean(), scores.std()
    print(f"Mean r2: {mean:.2f}, SD: {sd:.2f}")

# Predict BT_easiness from perplexity alone
print_cv_stats(X=clear[['perplexity']], y=clear['BT_easiness'])

In [None]:
# Predict BT_easiness from Flesch-Reading-Ease alone
print_cv_stats(X=clear[['Flesch-Reading-Ease']], y=clear['BT_easiness'])

In [None]:
# Predict BT_easiness from Flesch-Reading-Ease and perplexity together
print_cv_stats(
    X=clear[['Flesch-Reading-Ease', 'perplexity']],
    y=clear['BT_easiness'],
)