# Regression Experiments
_Training regression models to see how well we can predict claimant reliability, and which features contribute the most to the results._

In [33]:
import pandas as pd
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from util import gather_dataset

In [34]:
# load the processed claimant data via the project helper
# NOTE(review): presumably returns a single pd.DataFrame (it is passed to create_splits below) — confirm in util.gather_dataset
df = gather_dataset('./data/claimant_data_processed/')

In [35]:
def create_splits(
    df: pd.DataFrame,
    train: int = 70,
    dev: int = 10,
    test: int = 20,
    random_state: int = 42,
) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split the dataset into train, dev, and test splits using the given percentages.

    Args:
        df: The dataset to split.
        train: Percentage of rows for the train split.
        dev: Percentage of rows for the dev split.
        test: Percentage of rows for the test split.
        random_state: Seed passed to both `train_test_split` calls, so the
            same inputs always produce the same splits.

    Returns:
        A `(train_split, dev_split, test_split)` tuple. A split whose
        percentage is 0 is returned as an empty frame with the columns of `df`.

    Raises:
        ValueError: If the percentages do not sum to 100, or any is negative.
    """

    if (train + dev + test) != 100:
        raise ValueError('The train, dev, and test splits must sum to 100.')
    if min(train, dev, test) < 0:
        raise ValueError('The train, dev, and test splits must not be negative.')

    def _split(frame: pd.DataFrame, second_share: float):
        """Split `frame` in two, giving `second_share` (0..1) of the rows to the second part."""
        # train_test_split rejects a test_size of exactly 0.0 or 1.0, so the
        # degenerate cases are handled here (rows are not shuffled in that case)
        if second_share <= 0:
            return frame.copy(), frame.iloc[0:0]
        if second_share >= 1:
            return frame.iloc[0:0], frame.copy()
        return train_test_split(frame, test_size=second_share, random_state=random_state)

    held_out = dev + test
    train_split, temp_split = _split(df, held_out / 100)

    # guard the division: with nothing held out, dev and test are both empty
    test_share = test / held_out if held_out else 0
    dev_split, test_split = _split(temp_split, test_share)

    return train_split, dev_split, test_split


# default 70/10/20 split; the dev split is discarded here (bound to `_`)
train_split, _, test_split = create_splits(df)

## Baseline

In [36]:
# Baseline: a regressor that always predicts the mean training score.
# The 'source' column is only passed as the feature argument; the
# `# type: ignore` markers silence the type checker about that Series.
dummy_regressor = DummyRegressor(strategy='mean').fit(
    train_split['source'],  # type: ignore
    train_split['score'],
)

# evaluate the baseline predictions on the held-out test split
predictions = dummy_regressor.predict(test_split['source'])  # type: ignore
mse = mean_squared_error(test_split['score'], predictions)

print(f'Dummy regressor MSE: {mse:.3f}')

Dummy regressor MSE: 1.676


## Regression Models
Here we test different combinations of the following features:
* claimant embeddings (computed either without or with context; the two variants are never combined)
* claimant categories
* publishers
* publisher categories

In [37]:
from sklearn.svm import LinearSVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelBinarizer
from itertools import combinations
from typing import Literal, get_args
from tqdm.notebook import tqdm

# the feature groups that an experiment can switch on
Feature = Literal[
    'claimant_embeddings',
    'claimant_embeddings_context',
    'claimant_category',
    'publisher',
    'publisher_category',
]

In [38]:
# add all features to the df
# NOTE: this cell reassigns `df` from itself, so re-running it without first
# re-running the cell that loads `df` merges the feature columns a second time.

# claimant embeddings: 'embeddings' (without context) and 'context_embeddings'
# (with context). Both come from the same frame, so they are joined in a single
# merge; this keeps the two vectors of a task paired and avoids a second join.
# NOTE(review): read_pickle can execute arbitrary code — only load trusted files.
embeddings_df = pd.read_pickle('./data/claimant_embeddings_deberta-v3-large.pickle')
df = df.merge(
    embeddings_df[['task_id', 'embeddings', 'context_embeddings']],
    on='task_id',
)

# claimant categories: the numeric id is the first non-blank character of the
# 'ClaimantType' annotation
claimant_categories_df = pd.read_csv('./annotations/claimant_annotations.csv')
claimant_categories_df['claimant_type_id'] = claimant_categories_df['ClaimantType'].apply(lambda type_: int(type_.lstrip()[0]))
df = df.merge(claimant_categories_df[['task_id', 'claimant_type_id']], on='task_id')

# publisher categories
publisher_category_df = pd.read_csv('./annotations/publisher_annotations.csv')
df = df.merge(publisher_category_df[['publisher', 'category']], on='publisher')

# rename columns for clarity (returns a new frame rather than mutating in place)
df = df.rename(columns={
    'embeddings': 'claimant_embeddings',
    'context_embeddings': 'claimant_embeddings_context',
    'category': 'publisher_category',
})

# redo the split so that the splits carry the new feature columns
train_split, _, test_split = create_splits(df)

train_split.head()

Unnamed: 0,worker_id,task_id,task_response_id,file_id,sentence,tokens_id,publisher,source,reliability,Optional Comments:,batch_no,score,claimant_embeddings,claimant_embeddings_context,claimant_type_id,publisher_category
10069,C6HPNPP7A7E9,3aabc46f-2715-456e-9d82-2e3814e871e8,2bf33773-a7f4-42a0-a1e1-e6dce2b2e243,LifeSiteNews_20170611T045238.conll.annot,5,1 2 3,LifeSiteNews,Priests throughout Kenya,3 - The source is potentially reliable,,4,3,"[-0.5763444900512695, -0.09419417381286621, 0....","[-0.4169674217700958, -0.5285131931304932, 0.1...",5,not established
11770,QAPRY4VGE6MY,e2efae9e-4093-40d3-8eef-2804ed02e6a9,6b594980-2e4d-4f00-a566-6b6961e27ecf,thinktwice-com_20170627T225319.conll.annot,137,1 2,www.thinktwice.com,The doctors,5 - The source is fully reliable,,2,5,"[-0.4354318380355835, -0.07199427485466003, 0....","[-0.06363115832209587, -0.6982768476009369, 0....",5,not established
1414,7R2NKCJQNXPP,c59835da-53fd-4666-9dcb-d26ec0f547a4,1aa59dc4-e15f-4336-9851-0cf7a494e469,LifeSiteNews_20170611T045238.conll.annot,5,1 2 3,LifeSiteNews,Priests throughout Kenya,4 - The source is somewhat reliable,,5,4,"[-0.5763444900512695, -0.09419417381286621, 0....","[-0.4169674217700958, -0.5285131931304932, 0.1...",5,not established
499,Z6J69GNZYEXY,241a4178-4073-422f-ac00-a79204e15ffa,b195d4a9-4cb5-4465-839e-64555536f6f2,sharylattkisson-com_20171001T192931.conll.annot,79,4 5 6 7 8 9 10,sharylattkisson.com,her colleagues at the Institute of Medicine,3 - The source is potentially reliable,,1,3,"[-0.36997485160827637, -0.04534153640270233, 0...","[-0.03363962897232601, -0.25278040054919465, 0...",5,not established
668,ATEZ9499929P,ef5fde58-e1d7-435a-b546-3ebb7b612427,974c25fe-66d8-449f-86e3-253213ec3855,International-Medical-Council-on-Vaccination_2...,61,1 2 3 4 5 6,International Medical Council on Vaccination,Vikari et al. ( 1979 .,5 - The source is fully reliable,,5,5,"[-0.5054624080657959, -0.1212393119931221, 0.1...","[-0.7636523594458898, -0.20012776831087345, -0...",6,not established


In [39]:
def create_features(
        df: pd.DataFrame,
        claimant_embeddings: bool = True,
        claimant_embeddings_context: bool = False,
        claimant_category_binarizer: LabelBinarizer|None = None,
        publisher_binarizer: LabelBinarizer|None = None,
        publisher_category_binarizer: LabelBinarizer|None = None,
    ) -> list[list[float|int]]:
    """Build one feature vector per row of `df` for training the regression model.

    The enabled feature groups are concatenated per row in a fixed order:
    claimant embeddings, claimant embeddings with context, claimant category,
    publisher, publisher category.

    Args:
        df: Frame holding the columns of the enabled feature groups.
        claimant_embeddings: Include the 'claimant_embeddings' column.
        claimant_embeddings_context: Include the 'claimant_embeddings_context' column.
        claimant_category_binarizer: If given, encodes 'claimant_type_id'.
        publisher_binarizer: If given, encodes 'publisher'.
        publisher_category_binarizer: If given, encodes 'publisher_category'.

    Returns:
        A list with one feature list per row, in the row order of `df`.
    """

    n_rows = len(df)
    if n_rows == 0:
        return []

    # every block holds n_rows per-row vectors for one feature group
    blocks = []

    if claimant_embeddings:
        blocks.append(df['claimant_embeddings'].tolist())

    if claimant_embeddings_context:
        blocks.append(df['claimant_embeddings_context'].tolist())

    encoded_columns = (
        (claimant_category_binarizer, 'claimant_type_id'),
        (publisher_binarizer, 'publisher'),
        (publisher_category_binarizer, 'publisher_category'),
    )
    for binarizer, column in encoded_columns:
        if binarizer is not None:
            # transform the whole column in one call instead of once per row
            blocks.append(binarizer.transform(df[column].tolist()))

    # stitch the blocks together row by row
    X = [[] for _ in range(n_rows)]
    for block in blocks:
        for row_features, values in zip(X, block):
            row_features.extend(values)

    return X

# One label binarizer per categorical feature, keyed by feature name.
# Each is fitted on the full df (not only the train split), so every label
# that occurs in any split is among its known classes.
binarizers: dict[Feature, LabelBinarizer] = {
    'claimant_category': LabelBinarizer().fit(df['claimant_type_id']),
    'publisher': LabelBinarizer().fit(df['publisher']),
    'publisher_category': LabelBinarizer().fit(df['publisher_category']),
}

# keep the individual binarizers available under their own names as well
claimant_category_binarizer = binarizers['claimant_category']
publisher_binarizer = binarizers['publisher']
publisher_category_binarizer = binarizers['publisher_category']

In [50]:
# type aliases that label the two floats in run_experiment's return annotation
MSE = float
RSQUARED = float

def run_experiment(
    train_split: pd.DataFrame,
    test_split: pd.DataFrame,
    binarizers: dict[Feature, LabelBinarizer],
    features: tuple[Feature,...],
    model: LinearSVR|LinearRegression,
) -> tuple[MSE, RSQUARED]:
    """Train `model` on the selected features and evaluate it on the test split.

    Args:
        train_split: Rows used to fit the model; targets come from 'score'.
        test_split: Rows used for evaluation; targets come from 'score'.
        binarizers: Binarizers keyed by 'claimant_category', 'publisher', and
            'publisher_category'; one is only looked up when its feature is selected.
        features: Names of the feature groups to include.
        model: Regressor to fit. It is fitted in place, so any previous fit
            of the same instance is replaced.

    Returns:
        `(mse, r2)`: the mean squared error of the test predictions and the
        value of `model.score` on the test split.
    """

    # built once so the train and test splits are featurized identically
    feature_kwargs = {
        'claimant_embeddings': 'claimant_embeddings' in features,
        'claimant_embeddings_context': 'claimant_embeddings_context' in features,
        'claimant_category_binarizer': binarizers['claimant_category'] if 'claimant_category' in features else None,
        'publisher_binarizer': binarizers['publisher'] if 'publisher' in features else None,
        'publisher_category_binarizer': binarizers['publisher_category'] if 'publisher_category' in features else None,
    }

    X_train = create_features(train_split, **feature_kwargs)
    y_train = list(train_split['score'])

    model.fit(X_train, y_train)

    X_test = create_features(test_split, **feature_kwargs)
    y_test = list(test_split['score'])

    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    r2 = model.score(X_test, y_test)

    return mse, r2

# smoke test for run_experiment: a single run with two feature groups
model = LinearRegression()
mse, r2 = run_experiment(
    train_split,
    test_split,
    binarizers,
    features=('claimant_embeddings', 'publisher'),
    model=model,
)
print(f'Test experiment MSE: {mse:.3f}, and R2: {r2:.3f}')

Test experiment MSE: 1.135, and R2: 0.323


In [41]:
# Enumerate every non-empty combination of features to run experiments on,
# skipping occurrences where both types of embeddings appear together.
features = get_args(Feature)
feature_combinations = tuple(
    subset
    for size in range(1, len(features) + 1)
    for subset in combinations(features, size)
    if not {'claimant_embeddings', 'claimant_embeddings_context'} <= set(subset)
)

In [42]:
# preview only the columns that the feature groups are built from
features_df = df[
    [
        'claimant_embeddings',
        'claimant_embeddings_context',
        'claimant_type_id',
        'publisher',
        'publisher_category',
    ]
]
features_df.head()

Unnamed: 0,claimant_embeddings,claimant_embeddings_context,claimant_type_id,publisher,publisher_category
0,"[-0.3007359802722931, -0.060107577592134476, 0...","[0.21119636182601637, -0.3454102518287702, 0.0...",2,ThinkProgress,not established
1,"[-0.3007359802722931, -0.060107577592134476, 0...","[0.21119636182601637, -0.3454102518287702, 0.0...",2,ThinkProgress,not established
2,"[-0.5612852573394775, -0.03220456838607788, 0....","[0.17742657661437988, -0.19991786777973175, 0....",6,PublicHealth.org,established
3,"[-0.3007359802722931, -0.060107577592134476, 0...","[0.21119636182601637, -0.3454102518287702, 0.0...",2,ThinkProgress,not established
4,"[-0.5341578125953674, -0.10339929163455963, 0....","[-0.5530586391687393, 0.4381181299686432, 0.44...",2,National Vaccine Information Center (NVIC),governmental/institutional


In [43]:
# test for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# NOTE: no VIF is computed yet. Every feature keeps the placeholder score None,
# so the table printed below carries no information.
vif_scores = dict.fromkeys(features)

# TODO: the disabled attempt below operates on the raw features_df values:
# print([variance_inflation_factor(features_df.values, i) for i in range(len(features_df.columns))])

# two-column table: feature name padded to 30 characters, then its VIF
print(f'{"feature":30}{"VIF"}')
print('---'*15)
for feature, vif_score in vif_scores.items():
    print(f'{feature:30}{vif_score}')

feature                       VIF
---------------------------------------------
claimant_embeddings           None
claimant_embeddings_context   None
claimant_category             None
publisher                     None
publisher_category            None


In [47]:
# Linear regression: one experiment per feature combination, reusing a single
# model instance that is refitted on every run.
model = LinearRegression()
results = {}
for feature_combination in tqdm(feature_combinations, desc='Running Linear Regression experiments'):
    # run_experiment returns the (mse, r2) pair that is stored per combination
    results[feature_combination] = run_experiment(
        train_split, test_split, binarizers, feature_combination, model
    )

# report best-performing first: ascending by the (mse, r2) pair, i.e. lowest MSE on top
for combination, (mse, r2) in sorted(results.items(), key=lambda entry: entry[1]):
    print(f'{" & ".join(combination):_<80} MSE: {mse:.5f} R2: {r2:.5f}')

Running Linear Regression experiments:   0%|          | 0/23 [00:00<?, ?it/s]

claimant_embeddings & claimant_category & publisher_category____________________ MSE: 1.12956 R2: 0.32584
claimant_embeddings & publisher_category________________________________________ MSE: 1.12979 R2: 0.32570
claimant_embeddings & claimant_category & publisher_____________________________ MSE: 1.13462 R2: 0.32282
claimant_embeddings & claimant_category & publisher & publisher_category________ MSE: 1.13462 R2: 0.32282
claimant_embeddings & publisher_________________________________________________ MSE: 1.13469 R2: 0.32278
claimant_embeddings & publisher & publisher_category____________________________ MSE: 1.13484 R2: 0.32269
claimant_embeddings & claimant_category_________________________________________ MSE: 1.13520 R2: 0.32247
claimant_embeddings_____________________________________________________________ MSE: 1.13643 R2: 0.32173
claimant_embeddings_context & claimant_category_________________________________ MSE: 1.13904 R2: 0.32018
claimant_embeddings_context___________________

In [53]:
# LinearSVR (intended as the more robust model): one experiment per feature
# combination, reusing a single model instance that is refitted on every run.
model = LinearSVR(dual=False, loss='squared_epsilon_insensitive')
results = {}
for feature_combination in tqdm(feature_combinations, desc='Running LinearSVR experiments'):
    # run_experiment returns the (mse, r2) pair that is stored per combination
    results[feature_combination] = run_experiment(
        train_split, test_split, binarizers, feature_combination, model
    )

# report best-performing first: ascending by the (mse, r2) pair, i.e. lowest MSE on top
for combination, (mse, r2) in sorted(results.items(), key=lambda entry: entry[1]):
    print(f'{" & ".join(combination):_<80} MSE: {mse:.5f} R2: {r2:.5f}')

Running LinearSVR experiments:   0%|          | 0/23 [00:00<?, ?it/s]

claimant_embeddings & claimant_category & publisher_____________________________ MSE: 1.09784 R2: 0.34477
claimant_embeddings & publisher & publisher_category____________________________ MSE: 1.09960 R2: 0.34372
claimant_embeddings & claimant_category & publisher & publisher_category________ MSE: 1.09971 R2: 0.34365
claimant_embeddings & publisher_________________________________________________ MSE: 1.10551 R2: 0.34019
claimant_embeddings & publisher_category________________________________________ MSE: 1.11756 R2: 0.33300
claimant_embeddings & claimant_category & publisher_category____________________ MSE: 1.11896 R2: 0.33216
claimant_embeddings_context & publisher_________________________________________ MSE: 1.12416 R2: 0.32906
claimant_embeddings_context & claimant_category & publisher_____________________ MSE: 1.12506 R2: 0.32852
claimant_embeddings_context & publisher & publisher_category____________________ MSE: 1.12529 R2: 0.32838
claimant_embeddings_context & claimant_categor