# (PSL) Project 3: Preprocessing Word Embeddings

This code reads in the Split 1 training data and fits a binary classifier to it (a Standard Scaler followed by Logistic Regression with an ElasticNet penalty). The step that pickles and saves the fitted model is included but commented out.

In [None]:
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# Read the Split 1 training set and separate the target from the features.
train = pd.read_csv('F24_Proj3_data/split_1/train.csv')
y_train = train['sentiment']
X_train = train.iloc[:, 3:]  # every column from the 4th onward is used as a feature

# Build the two pipeline stages separately, then chain them:
# feature standardisation followed by elastic-net logistic regression.
scaler = StandardScaler()
classifier = LogisticRegression(
    penalty='elasticnet',
    solver='saga',  # elastic-net penalty is paired with the saga solver here
    C=0.01,
    l1_ratio=0.1,
    max_iter=10000,
    random_state=5671,
)
model_pipe = make_pipeline(scaler, classifier)
model_pipe.fit(X_train, y_train);

# Optional: persist the fitted pipeline to disk.
# import pickle
# with open('split_1_model.pkl', 'wb') as f:
#     pickle.dump(model_pipe, f)


This code reads in the Split 1 test data and runs the model fitted above on it to predict the probability of positive sentiment for each review.

In [None]:
import numpy as np

import warnings
# Silence every warning for the remainder of the session.
warnings.filterwarnings("ignore")

# Read the Split 1 test set and score each review with the fitted pipeline.
test = pd.read_csv('F24_Proj3_data/split_1/test.csv')
test_features = test.iloc[:, 2:]  # every column from the 3rd onward is fed to the model
class_probabilities = model_pipe.predict_proba(test_features)
# Column 1 of predict_proba is stored as the probability of the positive label.
test['review_pred_og'] = class_probabilities[:, 1]

This code takes a sample of test reviews and their OpenAI embeddings (10,000 x 1536) and uses a DistilBERT model to generate the corresponding BERT embeddings (10,000 x 768). It then fits a linear regression that approximates the OpenAI embeddings from the BERT embeddings; the resulting transformation matrix can be saved as a CSV (the save step is commented out).

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pretrained DistilBERT tokenizer and encoder
bert_tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
bert_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Draw a reproducible sample of test reviews and keep only their OpenAI
# embedding columns (everything except the id, text and prediction columns)
np.random.seed(287)
samp_rev_ids = np.random.choice(test['id'], size=10000, replace=False)
sample_test = test.loc[test['id'].isin(samp_rev_ids), :]
sample_test_openai = sample_test.drop(columns=['id','review','review_pred_og'])

# Embed each sampled review with DistilBERT: tokenize (truncating over-long
# reviews), run the encoder without gradient tracking, and average the last
# hidden state over the token axis to get one fixed-length vector per review
sample_test_bert_matrix = []
for rev in sample_test['review']:
    these_tokens = bert_tokenizer(rev, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        this_output = bert_model(**these_tokens)
        this_emb = this_output.last_hidden_state.mean(dim=1).numpy()

    sample_test_bert_matrix.append(this_emb.flatten())

# One row per sampled review, in the same row order as sample_test_openai
sample_test_bert = pd.DataFrame(np.array(sample_test_bert_matrix))


# Fit a linear regression model to approximate OpenAI embeddings from BERT embeddings
from sklearn.linear_model import LinearRegression


# The transformation is later applied as a plain matrix product (`bert @ W`),
# with no intercept term. Fitting without an intercept makes W on its own the
# complete least-squares map; with the default fit_intercept=True the fitted
# intercept would be silently discarded when only the coefficients are used.
W_model = LinearRegression(fit_intercept=False)
W_model.fit(sample_test_bert, sample_test_openai)
# The transformation matrix W is the transposed coefficient matrix
W = W_model.coef_.T  # Shape will be (768, 1536)

# # Save csv
# mysubmission_df = pd.DataFrame(W)
# mysubmission_df.to_csv('W.csv', index=False)


This code samples 5 reviews predicted positive and 5 reviews predicted negative, together with their original OpenAI embeddings, from the Split 1 test data.

In [None]:
# From the Split 1 test data, collect the review ids predicted positive
# (probability of positive > 0.5) and those predicted negative (otherwise).
# The negative side uses <= so a probability of exactly 0.5 is not dropped
# from both groups.
pos_ids = test.loc[test['review_pred_og'] > 0.5, 'id']
neg_ids = test.loc[test['review_pred_og'] <= 0.5, 'id']

# Seed the generator, then draw 5 positive and 5 negative ids without replacement
np.random.seed(287)
pos_ids_sample = np.random.choice(pos_ids, size=5, replace=False)
neg_ids_sample = np.random.choice(neg_ids, size=5, replace=False)

# The 10 sampled reviews (5 predicted positive, 5 predicted negative).
# Rows come back in the order they appear in `test`, not in sampling order.
ids_sample = np.concatenate([pos_ids_sample, neg_ids_sample])
interp_df_full = test.loc[test['id'].isin(ids_sample), :]

# # Save to csv
# interp_df_full.to_csv('sample_reviews.csv', index=False)


NameError: name 'test' is not defined

This code splits the 10 sample reviews into sentences.

In [None]:
# Separate the embeddings matrix (10 x 1536) from the other columns
non_embedding_cols = ['id', 'review', 'review_pred_og']
interp_df_only_embed = interp_df_full.drop(columns=non_embedding_cols)
# Take an explicit copy: this frame gets columns overwritten and added below,
# and writing into a column selection of another frame is the chained-assignment
# pattern pandas flags with SettingWithCopyWarning.
interp_df_no_embed = interp_df_full[non_embedding_cols].copy()

import re
# Function to split review into sentences
def by_sentence(review):
    """Split a review string into a list of sentences.

    A split happens after '.', '!' or '?' when the terminator is followed by
    whitespace and then a word character, or directly by a letter (which
    handles a missing space, as in "Bad.Really"). A terminator directly
    followed by a digit is not a boundary, so decimals such as "8.5" stay
    intact. The separating whitespace is dropped; terminators stay attached
    to the sentence they end.

    Parameters
    ----------
    review : str
        The review text to split.

    Returns
    -------
    list of str
        The sentences in order; a one-element list when no boundary is found.
    """
    # (?<=[.!?])        -> boundary sits right after a sentence terminator
    # \s+               -> normal case: whitespace between sentences
    # (?=[^\W\d])       -> no whitespace: only split if a letter/underscore follows
    # (?=\w)            -> the next sentence must start with a word character
    return re.split(r'(?<=[.!?])(?:\s+|(?=[^\W\d]))(?=\w)', review)

# Replace HTML tags such as <br> with a space and trim each review
reviews_without_tags = interp_df_no_embed['review'].str.replace(r'<.*?>', ' ', regex=True)
interp_df_no_embed['review'] = reviews_without_tags.str.strip()
# Store each review's list of sentences in a new 'sentence' column
interp_df_no_embed['sentence'] = interp_df_no_embed['review'].map(by_sentence)
# One row per sentence (the other columns are repeated), with each sentence trimmed
interp_df_split = (
    interp_df_no_embed
    .explode('sentence')
    .assign(sentence=lambda frame: frame['sentence'].str.strip())
)

# interp_df_split


This code converts each sample sentence to BERT embeddings, then uses the precalculated conversion matrix to convert the BERT embeddings to approximate OpenAI embeddings, aligning them with the original embedding format and dimension.

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

# Pretrained DistilBERT tokenizer and encoder
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
emb_model = AutoModel.from_pretrained("distilbert-base-uncased")

# Embed every sample sentence; gradients are not needed for inference
sentences_to_bert_matrix = []
with torch.no_grad():
    for sen in interp_df_split['sentence']:
        # Tokenize, truncating over-long input to keep the token count bounded
        encoded = tokenizer(sen, return_tensors="pt", padding=True, truncation=True)
        # Average the last hidden state over the token axis -> one fixed-length vector
        token_states = emb_model(**encoded).last_hidden_state
        pooled = token_states.mean(dim=1).numpy()
        sentences_to_bert_matrix.append(pooled.flatten())

# BERT embedding matrix: one row per sample sentence
sentences_to_bert = pd.DataFrame(np.array(sentences_to_bert_matrix))

# Map the BERT embeddings into the OpenAI embedding space with the W matrix
sentences_to_openai = sentences_to_bert.dot(W)
# Class probabilities for each sentence from the fitted pipeline
model_pipe.predict_proba(sentences_to_openai)
