In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc
import random
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch
from reviewsdataset import loadBatchListwise, getReviews
from sklearn.decomposition import PCA
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler

In [4]:
reviews = getReviews()[:1000]

In [5]:
traindf = pd.concat([pd.DataFrame(loadBatchListwise(r, i)) for i, r in enumerate(reviews[:700])]).reset_index(drop=True)
valdf = pd.concat([pd.DataFrame(loadBatchListwise(r, i)) for i, r in enumerate(reviews[700:900])]).reset_index(drop=True)
testdf = pd.concat([pd.DataFrame(loadBatchListwise(r, i)) for i, r in enumerate(reviews[900:1000])]).reset_index(drop=True)

In [6]:
traindf['labels'].value_counts()

labels
0    10740
1     3555
Name: count, dtype: int64

In [10]:
scaler = MinMaxScaler()
traindf['positions'] = scaler.fit_transform(traindf[['positions']])
valdf['positions'] = scaler.fit_transform(valdf[['positions']])
testdf['positions'] = scaler.fit_transform(testdf[['positions']])

In [12]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_sentence_embedding(sentence):
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use the mean of the last hidden state as the sentence embedding
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return embeddings.numpy()

# Example: Add embeddings to your dataset
traindf['sentence_embedding'] = traindf['sentences'].apply(get_sentence_embedding)
valdf['sentence_embedding'] = valdf['sentences'].apply(get_sentence_embedding)
testdf['sentence_embedding'] = testdf['sentences'].apply(get_sentence_embedding)

In [13]:
train_embeddings = np.array(traindf['sentence_embedding'].to_list())
val_embeddings = np.array(valdf['sentence_embedding'].to_list())
test_embeddings = np.array(testdf['sentence_embedding'].to_list())

In [14]:
train_embeddings.shape

(14295, 768)

In [15]:
# Apply PCA to reduce dimensions (e.g., from 768 to 128)
pca = PCA(n_components=128)
reduced_train_embeddings = pca.fit_transform(train_embeddings)
reduced_val_embeddings = pca.transform(val_embeddings)
reduced_test_embeddings = pca.transform(test_embeddings)

In [16]:
reduced_train_embeddings.shape

(14295, 128)

In [17]:
# Convert the list of reduced embeddings to a DataFrame
embeddings_df_train = pd.DataFrame(reduced_train_embeddings, index=traindf.index)
embeddings_df_val = pd.DataFrame(reduced_val_embeddings, index=valdf.index)
embeddings_df_test = pd.DataFrame(reduced_test_embeddings, index=testdf.index)

In [18]:
# Concatenate the reduced embeddings with the original DataFrame
traindf = pd.concat([traindf, embeddings_df_train], axis=1)
valdf = pd.concat([valdf, embeddings_df_val], axis=1)
testdf = pd.concat([testdf, embeddings_df_test], axis=1)

In [19]:
# Prepare the features
feature_columns = embeddings_df_train.columns.tolist() + ['positions']

In [20]:
train_data = lgb.Dataset(
    traindf[feature_columns],
    label=traindf['labels'],
    group=[len(traindf[traindf['reviewid'] == g]) for g in traindf['reviewid'].unique()]
)

val_data = lgb.Dataset(
    valdf[feature_columns],
    label=valdf['labels'],
    group=[len(valdf[valdf['reviewid'] == g]) for g in valdf['reviewid'].unique()],
    reference=train_data
)

test_data = lgb.Dataset(
    testdf[feature_columns],
    label=testdf['labels'],
    group=[len(testdf[testdf['reviewid'] == g]) for g in testdf['reviewid'].unique()],
)

In [21]:
# Train the model using LambdaRank objective
params = {
    'objective': 'lambdarank',  # Listwise ranking
    # 'metric': 'auc',  # Evaluation metric for ranking
    'metric': 'ndcg',
    'ndcg_eval_at': [1, 3, 5],  # NDCG evaluation at different ranks
    'num_leaves': 31,
    'max_depth': -1,
    'learning_rate': 0.03,
    'boosting_type': 'gbdt',
    'min_data_in_leaf': 21
}

model = lgb.train(
    params,
    train_set=train_data,
    valid_sets=[val_data],
    valid_names=['Valid'],
    num_boost_round=500,
    callbacks=[
        lgb.early_stopping(stopping_rounds=100), 
        lgb.log_evaluation(5)
    ]
)

# After training, the best iteration (round) can be accessed
best_iteration = model.best_iteration
print(f"Best iteration: {best_iteration}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.005912 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 32759
[LightGBM] [Info] Number of data points in the train set: 14295, number of used features: 129
Training until validation scores don't improve for 100 rounds
[5]	Valid's ndcg@1: 0.64	Valid's ndcg@3: 0.697507	Valid's ndcg@5: 0.733924
[10]	Valid's ndcg@1: 0.66	Valid's ndcg@3: 0.700722	Valid's ndcg@5: 0.740435
[15]	Valid's ndcg@1: 0.68	Valid's ndcg@3: 0.718738	Valid's ndcg@5: 0.750822
[20]	Valid's ndcg@1: 0.655	Valid's ndcg@3: 0.717981	Valid's ndcg@5: 0.75047
[25]	Valid's ndcg@1: 0.655	Valid's ndcg@3: 0.711219	Valid's ndcg@5: 0.746746
[30]	Valid's ndcg@1: 0.66	Valid's ndcg@3: 0.717879	Valid's ndcg@5: 0.751974
[35]	Valid's ndcg@1: 0.665	Valid's ndcg@3: 0.714678	Valid's ndcg@5: 0.760217
[40]	Valid's ndcg@1: 0.66	Valid's ndcg@3: 0.716818	Valid's ndcg@5: 0.75833
[45]	Valid's ndcg@1: 0.675	Valid's ndcg

In [34]:
# Predict scores for test set
from sklearn.metrics import roc_auc_score
test_scores = model.predict(testdf[feature_columns], num_iteration=best_iteration)

auc = roc_auc_score(testdf['labels'], test_scores)
print(f"AUC: {auc}")

# Add scores to test dataframe and sort sentences within each group
testdf['score'] = test_scores
testdf = testdf.sort_values(by=['reviewid', 'score'], ascending=[True, False])

AUC: 0.7438826287974009


In [36]:
def get_probabilities(scores):
    return 1 / (1 + np.exp(-scores))

In [38]:
from sklearn.metrics import accuracy_score
# Calculating accuracy
predictions = []
truths = []
threshold=0.7
for i in range(100):
    probabs = get_probabilities(np.array(testdf[testdf['reviewid']==i]['score']))
    truths.extend(testdf[testdf['reviewid']==i]['labels'].tolist())
    preds = [1 if p>threshold else 0 for p in probabs]
    predictions.extend(preds)

In [40]:
accuracy_score(predictions, truths)

0.8284848484848485

In [42]:
cols = ['reviewid', 'sentences', 'positions', 'score', 'labels']

In [44]:
testdf[testdf['reviewid']==6][cols]

Unnamed: 0,reviewid,sentences,positions,score,labels
52,6,she'd been fucking his cousin how did this see...,0.044444,0.910226,1
51,6,I mean the author played it off but that's...H...,0.033333,0.635168,1
54,6,Kinda?,0.066667,0.130233,1
53,6,the author tried to play it off like she thoug...,0.055556,-0.017601,1
56,6,"but there was zero consent, in fact there was ...",0.088889,-0.061886,1
50,6,Number two so...sexual assault is a grand basi...,0.022222,-0.077634,1
55,6,Ish?,0.077778,-0.587823,1
57,6,I've almost talked myself into lower stars the...,0.1,-0.786535,0
49,6,number one - JUST SAY NO TO INSTALOVE.,0.011111,-1.782185,0
48,6,I enjoyed a lot about this book but it tanked ...,0.0,-3.845245,0


In [46]:
from reviewplot import print_with_probs

testdf = testdf.sort_values(by=['reviewid', 'positions'], ascending=[True, True])

probabilities = get_probabilities(np.array(testdf[testdf['reviewid']==5]['score']))
truth = np.array(testdf[testdf['reviewid']==5]['labels']).astype(float)
sentences = np.array(testdf[testdf['reviewid']==5]['sentences'])

In [48]:
print_with_probs(sentences, probabilities)

In [50]:
print_with_probs(sentences, truth)