In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import torch
import torch.nn.functional as F
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, auc
import random
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from transformers import BertModel, BertTokenizer
import torch.nn as nn
import torch
from reviewsdataset import loadBatchListwise, getReviews
from itemspecificity import getItemSpecificity
from sklearn.decomposition import PCA
import lightgbm as lgb
from sklearn.preprocessing import MinMaxScaler

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anaykulkarni/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anaykulkarni/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Work on at most the first 1000 items returned by getReviews(); the later
# cells slice this list into train ([:700]), validation ([700:900]) and
# test ([900:1000]) portions.
reviews = getReviews()[:1000]

In [6]:
# Build the two lookup tables read by get_item_specificity:
#   doc_freq      is indexed as doc_freq[bookid][word]
#   inv_item_freq is indexed as inv_item_freq[word]
# NOTE(review): how the values are computed is defined in
# itemspecificity.getItemSpecificity, not here — confirm there.
doc_freq, inv_item_freq = getItemSpecificity(reviews)

In [8]:
from nltk.corpus import stopwords
import string
# English stop words. They are only used for membership tests
# (`word.lower() not in sw`) inside per-word loops, so store them in a set
# for constant-time lookups instead of scanning a list for every word.
sw = set(stopwords.words('english'))
# Punctuation characters that are stripped out of tokens before scoring.
sp = string.punctuation

def get_item_specificity(word, bookid):
    """Return the item-specificity weight of ``word`` for the book ``bookid``.

    The weight is the product of the per-book entry ``doc_freq[bookid][word]``
    and the global entry ``inv_item_freq[word]``. Both tables are module-level
    objects; any lookup error they raise propagates to the caller.
    """
    per_book_value = doc_freq[bookid][word]
    inverse_value = inv_item_freq[word]
    return per_book_value * inverse_value

def _score_by_sentence(row, reducer):
    """Reduce the item-specificity scores of a sentence's words to one number.

    Args:
        row: mapping-like object with a ``'sentences'`` entry (the sentence
            text) and a ``'bookid'`` entry (passed to get_item_specificity).
        reducer: callable applied to the list of per-word scores
            (``np.max`` or ``np.mean`` in this notebook).

    Returns:
        ``reducer(scores)``, or ``0`` when no scorable word is left.
    """
    text, bookid = row['sentences'], row['bookid']
    # Stop words are matched case-insensitively against the raw
    # whitespace-separated token (before punctuation is removed).
    words = [word for word in text.split() if word.lower() not in sw]
    # Single-character tokens are skipped; punctuation characters are removed
    # from the remaining ones.
    stripped = (
        ''.join(c for c in word if c not in sp)
        for word in words
        if len(word) > 1
    )
    # A token made up solely of punctuation (e.g. "--" or "...") collapses to
    # the empty string, which is not a word and must not be scored.
    cleaned_words = [word for word in stripped if word]
    if not cleaned_words:
        return 0
    return reducer([get_item_specificity(w, bookid) for w in cleaned_words])


def get_score_by_sentence_max(row):
    """Return the highest item-specificity score among the sentence's words.

    ``row`` must provide ``'sentences'`` and ``'bookid'``. Returns ``0`` when
    the sentence has no scorable word.
    """
    return _score_by_sentence(row, np.max)


def get_score_by_sentence_mean(row):
    """Return the mean item-specificity score over the sentence's words.

    ``row`` must provide ``'sentences'`` and ``'bookid'``. Returns ``0`` when
    the sentence has no scorable word.
    """
    return _score_by_sentence(row, np.mean)

In [10]:
def _frame_from_reviews(review_slice):
    """Turn a slice of reviews into one DataFrame with a fresh 0..n-1 index.

    Each review is passed to loadBatchListwise together with its 0-based
    position inside ``review_slice`` (so the position restarts at 0 for every
    split); the per-review frames are then stacked row-wise.
    """
    per_review_frames = [
        pd.DataFrame(loadBatchListwise(review, position))
        for position, review in enumerate(review_slice)
    ]
    return pd.concat(per_review_frames).reset_index(drop=True)


# 700 / 200 / 100 split of the loaded reviews.
traindf = _frame_from_reviews(reviews[:700])
valdf = _frame_from_reviews(reviews[700:900])
testdf = _frame_from_reviews(reviews[900:1000])

In [12]:
# Add both item-specificity features to every split: the per-sentence maximum
# first, then the per-sentence mean. Each scorer is applied row by row.
for _column, _scorer in (
    ('item_spec_score_max', get_score_by_sentence_max),
    ('item_spec_score_mean', get_score_by_sentence_mean),
):
    for _frame in (traindf, valdf, testdf):
        _frame[_column] = _frame.apply(_scorer, axis=1)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Unigram TF-IDF with English stop words removed, lower-casing, smoothed IDF
# and L2-normalised rows.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 1),
    norm='l2',
    lowercase=True,
    use_idf=True,
    smooth_idf=True,
)


def _max_tfidf_per_row(tfidf_matrix):
    """Return a flat array holding the largest TF-IDF weight of each row."""
    return np.array(tfidf_matrix.max(axis=1).toarray()).flatten()


# The feature stored per sentence is its maximum TF-IDF weight (an earlier
# variant of this cell summed the weights instead). The vocabulary and IDF
# are fitted on the training sentences only and reused for the other splits.
traindf['tfidfscore'] = _max_tfidf_per_row(
    tfidf_vectorizer.fit_transform(traindf['sentences'])
)
valdf['tfidfscore'] = _max_tfidf_per_row(
    tfidf_vectorizer.transform(valdf['sentences'])
)
testdf['tfidfscore'] = _max_tfidf_per_row(
    tfidf_vectorizer.transform(testdf['sentences'])
)

In [18]:
# Min-max scale the `positions` feature.
# The scaler is fitted on the training split only and then applied unchanged
# to the validation and test splits. Re-fitting it on each split would give
# every split its own min/max, so the same raw position would map to
# different feature values and evaluation statistics would leak into the
# preprocessing. As a consequence, validation/test values can fall slightly
# outside [0, 1] when a split contains positions beyond the training range.
scaler = MinMaxScaler()
traindf['positions'] = scaler.fit_transform(traindf[['positions']])
valdf['positions'] = scaler.transform(valdf[['positions']])
testdf['positions'] = scaler.transform(testdf[['positions']])

In [20]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load the pre-trained "bert-base-uncased" checkpoint and its matching
# tokenizer. Both objects are module-level and are read by
# get_combined_embedding.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_combined_embedding(row):
    """Embed ``row['sentences']`` with the module-level BERT model.

    Only the sentence text is used (the context column is not included).
    The text is tokenized with truncation at 128 tokens, run through the
    model without gradient tracking, and the last hidden states are averaged
    over dim=1 (mean pooling, not the pooler/CLS output).

    Returns:
        The pooled embedding as a NumPy array, with the leading batch axis
        squeezed away.
    """
    encoded = tokenizer(
        row['sentences'],
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    # Average over dim=1, then drop the size-1 batch axis.
    sentence_vector = hidden_states.mean(dim=1).squeeze(0)
    return sentence_vector.numpy()

# Embed every sentence of each split (one model call per row) and keep the
# resulting vector in the `combined_embedding` column.
for _frame in (traindf, valdf, testdf):
    _frame['combined_embedding'] = _frame.apply(get_combined_embedding, axis=1)

In [26]:
# Stack the per-row embedding vectors of each split into one 2-D array
# (one row per sentence).
train_embeddings, val_embeddings, test_embeddings = (
    np.array(split_frame['combined_embedding'].to_list())
    for split_frame in (traindf, valdf, testdf)
)

In [28]:
# Sanity check: one row per training sentence, one column per embedding dimension.
train_embeddings.shape

(14295, 768)

In [30]:
# Reduce the sentence embeddings to 128 principal components.
# The projection is fitted on the training embeddings only and then applied
# to the validation and test embeddings.
# random_state is pinned because, depending on the scikit-learn version, the
# default svd_solver='auto' may pick the randomized solver for a matrix of
# this size; without a seed the reduced features (and the CSVs written from
# them) would differ from run to run.
pca = PCA(n_components=128, random_state=42)
reduced_train_embeddings = pca.fit_transform(train_embeddings)
reduced_val_embeddings = pca.transform(val_embeddings)
reduced_test_embeddings = pca.transform(test_embeddings)

In [32]:
# Sanity check: same number of rows as before, 128 columns after PCA.
reduced_train_embeddings.shape

(14295, 128)

In [34]:
# Wrap each reduced embedding matrix in a DataFrame that shares the row index
# of its split, so the two can be joined column-wise. Columns keep the
# default integer labels.
embeddings_df_train, embeddings_df_val, embeddings_df_test = (
    pd.DataFrame(reduced_matrix, index=split_frame.index)
    for reduced_matrix, split_frame in (
        (reduced_train_embeddings, traindf),
        (reduced_val_embeddings, valdf),
        (reduced_test_embeddings, testdf),
    )
)

In [38]:
# Append the reduced embedding columns to each split (column-wise concat on
# the shared row index).
# Each name is rebound to its own result, so re-running this cell used to add
# another copy of the embedding columns every time. Any embedding columns
# already present are therefore dropped first; on the first run nothing
# matches and errors='ignore' makes the drop a no-op.
traindf = pd.concat(
    [traindf.drop(columns=embeddings_df_train.columns, errors='ignore'), embeddings_df_train],
    axis=1,
)
valdf = pd.concat(
    [valdf.drop(columns=embeddings_df_val.columns, errors='ignore'), embeddings_df_val],
    axis=1,
)
testdf = pd.concat(
    [testdf.drop(columns=embeddings_df_test.columns, errors='ignore'), embeddings_df_test],
    axis=1,
)

In [40]:
# Preview the training frame with the engineered feature columns
# (pandas truncates the rendered rows and columns).
traindf

Unnamed: 0,sentences,contexts,labels,positions,reviewid,bookid,item_spec_score_max,item_spec_score_mean,tfidfscore,combined_embedding,...,118,119,120,121,122,123,124,125,126,127
0,What a fun series.,Dust,0,0.000000,0,17855756,0.015290,0.013789,0.785133,"[0.13568804, -0.42419568, 0.22611226, 0.193501...",...,-0.435559,-0.028487,0.224099,0.136431,0.306240,0.047697,0.344808,0.247321,0.494328,-0.217180
1,"I loved Wool, and Dust and Shift both gave us ...",Dust,0,0.007143,0,17855756,0.050604,0.027685,0.417833,"[0.27358928, 0.051419273, 0.13299458, 0.160809...",...,-0.457130,-0.071240,0.153160,-0.042372,0.042692,0.019526,0.195423,0.173676,0.141578,-0.039212
2,"I think the first book was by far the best, bu...",Dust,0,0.014286,0,17855756,0.050604,0.026657,0.484192,"[-0.07830019, -0.3241384, 0.26186368, -0.03014...",...,-0.182080,0.103592,0.182030,0.297668,-0.002652,0.086083,-0.020139,0.024291,-0.069403,-0.231655
3,It was the conclusion we wanted to see - the p...,Dust,1,0.021429,0,17855756,0.036474,0.021774,0.535509,"[0.2418055, -0.1830729, 0.39519662, -0.0359889...",...,-0.044703,-0.406086,0.235105,-0.193855,0.113906,-0.282803,0.419765,0.222623,-0.139149,-0.090754
4,My problem with this book is there were lots o...,Dust,1,0.028571,0,17855756,0.036625,0.024668,0.653747,"[0.096865825, 0.07542943, 0.061335944, 0.11228...",...,0.320965,-0.098903,0.168757,0.058654,0.006225,-0.019319,-0.196473,0.120615,-0.143962,0.279851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14290,Will read the next.,Forever Odd,0,0.021429,698,16433,0.177566,0.144543,1.000000,"[0.10743135, -0.3015935, 0.03974713, 0.0797066...",...,0.358973,-0.126794,0.024562,0.004872,0.094335,0.336781,0.012558,-0.012272,0.594620,-0.238397
14291,"If I only had one word to sum it up then ""odd"".",The Lace Reader,0,0.000000,699,1951125,0.215878,0.122285,0.631535,"[0.12413848, 0.19855367, 0.042200167, 0.019212...",...,-0.060622,-0.015157,-0.239266,-0.518556,0.096743,-0.104593,0.085093,0.047637,0.067509,0.148426
14292,It did keep swapping from first to third perso...,The Lace Reader,1,0.007143,699,1951125,0.143460,0.104940,0.356158,"[-0.15070404, 0.08648281, 0.079170436, 0.15373...",...,-0.363667,0.361313,-0.013605,0.017482,0.059031,0.003847,-0.074386,0.271997,-0.012426,0.021483
14293,Some dark things happen which kind of just get...,The Lace Reader,0,0.014286,699,1951125,0.119985,0.086142,0.528640,"[0.5452099, 0.20342357, 0.3213288, 0.1988398, ...",...,-0.199145,-0.415404,0.013450,-0.126928,0.120654,0.226310,0.314678,0.093227,-0.403474,0.314984


In [72]:
# Persist the three feature frames as CSV files in the working directory,
# without writing the row index.
for _frame, _csv_path in (
    (traindf, 'train.csv'),
    (valdf, 'valid.csv'),
    (testdf, 'test.csv'),
):
    _frame.to_csv(_csv_path, index=False)