In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np

In [2]:
traindf = pd.read_csv('train.csv')
valdf = pd.read_csv('valid.csv')
testdf = pd.read_csv('test.csv')

In [4]:
traindf

Unnamed: 0,sentences,contexts,labels,positions,reviewid,bookid,item_spec_score_max,item_spec_score_mean,tfidfscore,combined_embedding,...,121,122,123,124,125,126,127,clf_label,clf_proba,neighbours
0,What a fun series.,Dust,0,0.000000,0,17855756,0.015290,0.013789,0.785133,[ 1.35688037e-01 -4.24195677e-01 2.26112261e-...,...,0.136431,0.306241,0.047697,0.344808,0.247321,0.494328,-0.217180,0,0.028645,0.0
1,"I loved Wool, and Dust and Shift both gave us ...",Dust,0,0.007143,0,17855756,0.050604,0.027685,0.417833,[ 2.73589283e-01 5.14192730e-02 1.32994577e-...,...,-0.042372,0.042692,0.019526,0.195423,0.173676,0.141578,-0.039212,0,0.080332,0.0
2,"I think the first book was by far the best, bu...",Dust,0,0.014286,0,17855756,0.050604,0.026657,0.484192,[-7.83001930e-02 -3.24138403e-01 2.61863679e-...,...,0.297668,-0.002652,0.086083,-0.020139,0.024291,-0.069403,-0.231655,0,0.063516,1.0
3,It was the conclusion we wanted to see - the p...,Dust,1,0.021429,0,17855756,0.036474,0.021774,0.535509,[ 2.41805494e-01 -1.83072895e-01 3.95196617e-...,...,-0.193855,0.113906,-0.282803,0.419765,0.222623,-0.139149,-0.090754,1,0.699879,0.0
4,My problem with this book is there were lots o...,Dust,1,0.028571,0,17855756,0.036625,0.024668,0.653747,[ 9.68658254e-02 7.54294321e-02 6.13359436e-...,...,0.058654,0.006225,-0.019319,-0.196473,0.120615,-0.143962,0.279851,0,0.097708,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14290,Will read the next.,Forever Odd,0,0.021429,698,16433,0.177566,0.144543,1.000000,[ 1.07431352e-01 -3.01593512e-01 3.97471301e-...,...,0.004872,0.094335,0.336781,0.012558,-0.012272,0.594620,-0.238397,0,0.136616,0.0
14291,"If I only had one word to sum it up then ""odd"".",The Lace Reader,0,0.000000,699,1951125,0.215878,0.122285,0.631535,[ 1.24138482e-01 1.98553666e-01 4.22001667e-...,...,-0.518556,0.096743,-0.104593,0.085093,0.047637,0.067509,0.148426,0,0.401747,0.0
14292,It did keep swapping from first to third perso...,The Lace Reader,1,0.007143,699,1951125,0.143460,0.104940,0.356158,[-1.50704041e-01 8.64828080e-02 7.91704357e-...,...,0.017482,0.059031,0.003847,-0.074386,0.271997,-0.012426,0.021483,0,0.205542,0.0
14293,Some dark things happen which kind of just get...,The Lace Reader,0,0.014286,699,1951125,0.119985,0.086142,0.528640,[ 5.45209885e-01 2.03423575e-01 3.21328789e-...,...,-0.126928,0.120653,0.226310,0.314678,0.093227,-0.403474,0.314984,0,0.220800,0.0


In [7]:
# Baseline

In [9]:
feature_columns = list(map(str, list(range(128)))) 

In [11]:
clf = LogisticRegression()
clf.fit(traindf[feature_columns], traindf['labels'])
predictions = clf.predict(testdf[feature_columns])
accuracy = accuracy_score(predictions, testdf['labels'])
auc = roc_auc_score(predictions, testdf['labels'])
print('Acc:', accuracy)
print('ROC-AUC:', auc)

Acc: 0.8266666666666667
ROC-AUC: 0.70404050280847


In [13]:
# Addition positional information and item specificity

In [15]:
feature_columns = list(map(str, list(range(128)))) 
feature_columns += ['positions'] 
feature_columns += ['item_spec_score_mean'] 

In [17]:
clf = LogisticRegression()
clf.fit(traindf[feature_columns], traindf['labels'])
predictions = clf.predict(testdf[feature_columns])
accuracy = accuracy_score(predictions, testdf['labels'])
auc = roc_auc_score(predictions, testdf['labels'])
print('Acc:', accuracy)
print('ROC-AUC:', auc)

Acc: 0.8303030303030303
ROC-AUC: 0.7126250894371098


In [19]:
train_labels = clf.predict(traindf[feature_columns])
train_proba = [p[1] for p in clf.predict_proba(traindf[feature_columns])]
val_labels = clf.predict(valdf[feature_columns])
val_proba = [p[1] for p in clf.predict_proba(valdf[feature_columns])]
test_labels = clf.predict(testdf[feature_columns])
test_proba = [p[1] for p in clf.predict_proba(testdf[feature_columns])]

In [21]:
traindf['clf_label'] = train_labels
traindf['clf_proba'] = train_proba
valdf['clf_label'] = val_labels
valdf['clf_proba'] = val_proba
testdf['clf_label'] = test_labels
testdf['clf_proba'] = test_proba

In [23]:
traindf.to_csv('train.csv', index=False)
valdf.to_csv('valid.csv', index=False)
testdf.to_csv('test.csv', index=False)

In [24]:
import warnings
warnings.filterwarnings('ignore')

# Features and labels
X_train = traindf[feature_columns]
y_train = traindf['labels']
X_test = testdf[feature_columns]
y_test = testdf['labels']

# Logistic Regression and hyperparameter tuning
model = LogisticRegression(max_iter=1000000)
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],  
    'penalty': ['elasticnet'],        
    'solver': ['saga'], 
    'l1_ratio': [0, 0.1, 0.2, 0.5, 0.7, 0.9, 1.0]
}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='roc_auc', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Best model and evaluation
best_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

y_test_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_test_pred)
auc = roc_auc_score(y_test, y_test_pred)
print("Test Accuracy:", accuracy)
print("Test ROC-AUC:", auc)
print("Classification Report:\n", classification_report(y_test, y_test_pred))

Fitting 3 folds for each of 49 candidates, totalling 147 fits
Best hyperparameters: {'C': 10, 'l1_ratio': 1.0, 'penalty': 'elasticnet', 'solver': 'saga'}
Test Accuracy: 0.8290909090909091
Test ROC-AUC: 0.6638294118944189
Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.93      0.90      1347
           1       0.55      0.40      0.46       303

    accuracy                           0.83      1650
   macro avg       0.71      0.66      0.68      1650
weighted avg       0.81      0.83      0.82      1650



In [25]:
len(testdf)

1650

In [26]:
testdf[testdf['labels'] != testdf['clf_label']]['reviewid'].unique()

array([ 0,  1,  3,  4,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 35, 36, 37,
       38, 39, 40, 41, 42, 43, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
       56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 74,
       75, 76, 77, 78, 79, 80, 81, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
       93, 94, 95, 96, 97, 98, 99])

In [29]:
from rich import print as rich_print

import numpy as np

import matplotlib
from matplotlib import pyplot as plt
from matplotlib import cm

RICH_x = np.linspace(1.0, 0.0, 50)
RICH_rgb = (matplotlib.colormaps.get_cmap(plt.get_cmap('RdYlBu'))(RICH_x)[:, :3] * 255).astype(np.int32)[range(20, 50, 5)]


def print_with_probs2(words, probs, prefix=None):
  def fmt(x, p, is_first=False):
    ix = int(np.clip(p * RICH_rgb.shape[0], 0, RICH_rgb.shape[0] - 1))
    r, g, b = RICH_rgb[ix]
    if is_first:
      return f'[rgb(0,0,0) on rgb({r},{g},{b})]{x}'
    else:
      return f'[rgb(0,0,0) on rgb({r},{g},{b})] {x}'
  output = []
  if prefix is not None:
    output.append(prefix)
  for i, (x, p) in enumerate(zip(words, probs)):
    output.append(fmt(x, p, is_first=i == 0))
  rich_print(''.join(output))

In [30]:
cols = ['reviewid', 'sentences', 'positions', 'labels', 'score', 'clf_label', 'clf_proba']
def get_probabilities(scores):
    return 1 / (1 + np.exp(-scores))

In [31]:
idx=1
testdf[testdf['reviewid']==idx][cols]

Unnamed: 0,reviewid,sentences,positions,labels,score,clf_label,clf_proba
18,1,"This book had interesting world, engaging char...",0.0,0,-1.829549,0,0.00978
19,1,I think almost anyone else would rate this fou...,0.011111,0,-1.754801,0,0.020837
20,1,The problem with me and this book was partiall...,0.022222,0,-1.214299,0,0.132629
21,1,I just wasn't in the right mood for the book.,0.033333,0,-1.594716,0,0.024758
22,1,Add that to the fact that I'm not a huge NA fa...,0.044444,0,-0.886286,0,0.15668
23,1,"*shrug*"") and the fact that for personal reaso...",0.055556,1,-1.055655,0,0.121839
24,1,"The writing was good, the story was fun, the w...",0.066667,0,-1.619661,0,0.017544
25,1,It just wasn't for me.,0.077778,0,-1.127678,0,0.12731


In [44]:
# indices= list(range(100))
indices = [4]
for idx in indices:
    probabilities = get_probabilities(np.array(testdf[testdf['reviewid']==idx]['score']))
    clf_proba = np.array(testdf[testdf['reviewid']==idx]['clf_label'])
    truth = np.array(testdf[testdf['reviewid']==idx]['labels']).astype(float)
    sentences = np.array(testdf[testdf['reviewid']==idx]['sentences'])
    print()
    print_with_probs2(sentences, clf_proba)
    print()
    print_with_probs2(sentences, truth)





