In [1]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd

In [2]:
df = pd.read_pickle('./p4k_reviews_dataset_processed.pkl')

In [3]:
df = df[['text', 'score']]

In [4]:
df

Unnamed: 0,text,score
0,"“Trip-hop” eventually became a ’90s punchline,...",9.3
1,"Eight years, five albums, and two EPs in, the ...",7.9
2,Minneapolis’ Uranium Club seem to revel in bei...,7.3
3,Kleenex began with a crash. It transpired one ...,9.0
4,It is impossible to consider a given release b...,8.1
...,...,...
17995,"When last we heard from Saint Etienne, on last...",7.7
17996,Some things are unfairly dependent on the vaga...,5.6
17997,For a territory only slightly larger than New ...,7.9
17998,"PSAs on TV tell kids ""don't smoke"" and ""just s...",6.0


In [5]:
nMax = 60

df = df.groupby('score').apply(lambda x: x.sample(n=min(nMax, len(x))))
pd.set_option('display.max_rows', 500)

In [6]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,text,score
score,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,15899,It could be said that Liz Phair's greatest ass...,0.0
0.0,13899,If more drunks would learn from Robert Pollard...,0.0
0.0,11067,"""Indie's Biggest Hits""? ""Volume 1""? ""NOW that'...",0.0
0.0,14537,Travis Morrison got his ass kicked. He tells t...,0.0
0.1,17330,"Contrary to what you may believe, it's hard to...",0.1
...,...,...,...
10.0,14437,"In a 1994 interview with Option magazine, Stev...",10.0
10.0,857,"In 1987, Prince Rogers Nelson was in transitio...",10.0
10.0,355,"In July 1975, Brian Eno found himself a few da...",10.0
10.0,862,"At the dawn of the 1980s, young black musician...",10.0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['score'], test_size=0.1, random_state=42)

In [8]:
# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [9]:
# Initialize and train the SVR model
svr = SVR(kernel='linear')
svr.fit(X_train, y_train)

In [10]:
# Make predictions on the test set
y_pred = svr.predict(X_test)

In [11]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

Mean Squared Error: 1.7342944156492056


In [12]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f"Root Mean Squared Error: {rmse}")

Root Mean Squared Error: 1.3173695420887786


In [13]:
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 0.959727360866147


In [14]:
r2 = r2_score(y_test, y_pred)
print(f"r2 score: {r2}")

r2 score: 0.599437359121192


In [16]:
from scipy.stats import pearsonr
r = pearsonr(y_test, y_pred)
print(f"r2 score: {r}")

r2 score: PearsonRResult(statistic=0.7746585326946513, pvalue=1.0152322253274135e-70)
