In [1]:
# Enable the Extension for Scikit-learn (scikit-learn-intelex) acceleration.
# This cell is executed before the cell that imports the scikit-learn
# estimators, so the patch is already active when those imports happen.
from sklearnex import patch_sklearn
patch_sklearn()

Extension for Scikit-learn* enabled (https://github.com/uxlfoundation/scikit-learn-intelex)


In [2]:
import polars as pl
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse._matrix import spmatrix
from amazon.models import *
import polars as pl
import numpy as np
import joblib
from plotly.graph_objs._figure import Figure

In [3]:
# Binary target expression: 1 when the rating is above 3, otherwise 0.
# The result overwrites the original "rating" column.
query: pl.Expr = (pl.col("rating") > 3).cast(pl.Int8).alias("rating")

# Build the whole lazy plan in one chain: scan the parquet file, keep only the
# two columns used below, then replace "rating" with the binary label.
lf: pl.LazyFrame = (
    pl.scan_parquet("data/processed/amazon-2023.parquet")
    .select(["rating", "text"])
    .with_columns(query)
)

# Materialise at most the first 50,000,000 rows of the plan.
df = lf.head(50_000_000).collect()

In [4]:
# Feature column: the raw review text, vectorised in a later cell.
X: pl.Series = df.get_column("text")

In [5]:
# Target column: the binary label produced when the data was loaded.
y: pl.Series = df.get_column("rating")

# Shuffled 80/20 train/test split; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [7]:
# TF-IDF features over lower-cased word tokens.
vectorizer = TfidfVectorizer(
    # tokenisation
    lowercase=True,
    token_pattern=r'\b\w+\b',
    stop_words="english",
    # vocabulary pruning by document frequency
    min_df=5,
    max_df=0.8,
)

# Fit the vocabulary on the training split only; the test split is transformed
# with that same fitted vocabulary.
# NOTE(review): X_train / X_test are rebound from text to TF-IDF matrices here,
# so this cell cannot be re-run without first re-running the split cell.
X_train: spmatrix = vectorizer.fit_transform(X_train)
X_test: spmatrix = vectorizer.transform(X_test)

# max_iter is set explicitly to stop a warning seen with the default
# (presumably the solver's convergence warning).
model = LogisticRegression(max_iter=1_000)
model.fit(X_train, y_train)

In [13]:
def get_metrics(y_true: np.ndarray, y_pred: np.ndarray, model_name: str) -> dict:
    """Summarise binary-classification quality as a flat dictionary.

    Labels are expected to be encoded as 0 (negative) and 1 (positive).

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Display name stored under the ``"Model"`` key.

    Returns:
        A dict with the keys ``"Model"``, ``"Accuracy"``, ``"Precision"``,
        ``"Recall"``, ``"F1 Score"``, ``"True Negative"``, ``"False Positive"``,
        ``"False Negative"`` and ``"True Positive"``.
    """
    # Pin the label set so the confusion matrix is always 2x2. Without it, a
    # batch containing a single class yields a 1x1 matrix and the four-way
    # unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        # NOTE(review): only precision sets zero_division; recall and F1 keep
        # the library default — confirm this asymmetry is intended.
        "Precision": precision_score(y_true, y_pred, zero_division=1),
        "Recall": recall_score(y_true, y_pred),
        "F1 Score": f1_score(y_true, y_pred),
        "True Negative": tn,
        "False Positive": fp,
        "False Negative": fn,
        "True Positive": tp,
    }

In [14]:
# Persist the fitted classifier.
# NOTE(review): only `model` is written here; the fitted `vectorizer` is not
# saved by this cell — confirm it is persisted elsewhere if the pickle is meant
# to score raw text later.
joblib.dump(model, "data/logreg_tfidf.pkl")

# Score the held-out split and collect the summary metrics.
y_pred: np.ndarray = model.predict(X_test)
metrics: dict = get_metrics(
    y_true=y_test,
    y_pred=y_pred,
    model_name="Logistic Regression",
)

# NOTE(review): an earlier note in this cell recorded F1 = 0.919511, which does
# not match the F1 Score in the rendered table — presumably from a previous run.
# One-row frame so the metrics render as a table.
result = pl.DataFrame([metrics])
result

Model,Accuracy,Precision,Recall,F1 Score,True Negative,False Positive,False Negative,True Positive
str,f64,f64,f64,f64,i64,i64,i64,i64
"""Logistic Regression""",0.8918146,0.912473,0.952372,0.931996,1504779,711112,370742,7413367


In [17]:
# Build a figure from the one-row metrics frame and display it.
# NOTE(review): plot_confusion_matrix is not defined in this notebook —
# presumably it is provided by `from amazon.models import *`; confirm.
fig: Figure = plot_confusion_matrix(result)
fig.show()