### Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack

# Insert Qwen labelled data
train_data = pd.read_csv("../data/with_features/qwen_labelled_combined_reviews_with_features.csv")

# Insert hand labelled data
test_data = pd.read_csv("../data/with_features/hand_labelled_combined_reviews_with_features.csv")

X_train= train_data[["review_text", "rating", "review_length","sentiment", "all_caps_ratio","relevancy_score"]]
y_train= train_data["label"]

X_test=test_data[["review_text", "rating", "review_length","sentiment", "all_caps_ratio","relevancy_score"]]
y_test=test_data["label"]

X_train, X_test, y_train, y_test

(                                   cleaned_review_text  rating  review_length  \
 0    andrea is amazing our dog loves her and she al...       5              9   
 1    andrea does a wonderful  job  with our wild pr...       5             11   
 2                                    never called back       1              1   
 3                          they dont answer the phones       3              3   
 4                   limited information on the website       3              3   
 ..                                                 ...     ...            ...   
 995                          brett williams is awesome       5              3   
 996  they have they the kinds of cars a teen will n...       5              9   
 997  i would not go back there for nothing the gent...       1             10   
 998                                     awesome prices       5              2   
 999                           not the best but alright       2              2   
 
     sentiment

In [None]:
# Turn words into numbers
tfidf = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1,2))

X_train_tfidf = tfidf.fit_transform(X_train["review_text"])
X_test_tfidf = tfidf.transform(X_test["review_text"])

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [9]:
# Numeric features 

# Store as numeric arrays
numeric_cols = ["rating", "review_length","sentiment", "all_caps_ratio","relevancy_score"]
X_train_numeric = X_train[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values
X_test_numeric  = X_test[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0).values

# Combine TF-IDF + numeric
X_train_final = hstack([X_train_tfidf, X_train_numeric])
X_test_final = hstack([X_test_tfidf, X_test_numeric])

print(X_train_final.shape)
print(X_test_final.shape)

(1000, 5005)
(200, 5005)


In [10]:
# Train model

clf = LogisticRegression(max_iter=1000, class_weight="balanced")
clf.fit(X_train_final, y_train)

y_pred = clf.predict(X_test_final)

print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

          Ad       0.20      0.20      0.20        10
  Irrelevant       0.10      0.71      0.18         7
        Rant       0.79      0.79      0.79        14
       Valid       0.94      0.70      0.81       169

    accuracy                           0.69       200
   macro avg       0.51      0.60      0.49       200
weighted avg       0.87      0.69      0.75       200



In [14]:
# Insert new reviews if we want to show how it works
new_reviews = pd.DataFrame([
    {
        "review_text": "This app is amazing! It helped me organize all my tasks efficiently.",
        "cleaned_review_text": "app amazing helped organize tasks efficiently",
        "rating": 5,
        "review_length": 10,
        "sentiment": 1,  # positive
        "all_caps_ratio": 0,
        "relevancy_score": 0.9,
        "label": 4  # Valid
    },
    {
        "review_text": "Buy the new SuperWidget now! Limited stock, get yours today!",
        "cleaned_review_text": "buy new superwidget limited stock get yours today",
        "rating": 5,
        "review_length": 10,
        "sentiment": 1,
        "all_caps_ratio": 0,
        "relevancy_score": 0.95,
        "label": 1  # Ad
    },
    {
        "review_text": "I waited 30 minutes and nobody helped me. Terrible service!",
        "cleaned_review_text": "waited 30 minutes nobody helped terrible service",
        "rating": 1,
        "review_length": 10,
        "sentiment": -1,
        "all_caps_ratio": 0,
        "relevancy_score": 0.8,
        "label": 3  # Rant
    },
    {
        "review_text": "I saw some random fact about dolphins today, nothing to do with this app.",
        "cleaned_review_text": "saw random fact dolphins today nothing do app",
        "rating": 3,
        "review_length": 12,
        "sentiment": 0,
        "all_caps_ratio": 0,
        "relevancy_score": 0.2,
        "label": 2  # Irrelevant
    },
])


# Transform inputs
new_tfidf = tfidf.transform(new_reviews["review_text"])
new_numeric = new_reviews[["rating", "review_length","sentiment", "all_caps_ratio","relevancy_score"]].values
new_final = hstack([new_tfidf, new_numeric])

# Predictions
predictions = clf.predict(new_final)

# Compare true vs predicted
for review, true_label, pred in zip(new_reviews["text"], new_reviews["label"], predictions):
    print(f"Review: {review}\n   True: {true_label} --> Predicted: {pred}\n")


NotFittedError: Vocabulary not fitted or provided

In [None]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(new_reviews["label"], predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6666666666666666
