### Logistic regression

In [None]:
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Training set: reviews labelled by Qwen.
train_data = pd.read_csv("data/label/qwen_labeled_combined_reviews.csv")

# Evaluation set: reviews labelled by hand.
test_data = pd.read_csv("data/label/hand_labeled_combined_reviews.csv")

# Single definition of the model inputs so the train and test selections
# cannot drift apart: one free-text column followed by five numeric columns.
FEATURE_COLUMNS = [
    "cleaned_review_text",
    "rating",
    "review_length",
    "sentiment",
    "all_caps_ratio",
    "relevancy_score",
]
LABEL_COLUMN = "label"

X_train = train_data[FEATURE_COLUMNS]
y_train = train_data[LABEL_COLUMN]

X_test = test_data[FEATURE_COLUMNS]
y_test = test_data[LABEL_COLUMN]

# Show only the shapes; echoing the four full objects floods the cell output.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

FileNotFoundError: [Errno 2] No such file or directory: 'data/label/labeled_combined_reviews.csv'

In [None]:
# Turn the review text into TF-IDF features: unigrams and bigrams, English
# stop words removed, vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1,2))

# pd.read_csv loads empty text cells as NaN and TfidfVectorizer raises a
# ValueError on NaN documents, so treat missing text as an empty document.
train_text = X_train["cleaned_review_text"].fillna("").astype(str)
test_text = X_test["cleaned_review_text"].fillna("").astype(str)

# Learn the vocabulary and IDF weights from the training text only, then
# project the test text onto that same vocabulary.
X_train_tfidf = tfidf.fit_transform(train_text)
X_test_tfidf = tfidf.transform(test_text)

print(X_train_tfidf.shape)
print(X_test_tfidf.shape)

(14, 93)
(6, 93)


In [None]:
# Numeric features

# Columns taken as-is from the feature frames and stored as NumPy arrays.
# NOTE(review): these values are appended unscaled next to the TF-IDF weights;
# presumably review_length spans a much wider range -- consider standardising
# (any later inference cell would need the same transform). TODO confirm.
numeric_columns = ["rating", "review_length","sentiment", "all_caps_ratio","relevancy_score"]
X_train_numeric = X_train[numeric_columns].values
X_test_numeric = X_test[numeric_columns].values

# Final design matrices: TF-IDF columns first, numeric columns appended after.
X_train_final = hstack([X_train_tfidf, X_train_numeric])
X_test_final = hstack([X_test_tfidf, X_test_numeric])

for combined in (X_train_final, X_test_final):
    print(combined.shape)

(14, 96)
(6, 96)


In [None]:
# Fit the classifier on the combined training matrix, then score the test set.

# class_weight="balanced" reweights classes inversely to their frequency.
clf = LogisticRegression(class_weight="balanced", max_iter=1000)
clf.fit(X_train_final, y_train)

y_pred = clf.predict(X_test_final)

# Per-class precision / recall / F1 of the predictions against y_test.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

          ad       1.00      1.00      1.00         2
        rant       1.00      1.00      1.00         2
       valid       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



In [None]:
# Demo: run the fitted model on a labelled batch of reviews and compare each
# prediction with its true label.

# The assignment used to have no right-hand side, which is a SyntaxError.
# Default to the hand-labelled frame so the notebook runs top to bottom;
# replace it with any DataFrame that has the same feature columns and a
# "label" column to demo other reviews.
new_reviews = test_data.copy()

demo_numeric_columns = ["rating", "review_length","sentiment", "all_caps_ratio","relevancy_score"]
demo_required_columns = ["cleaned_review_text", "label", *demo_numeric_columns]
demo_missing_columns = [col for col in demo_required_columns if col not in new_reviews.columns]
if demo_missing_columns:
    raise KeyError(f"new_reviews is missing required columns: {demo_missing_columns}")

# Transform inputs with the already-fitted vectorizer (transform only, so the
# vocabulary learnt from the training text is reused). Missing text becomes an
# empty document because TfidfVectorizer rejects NaN.
new_tfidf = tfidf.transform(new_reviews["cleaned_review_text"].fillna("").astype(str))
new_numeric = new_reviews[demo_numeric_columns].values
new_final = hstack([new_tfidf, new_numeric])

# Predictions (one per row of new_reviews)
predictions = clf.predict(new_final)

# Compare true vs predicted. Show the raw "text" column when the frame has
# one, otherwise fall back to the cleaned text. Only the first rows are
# printed so a large frame does not flood the output; `predictions` still
# covers every row.
demo_display_column = "text" if "text" in new_reviews.columns else "cleaned_review_text"
demo_max_shown = 12
for review, true_label, pred in zip(
    new_reviews[demo_display_column].head(demo_max_shown),
    new_reviews["label"],
    predictions,
):
    print(f"Review: {review}\n   True: {true_label} --> Predicted: {pred}\n")


Review: Check out this new deal on electronics!
   True: ad --> Predicted: valid

Review: I love the ambience and staff here.
   True: valid --> Predicted: valid

Review: Horrible experience, I’m never coming back.
   True: rant --> Predicted: rant

Review: Flash sale! Get 70% off today only!
   True: ad --> Predicted: ad

Review: The pasta was fresh and very tasty.
   True: valid --> Predicted: valid

Review: Scam alert: win $1000 now by clicking here!
   True: ad --> Predicted: rant

Review: Such a cozy place for coffee and work.
   True: valid --> Predicted: ad

Review: Don’t miss this special offer at www.fakepromo.com
   True: ad --> Predicted: ad

Review: Friendly staff but the food was too salty.
   True: rant --> Predicted: rant

Review: Join our membership for exclusive deals!
   True: ad --> Predicted: ad

Review: Best burger I’ve had in a long time.
   True: valid --> Predicted: ad

Review: Terrible wait time, service was very slow.
   True: rant --> Predicted: rant



In [None]:
# Overall accuracy of the demo predictions: the fraction of rows in
# new_reviews whose predicted label equals the true label.
# (accuracy_score is imported with the other dependencies in the first cell.)
accuracy = accuracy_score(new_reviews["label"], predictions)
print("Accuracy:", accuracy)

Accuracy: 0.6666666666666666
