Logistic regression (review classification: valid / ad / rant)

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from scipy.sparse import hstack

# ----------------------------
# Example dataset with features
# ----------------------------

# One tuple per review: (text, length, sentiment, similarity, label).
# The numeric features are hand-entered next to each review, not computed here.
review_rows = [
    ("Great service and nice food", 5, 0.5, 0.3, "valid"),
    ("Use my promo code to get 50% off!", 9, 0.8, 0.9, "ad"),
    ("This place was okay, but noisy", 7, 0.1, 0.4, "valid"),
    ("Visit www.somespamlink.com for deals", 6, 0.7, 0.95, "ad"),
    ("Terrible customer service, waiting too long", 7, -0.9, 0.2, "rant"),
    ("Absolutely loved the desserts here", 6, 0.9, 0.4, "valid"),
    ("Click this link to win a free iPhone!", 9, 0.95, 0.98, "ad"),
    ("Staff was friendly and welcoming", 6, 0.7, 0.35, "valid"),
    ("Horrible food, won't come back again", 7, -0.8, 0.25, "rant"),
    ("Limited-time offer, buy now!", 7, 0.85, 0.9, "ad"),
    ("Perfect spot for a family dinner", 6, 0.8, 0.45, "valid"),
    ("Worst experience, the waiter was rude", 8, -0.7, 0.3, "rant"),
    ("Cheap sunglasses available at www.fakeads.com", 8, 0.9, 0.97, "ad"),
    ("The ambience was calm and relaxing", 7, 0.6, 0.5, "valid"),
    ("I waited for an hour, very disappointed", 8, -0.6, 0.2, "rant"),
    ("Special discounts for members only, sign up!", 9, 0.88, 0.93, "ad"),
    ("The pasta tasted homemade and fresh", 7, 0.9, 0.4, "valid"),
    ("Scam alert: don't miss this money hack", 9, 0.95, 0.99, "ad"),
    ("The portions were generous and filling", 7, 0.7, 0.5, "valid"),
    ("Food was cold and bland, waste of money", 8, -0.8, 0.2, "rant"),
]

# Column order matches the tuple layout above.
data = pd.DataFrame(
    review_rows,
    columns=["text", "length", "sentiment", "similarity", "label"],
)

# Hold out 30% of the rows for testing. Stratifying on the label asks the
# splitter to keep the valid / ad / rant mix similar in both partitions, and
# the fixed random_state makes the split repeatable.
feature_columns = ["text", "length", "sentiment", "similarity"]
features, labels = data[feature_columns], data["label"]

X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.3, random_state=42, stratify=labels
)

# Show the four partitions as the cell output.
(X_train, X_test, y_train, y_test)

(                                             text  length  sentiment  \
 2                  This place was okay, but noisy       7       0.10   
 7                Staff was friendly and welcoming       6       0.70   
 1               Use my promo code to get 50% off!       9       0.80   
 8            Horrible food, won't come back again       7      -0.80   
 10               Perfect spot for a family dinner       6       0.80   
 0                     Great service and nice food       5       0.50   
 6           Click this link to win a free iPhone!       9       0.95   
 13             The ambience was calm and relaxing       7       0.60   
 3            Visit www.somespamlink.com for deals       6       0.70   
 12  Cheap sunglasses available at www.fakeads.com       8       0.90   
 14        I waited for an hour, very disappointed       8      -0.60   
 15   Special discounts for members only, sign up!       9       0.88   
 18         The portions were generous and filling 

In [14]:
# ----------------------------
# Text features (TF-IDF)
# ----------------------------

# Vectorise the review text: unigrams and bigrams, English stop words removed,
# vocabulary capped at 5000 terms.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words="english",
    max_features=5000,
)

# Fit on the training text only, then reuse the fitted vectoriser for the
# test text so both matrices share the same columns.
X_train_tfidf = tfidf.fit_transform(X_train["text"])
X_test_tfidf = tfidf.transform(X_test["text"])

for tfidf_matrix in (X_train_tfidf, X_test_tfidf):
    print(tfidf_matrix.shape)

(14, 93)
(6, 93)


In [15]:
# ----------------------------
# Numeric features (already computed)
# ----------------------------

numeric_columns = ["length", "sentiment", "similarity"]

# Pull the numeric columns out of each partition as plain arrays
X_train_numeric = X_train[numeric_columns].to_numpy()
X_test_numeric = X_test[numeric_columns].to_numpy()

# Append the numeric columns to the right of the TF-IDF columns
X_train_final = hstack((X_train_tfidf, X_train_numeric))
X_test_final = hstack((X_test_tfidf, X_test_numeric))

print(X_train_final.shape, X_test_final.shape, sep="\n")

(14, 96)
(6, 96)


In [None]:
# ----------------------------
# Train model
# ----------------------------
# class_weight="balanced" asks scikit-learn to reweight classes by their
# frequency in y_train; max_iter raises the solver's iteration cap.
clf = LogisticRegression(class_weight="balanced", max_iter=1000).fit(
    X_train_final, y_train
)

# Score the held-out rows
y_pred = clf.predict(X_test_final)

report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

Classification Report:
              precision    recall  f1-score   support

          ad       1.00      1.00      1.00         2
        rant       1.00      1.00      1.00         2
       valid       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



In [19]:
# ----------------------------
# Try new reviews with precomputed features + true labels
# ----------------------------

# One tuple per unseen review: (text, length, sentiment, similarity, label).
new_review_rows = [
    ("Check out this new deal on electronics!", 7, 0.6, 0.95, "ad"),        # promo/marketing
    ("I love the ambience and staff here.", 6, 0.8, 0.3, "valid"),          # positive real review
    ("Horrible experience, I’m never coming back.", 7, -0.9, 0.2, "rant"),  # negative complaint
    ("Flash sale! Get 70% off today only!", 8, 0.85, 0.97, "ad"),
    ("The pasta was fresh and very tasty.", 7, 0.9, 0.4, "valid"),
    ("Scam alert: win $1000 now by clicking here!", 10, -0.95, 0.99, "ad"),
    ("Such a cozy place for coffee and work.", 8, 0.7, 0.35, "valid"),
    ("Don’t miss this special offer at www.fakepromo.com", 9, 0.88, 0.98, "ad"),
    ("Friendly staff but the food was too salty.", 9, -0.4, 0.3, "rant"),
    ("Join our membership for exclusive deals!", 8, 0.8, 0.96, "ad"),
    ("Best burger I’ve had in a long time.", 8, 0.95, 0.5, "valid"),
    ("Terrible wait time, service was very slow.", 9, -0.8, 0.25, "rant"),
]

# Column order matches the tuple layout above.
new_reviews = pd.DataFrame(
    new_review_rows,
    columns=["text", "length", "sentiment", "similarity", "label"],
)

# Build the feature matrix for the new reviews with the already-fitted
# vectoriser, then append the numeric columns in the same order as in training.
new_tfidf = tfidf.transform(new_reviews["text"])
new_numeric = new_reviews[["length", "sentiment", "similarity"]].to_numpy()
new_final = hstack((new_tfidf, new_numeric))

# Predicted label for each new review
predictions = clf.predict(new_final)

# Print each review with its true and predicted label side by side
labelled_rows = new_reviews[["text", "label"]].itertuples(index=False, name=None)
for (review, true_label), pred in zip(labelled_rows, predictions):
    print(f"Review: {review}\n   True: {true_label} --> Predicted: {pred}\n")


Review: Check out this new deal on electronics!
   True: ad --> Predicted: valid

Review: I love the ambience and staff here.
   True: valid --> Predicted: valid

Review: Horrible experience, I’m never coming back.
   True: rant --> Predicted: rant

Review: Flash sale! Get 70% off today only!
   True: ad --> Predicted: ad

Review: The pasta was fresh and very tasty.
   True: valid --> Predicted: valid

Review: Scam alert: win $1000 now by clicking here!
   True: ad --> Predicted: rant

Review: Such a cozy place for coffee and work.
   True: valid --> Predicted: ad

Review: Don’t miss this special offer at www.fakepromo.com
   True: ad --> Predicted: ad

Review: Friendly staff but the food was too salty.
   True: rant --> Predicted: rant

Review: Join our membership for exclusive deals!
   True: ad --> Predicted: ad

Review: Best burger I’ve had in a long time.
   True: valid --> Predicted: ad

Review: Terrible wait time, service was very slow.
   True: rant --> Predicted: rant

