In [1]:
# Cell X: Create clean_content if not present
import spacy
import pandas as pd
import re

df = pd.read_csv('../data/tinder_reviews_randomized.csv')

# Load spaCy English model (download if needed: python -m spacy download en_core_web_sm)
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser"])

def clean_and_lemmatize(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = text.replace('!', ' _EXCLAMATION_ ').replace('?', ' _QUESTION_ ')
    text = re.sub(r'[^a-zA-Z0-9_\s]', '', text)
    doc = nlp(text)
    negations = {"no", "not", "nor", "never"}
    tokens = [
        token.lemma_ for token in doc
        if (token.is_alpha or token.text in ["_EXCLAMATION_", "_QUESTION_"])
        and (token.text not in nlp.Defaults.stop_words or token.text in negations)
    ]
    return ' '.join(tokens)

df['clean_content'] = df['content'].astype(str).apply(clean_and_lemmatize)
print(df['clean_content'])

0                                               click bait
1                            platinum user no like profile
2        appreciate guy blatantly disrespect unprofessi...
3        account review question swipe leave dozen bot ...
4                                             cool not pay
                               ...                        
49995                                       good bad bunch
49996                                        well date app
49997                                                 good
49998    barrel term push microtransaction invite tinde...
49999    keep get ban not wrong say ban tinder price no...
Name: clean_content, Length: 50000, dtype: object


In [2]:
# Cell 1: Imports and Data Loading
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# If not already present, create clean_content using your cleaning pipeline from EDA
# If already present, skip this step

# Cell 2: Label Creation (binary: 1=positive, 0=negative, drop neutral)
def label_sentiment(row):
    if row['score'] >= 4:
        return 1
    elif row['score'] <= 2:
        return 0
    else:
        return np.nan  # Neutral

df['sentiment'] = df.apply(label_sentiment, axis=1)
df = df.dropna(subset=['sentiment', 'clean_content'])

# Cell 3: Train/Test Split (stratified)
X = df['clean_content'].values
y = df['sentiment'].values.astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [3]:
print(df.columns)

Index(['reviewId', 'userName', 'userImage', 'content', 'score',
       'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent',
       'repliedAt', 'appVersion', 'star', 'clean_content', 'sentiment'],
      dtype='object')


In [5]:
# Cell 4: VADER Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

sia = SentimentIntensityAnalyzer()

def vader_predict(text):
    score = sia.polarity_scores(text)['compound']
    return 1 if score >= 0.05 else 0

y_pred_vader = [vader_predict(text) for text in X_test]

# Cell 5: Evaluation for VADER
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

print("VADER Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_vader))
print("Precision:", precision_score(y_test, y_pred_vader))
print("Recall:", recall_score(y_test, y_pred_vader))
print("F1:", f1_score(y_test, y_pred_vader))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_vader))
print(classification_report(y_test, y_pred_vader))

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/akshathm/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


VADER Results:
Accuracy: 0.7544166666666666
Precision: 0.7372183372183372
Recall: 0.7906666666666666
F1: 0.7630076397265783
Confusion Matrix:
 [[4309 1691]
 [1256 4744]]
              precision    recall  f1-score   support

           0       0.77      0.72      0.75      6000
           1       0.74      0.79      0.76      6000

    accuracy                           0.75     12000
   macro avg       0.76      0.75      0.75     12000
weighted avg       0.76      0.75      0.75     12000



In [6]:
# Cell 6: TF-IDF + Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

tfidf = TfidfVectorizer(ngram_range=(1,2), max_features=20000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

clf = LogisticRegression(max_iter=200, random_state=42)
clf.fit(X_train_tfidf, y_train)
y_pred_lr = clf.predict(X_test_tfidf)

# Cell 7: Evaluation for Logistic Regression
print("Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr))
print("Recall:", recall_score(y_test, y_pred_lr))
print("F1:", f1_score(y_test, y_pred_lr))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr))

Logistic Regression Results:
Accuracy: 0.8715833333333334
Precision: 0.8822218412480713
Recall: 0.8576666666666667
F1: 0.8697709794642102
Confusion Matrix:
 [[5313  687]
 [ 854 5146]]
              precision    recall  f1-score   support

           0       0.86      0.89      0.87      6000
           1       0.88      0.86      0.87      6000

    accuracy                           0.87     12000
   macro avg       0.87      0.87      0.87     12000
weighted avg       0.87      0.87      0.87     12000



In [11]:
import os
os.makedirs('../models', exist_ok=True)
import joblib
joblib.dump(clf, "../models/logreg_model.pkl")
joblib.dump(tfidf, "../models/tfidf_vectorizer.pkl")

['../models/tfidf_vectorizer.pkl']

In [12]:
!pip install "transformers[torch]"



In [13]:
# Cell 8: HuggingFace Transformers Setup
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Choose model: 'distilbert-base-uncased' or 'microsoft/deberta-v3-small'
MODEL_NAME = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(examples):
    return tokenizer(examples['text'], padding='max_length', truncation=True, max_length=128)

# Prepare HuggingFace Dataset
train_dataset = Dataset.from_dict({'text': X_train, 'label': y_train})
test_dataset = Dataset.from_dict({'text': X_test, 'label': y_test})

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Cell 9: Model Training
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir='./logs',
    learning_rate=2e-5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    report_to="none",
    fp16=True,  # Use mixed precision if GPU supports it
)

from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    return {
        'accuracy': (preds == labels).mean(),
        'f1': f1_score(labels, preds),
    }

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

# Cell 10: Evaluation for Transformer
results = trainer.evaluate()
print("Transformer Results ({}):".format(MODEL_NAME))
print(results)

Map:   0%|          | 0/28000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3211,0.314527,0.88175,0.880243
2,0.2873,0.323263,0.881917,0.882514


Transformer Results (distilbert-base-uncased):
{'eval_loss': 0.3232633173465729, 'eval_accuracy': 0.8819166666666667, 'eval_f1': 0.8825138877373352, 'eval_runtime': 515.3488, 'eval_samples_per_second': 23.285, 'eval_steps_per_second': 0.728, 'epoch': 2.0}


In [15]:
# Cell 11: Model Comparison Table
import pandas as pd

results_dict = {
    'Model': ['VADER', 'Logistic Regression', 'DeBERTa-v3-small'],
    'Accuracy': [
        accuracy_score(y_test, y_pred_vader),
        accuracy_score(y_test, y_pred_lr),
        results['eval_accuracy']
    ],
    'F1': [
        f1_score(y_test, y_pred_vader),
        f1_score(y_test, y_pred_lr),
        results['eval_f1']
    ]
}

results_df = pd.DataFrame(results_dict)
print(results_df)

                 Model  Accuracy        F1
0                VADER  0.754417  0.763008
1  Logistic Regression  0.871583  0.869771
2     DeBERTa-v3-small  0.881917  0.882514
