In [None]:
import json
import pickle
import os
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Download the NLTK resources this notebook requests.
# NOTE(review): the code visible here tokenizes with RegexpTokenizer; confirm whether "punkt" is still needed.
nltk.download("punkt") # tokenizer models
nltk.download("stopwords") # word lists read through stopwords.words(...)

In [None]:
# preprocessing and labeling

# English stop-word list held as a set; preprocessing() does membership tests against it
stop_words = {*stopwords.words("english")}

# a token is either a whole <...> span or a run of ASCII letters
tokenizer = RegexpTokenizer(pattern=r"<[^>]+>|[A-Za-z]+")

def preprocessing(text):
    """Normalize a raw comment into a single space-separated string of tokens.

    Steps, in order: emojis are rewritten as text wrapped in ``<`` and ``>``,
    the result is lowercased (so "Happy" and "happy" become the same token),
    split with the module-level ``tokenizer``, and every token found in
    ``stop_words`` is dropped.

    Args:
        text: Raw comment text.

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = emoji.demojize(text, delimiters=("<", ">")).lower()
    kept = filter(lambda token: token not in stop_words, tokenizer.tokenize(lowered))
    return " ".join(kept)

# vader
analyzer = SentimentIntensityAnalyzer()

# labeling the comments as 1 for positive and 0 for negative
def labeling(text, threshold=0.0):
    """Turn a comment's VADER compound score into a binary sentiment label.

    Args:
        text: Raw comment text to score.
        threshold: Lowest compound score that still counts as positive. With the
            default of 0.0 a compound score of exactly 0 is labeled 1, which is
            the behaviour callers that pass only ``text`` have always had.
            VADER's README suggests 0.05 as the positive cut-off; pass that
            value to stop neutral comments from being labeled positive.

    Returns:
        1 if the compound score is >= ``threshold``, otherwise 0.
    """
    compound = analyzer.polarity_scores(text)["compound"]
    return 1 if compound >= threshold else 0

In [None]:
# convert the json file to dataframe: one JSON object per line, its "body"
# field is both preprocessed (model input) and labeled (training target)

INPUT_PATH = "filtered.jsonl"
texts, labels = [], []

with open(INPUT_PATH, "r", encoding="utf-8") as f:
    for entry in f:
        # Blank lines are not valid JSON and would make json.loads raise,
        # so skip them instead of aborting the whole load.
        if not entry.strip():
            continue
        comment = json.loads(entry)
        # .get("body", "") still yields None when the key is present with a
        # JSON null; treat a missing and a null body alike as an empty comment.
        unprocessed_text = comment.get("body") or ""
        texts.append(preprocessing(unprocessed_text))
        labels.append(labeling(unprocessed_text))

df = pd.DataFrame({"text": texts, "label": labels})

In [None]:
# 4. train/test splitting, then tf-idf vectorizing
y = df["label"]

# Split the preprocessed texts BEFORE fitting the vectorizer: fitting TF-IDF on
# the full corpus would let the test comments shape the vocabulary and IDF
# weights, which leaks test information into training.
text_train, text_test, y_train, y_test = train_test_split(
    df["text"], y, test_size=0.2, random_state=42, stratify=y
)

vectorizer = TfidfVectorizer()
# learn vocabulary/IDF from the training texts only ...
X_train = vectorizer.fit_transform(text_train)
# ... and reuse that fitted state, unchanged, for the test texts
X_test = vectorizer.transform(text_test)

In [None]:
# model training
# - random_state: the saga solver shuffles the data, so without a fixed seed
#   two runs of this cell can produce different coefficients.
# - max_iter: only an upper bound; 50 is below the library default and tends to
#   stop before convergence, so allow more iterations.
model = LogisticRegression(
    solver="saga",
    max_iter=1000,
    class_weight="balanced",
    random_state=42,
)
model.fit(X_train, y_train)

In [None]:
# evaluation: predict on the test split and print the per-class report
y_prediction = model.predict(X_test)
evaluation = classification_report(
    y_true=y_test,
    y_pred=y_prediction,
    digits=4,
)
# the report is a multi-line string, so print it rather than echoing its repr
print(evaluation)

In [None]:
# save the model and the fitted vectorizer as pickle files
OUTPUT_DIR = "models"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# file name -> fitted object, written in this order
artifacts = {
    "model_logistic_regression.pkl": model,
    "tfidf_vectorizer.pkl": vectorizer,
}
for filename, fitted_object in artifacts.items():
    with open(os.path.join(OUTPUT_DIR, filename), "wb") as handle:
        pickle.dump(fitted_object, handle)