In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Load the train and dev splits. The code below reads the "text" column of
# both frames and the "label" column of the training frame.
train_df = pd.read_csv("/vol/bitbucket/lf524/nlp_cw/Data/train.csv")
dev_df = pd.read_csv("/vol/bitbucket/lf524/nlp_cw/Data/dev.csv")

# Ensure text columns are strings. Missing values are blanked first so that a
# row with no text becomes an empty document instead of being stringified to
# the literal word "nan" (which the vectorizer would count as a real token).
train_df["text"] = train_df["text"].fillna("").astype(str)
dev_df["text"] = dev_df["text"].fillna("").astype(str)

# Downsample negative instances in the training set: keep every positive
# example (label == 1) and at most two negatives (label == 0) per positive.
pcldf_train = train_df[train_df.label == 1]  # Positive class (1s)
npos_train = len(pcldf_train)

# NOTE(review): the negatives kept are the FIRST 2 * npos_train rows in file
# order, not a random draw. Confirm train.csv is not ordered in a way that
# biases this subset before relying on the resulting scores.
negdf_train = train_df[train_df.label == 0].head(npos_train * 2)

# Combine both classes, then shuffle with a fixed seed so the row order is
# reproducible across runs.
training_set = pd.concat([pcldf_train, negdf_train])
training_set = training_set.sample(frac=1, random_state=42).reset_index(drop=True)

# TF-IDF Vectorizer: unigrams + bigrams, English stop words removed, capped at
# 5000 features. The vocabulary is fitted on the training text only; the dev
# text is transformed with that same vocabulary.
vectorizer = TfidfVectorizer(max_features=5000, stop_words="english", ngram_range=(1, 2))
X_train = vectorizer.fit_transform(training_set["text"])
X_dev = vectorizer.transform(dev_df["text"])

# Logistic Regression Model, trained on the downsampled training set.
clf = LogisticRegression(max_iter=500)
clf.fit(X_train, training_set["label"])

# Make Predictions for every row of the dev set (same row order as dev_df).
y_pred = clf.predict(X_dev)

In [15]:
# Evaluate: F1 of the dev-set predictions against the labels stored in dev_df.
f1 = f1_score(y_true=dev_df["label"], y_pred=y_pred)
print(f"TF-IDF + Logistic Regression Dev F1 Score: {f1:.4f}")

# Attach the predictions to dev_df, then write the (par_id, predicted_label)
# pairs to disk as CSV text: header row included, index column omitted.
dev_df["predicted_label"] = y_pred
dev_df.to_csv(
    "task1_tfidf_lr.txt",
    columns=["par_id", "predicted_label"],
    index=False,
)
print("Predictions saved in task1_tfidf_lr.txt")

TF-IDF + Logistic Regression Dev F1 Score: 0.3189
Predictions saved in task1_tfidf_lr.txt
