In [None]:
# Upload the CSV files when running in Google Colab. Outside Colab the
# `google.colab` package is not importable, so skip the upload instead of
# aborting the whole notebook; the files are then expected to already be
# in the working directory.
try:
    from google.colab import files
except ImportError:
    print("google.colab not available; skipping upload and using local files.")
else:
    # Bind the return value so it is not echoed as the cell's output.
    uploaded = files.upload()
    print("Uploaded:", list(uploaded))


## 1. INSTALL (only needed in Colab)

In [None]:
# Uncomment if running in Colab
!pip install sentence-transformers

## 2. IMPORTS

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

from sentence_transformers import SentenceTransformer

## 3. LOAD DATA

### Expected files in the same folder:
*   `training_data_lowercase.csv`
*   `testing_data_lowercase_nolabs.csv`

**Please ensure these files are uploaded to your Colab environment before running this cell.**

In [None]:
print("Loading data...")

# Column names, defined up front so the frame and the selections below share them.
TEXT_COL = "text"
LABEL_COL = "label"

# The training file is tab-separated and has no header row, so the columns
# are named here by hand: label first, then text.
df = pd.read_csv("training_data_lowercase.csv", header=None, sep="\t")
df.columns = [LABEL_COL, TEXT_COL]

print("Shape:", df.shape)
print(df.head())

# Inputs (text column) and targets (label column) used by the later cells.
X, y = df[TEXT_COL], df[LABEL_COL]

## 4. TRAIN / TEST SPLIT

In [None]:
print("\nSplitting dataset...")

# Hold out 20% of the rows for evaluation. The split is stratified on the
# labels and uses a fixed seed so it is the same on every run.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

## 5. MODEL 1 — TFIDF + LOGISTIC REGRESSION

In [None]:
print(
    "\n==============================",
    "MODEL 1: TFIDF + Logistic",
    "==============================",
    sep="\n",
)

# Fit the TF-IDF vocabulary (capped at 20,000 features) on the training
# split only, then reuse that fitted vectorizer for the held-out split.
vectorizer = TfidfVectorizer(max_features=20000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Logistic regression on the sparse TF-IDF features; fit() returns the
# estimator itself, so construction and training can be chained.
model1 = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Evaluate on the held-out split.
pred1 = model1.predict(X_test_vec)
acc1 = accuracy_score(y_test, pred1)

print("Accuracy:", acc1)
print(classification_report(y_test, pred1))

In [None]:
# Quick sanity check: classify one hand-written sentence with model 1.
text = ["This article is totally fake and shocking"]

# The sentence must go through the same fitted vectorizer used for training
# before it can be passed to the classifier.
pred = model1.predict(vectorizer.transform(text))

print(pred)


## 6. MODEL 2 — EMBEDDINGS + LOGISTIC REGRESSION

In [None]:
print(
    "\n==============================",
    "MODEL 2: Embeddings + Logistic",
    "==============================",
    sep="\n",
)

# Load the sentence-embedding model by name.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode the training split first, then the held-out split; each pandas
# Series is converted to a plain Python list before being passed to encode().
X_train_emb, X_test_emb = (
    embedder.encode(split.tolist()) for split in (X_train, X_test)
)

# Same classifier settings as model 1, trained on the embeddings instead.
model2 = LogisticRegression(max_iter=1000).fit(X_train_emb, y_train)

# Evaluate on the held-out split.
pred2 = model2.predict(X_test_emb)
acc2 = accuracy_score(y_test, pred2)

print("Accuracy:", acc2)
print(classification_report(y_test, pred2))

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# ===== CONFUSION MATRICES =====
# Use the classifier's own class list for both matrices. This pins the
# row/column order so the two panels are directly comparable, and makes the
# axes show the real label values (without display_labels the ticks are
# just 0..n_classes-1).
class_labels = model1.classes_

fig, ax = plt.subplots(1, 2, figsize=(10, 4))

cm1 = confusion_matrix(y_test, pred1, labels=class_labels)
cm2 = confusion_matrix(y_test, pred2, labels=class_labels)

for panel, panel_title, matrix in zip(ax, ("TF-IDF", "Embeddings"), (cm1, cm2)):
    ConfusionMatrixDisplay(matrix, display_labels=class_labels).plot(
        ax=panel, colorbar=False
    )
    panel.set_title(panel_title)

fig.tight_layout()
plt.show()


# ===== ACCURACY BAR PLOT =====
# Explicit figure/axes objects instead of the implicit pyplot state machine.
fig_acc, ax_acc = plt.subplots(figsize=(5, 4))
ax_acc.bar(["TF-IDF", "Embeddings"], [acc1, acc2])
ax_acc.set_ylim(0, 1)  # accuracy is a fraction, so show the full 0-1 range
ax_acc.set_ylabel("Accuracy")
ax_acc.set_title("Model Comparison")
plt.show()


## 7. FINAL COMPARISON

In [None]:
print("\n==============================")
print("FINAL RESULTS")
print("==============================")

print("TFIDF accuracy:      ", acc1)
print("Embeddings accuracy: ", acc2)

# Compare the two held-out accuracies. Equal scores are reported as a tie
# rather than silently declaring one of the models the winner.
if acc1 > acc2:
    print("Best model: TFIDF")
elif acc2 > acc1:
    print("Best model: Embeddings")
else:
    print("Best model: tie (TFIDF and Embeddings have equal accuracy)")


print("\nDone.")

In [None]:
from pathlib import Path
import joblib

# Create the folder that holds the saved artifacts (no error if it exists).
models_dir = Path("models")
models_dir.mkdir(parents=True, exist_ok=True)

# Save the fitted TF-IDF vectorizer together with its classifier: model1 was
# trained on the vectorizer's output, so new text cannot be scored after a
# reload without the same fitted vectorizer.
joblib.dump(vectorizer, models_dir / "tfidf_vectorizer.joblib")
joblib.dump(model1, models_dir / "tfidf_model.joblib")

# Save the embeddings classifier.
joblib.dump(model2, models_dir / "embeddings_model.joblib")

# Save the sentence embedder (needed to encode new text for model2).
# NOTE(review): joblib files are pickle-based; only load them from trusted
# sources, and expect them to be sensitive to library version changes.
joblib.dump(embedder, models_dir / "sentence_embedder.joblib")

print("All models saved successfully inside ./models/")
