In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# Enumerate the GPUs TensorFlow can see; an empty list means no GPU was detected.
gpus = tf.config.list_physical_devices(device_type="GPU")
print(gpus)


In [None]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# set_memory_growth is documented to raise RuntimeError once the runtime has been
# initialised and ValueError for an invalid device; both are reported without
# aborting the notebook, and anything unexpected is allowed to surface.
for gpu in gpus:
  try:
    tf.config.experimental.set_memory_growth(gpu, True)
  except (RuntimeError, ValueError) as e:
    print(f"could not enable memory growth on {gpu.name}:", e)

In [None]:
# Load the dataset once. The Colab path is preferred (it is the copy the notebook
# ended up using); the relative path is a fallback so the cell also runs outside
# Colab instead of requiring both files to exist and parsing the CSV twice.
try:
    df = pd.read_csv("/content/topics.csv")
except FileNotFoundError:
    df = pd.read_csv("topics.csv")
df.head(3)

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Missing-value count for every column.
df.isna().sum()

In [None]:
# Drop rows with a missing text or label BEFORE casting: astype(str) turns NaN into
# the literal string "nan", which would otherwise pass the empty-string filter below
# and end up as a bogus document / bogus class.
df = df.dropna(subset=["text", "label"]).copy()

# Normalise both columns to stripped strings.
df["text"] = df["text"].astype(str).str.strip()
df["label"] = df["label"].astype(str).str.strip()

# Discard rows that are empty after stripping.
df = df[(df["text"] != "") & (df["label"] != "")]

In [None]:
# Number of distinct raw labels (missing values are not counted).
print("labels unique:", df["label"].nunique(dropna=True))

In [None]:
# Bar chart of how many rows each raw label has.
plt.figure(figsize=(8, 3))
df["label"].value_counts().plot.bar()
plt.title("labels distribution")
plt.show()

In [None]:
# Keep only labels that occur more than 20 times.
count = df["label"].value_counts()
df = df.loc[df["label"].isin(count.loc[count.gt(20)].index)]

In [None]:
import re

In [None]:
# Matches http(s):// URLs, www.-prefixed hosts, and e-mail-like tokens (local@domain.tld).
URL_EMAIL_RE = re.compile(r"(https?://\S+|www\.\S+|\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.\w+)")
# Matches any run of one or more whitespace characters.
EXTRA_SPACE_RE = re.compile(r"\s+")

In [None]:
def clean_text(s: str) -> str:
  """Normalise one document before vectorisation.

  Lower-cases the text, replaces URLs and e-mail addresses with a space,
  collapses every whitespace run into a single space, and trims both ends.

  Args:
    s: Raw document text.

  Returns:
    The cleaned text. It may be empty when the input held nothing but
    URLs, e-mail addresses, or whitespace.
  """
  s = s.lower()
  s = URL_EMAIL_RE.sub(" ", s)
  # Removing a leading/trailing URL or address leaves a dangling space,
  # so trim after collapsing whitespace.
  return EXTRA_SPACE_RE.sub(" ", s).strip()

# Shape of df at this point; clean_text has only been defined here, not yet applied.
print("After cleaning:", df.shape)

In [None]:
# Run clean_text over every document, element by element.
df["text"] = df["text"].map(clean_text)

In [None]:
# Fine-grained newsgroup label -> coarse topic. Written grouped by topic and then
# flattened, so each topic name appears once; key order matches the grouping order.
mapping = {
    newsgroup: topic
    for topic, newsgroups in {
        "Sports": (
            "rec.sport.hockey",
            "rec.sport.baseball",
            "rec.motorcycles",
            "rec.autos",
        ),
        "Religion": (
            "soc.religion.christian",
            "talk.religion.misc",
            "alt.atheism",
        ),
        "Computer-Software": (
            "comp.os.ms-windows.misc",
            "comp.windows.x",
            "comp.graphics",
        ),
        "Computer-Hardware": (
            "comp.sys.ibm.pc.hardware",
            "comp.sys.mac.hardware",
        ),
        "Science": (
            "sci.crypt",
            "sci.med",
            "sci.space",
            "sci.electronics",
        ),
        "Politics": (
            "talk.politics.mideast",
            "talk.politics.guns",
            "talk.politics.misc",
        ),
        "Miscellaneous": (
            "misc.forsale",
        ),
    }.items()
    for newsgroup in newsgroups
}

In [None]:
# Translate each raw label to its coarse topic; labels absent from `mapping`
# fall back to "Uncategorized".
df["new_label"] = [mapping.get(raw_label, "Uncategorized") for raw_label in df["label"]]

# Independent two-column copy used for modelling.
result = df.loc[:, ["text", "new_label"]].copy()

In [None]:
# Preview the first five rows (head() defaults to n=5).
result.head()

In [None]:
# NOTE(review): this counts the raw "label" column, not the mapped "new_label" — confirm intent.
print("labels unique:", len(df["label"].value_counts()))

In [None]:
X = result["text"].values
y = result["new_label"].values

# 80 % train / 20 % hold-out, keeping the class proportions of y in both parts.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Split the hold-out in half into validation and test. This split is stratified
# as well, so both halves keep the same class mix as the training set (the
# original left this second split unstratified, unlike the first).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

In [None]:
# Validation features and labels, one shape per line.
print(X_val.shape, y_val.shape, sep="\n")

In [None]:
# TF-IDF over word 1- to 3-grams.
tfidf = TfidfVectorizer(
    ngram_range=(1, 3),    # unigrams, bigrams and trigrams
    min_df=2,              # ignore terms seen in fewer than two documents
    max_features=200_000,  # cap on vocabulary size
    sublinear_tf=True,     # use 1 + log(tf) instead of raw term frequency
    norm="l2",             # L2-normalise each document vector
)

In [None]:
# Learn vocabulary and IDF weights on the training texts only, then reuse the
# fitted vectoriser for the validation and test texts.
X_train_final = tfidf.fit_transform(X_train)
X_val_final, X_test_final = (tfidf.transform(texts) for texts in (X_val, X_test))

In [None]:
# Report the vectorised matrices (the original printed the raw text arrays,
# which only shows the number of documents, not the TF-IDF feature dimension).
print("TF-IDF shapes:", X_train_final.shape, X_val_final.shape, X_test_final.shape)


In [None]:
# Shape of the validation labels.
print(np.shape(y_val))

In [None]:
# Shape of the raw (un-vectorised) test texts.
print(np.shape(X_test))

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the labels of all three splits so every class gets an integer id.
le = LabelEncoder().fit([*y_train, *y_val, *y_test])

# Encode each split with the same fitted encoder.
y_train_encoded, y_val_encoded, y_test_encoded = (
    le.transform(labels) for labels in (y_train, y_val, y_test)
)

In [None]:
# Width of the TF-IDF matrix = number of input features for the network.
input_dim = X_train_final.shape[-1]


In [None]:
# Distribution strategy whose scope the model is built and compiled under.
strategy = tf.distribute.MirroredStrategy()

In [None]:
def build_ann(input_dim: int, num_classes: int) -> keras.Model:
    """Build and compile the feed-forward topic classifier.

    Architecture: Dense(512) -> Dropout(0.4) -> Dense(256) -> Dropout(0.3)
    -> Dense(128) -> softmax over `num_classes`.

    Args:
        input_dim: Number of input features per document.
        num_classes: Number of target classes (size of the softmax layer).

    Returns:
        A compiled `keras.Model` using Adam (lr=1e-3), sparse categorical
        cross-entropy (integer class ids as targets) and an accuracy metric.
    """
    inputs = keras.Input(shape=(input_dim,), name="tfidf_or_svd_input")
    x = layers.Dense(512, activation="relu")(inputs)
    x = layers.Dropout(0.4)(x)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation="relu")(x)
    # The output layer is pinned to float32. This matters if a mixed-precision
    # policy is ever enabled; none is set in the cells visible here.
    outputs = layers.Dense(num_classes, activation="softmax", dtype="float32")(x)

    model = keras.Model(inputs, outputs, name="topics_ann")
    opt = keras.optimizers.Adam(learning_rate=1e-3)
    model.compile(optimizer=opt,
                  loss="sparse_categorical_crossentropy",
                  metrics=["accuracy"])
    return model


# The model is created and compiled inside the strategy scope.
with strategy.scope():
    # Take the class count from the fitted label encoder instead of hard-coding 7:
    # the "Uncategorized" fallback or the rare-label filter can change how many
    # classes exist, and a mismatched softmax width makes the sparse
    # cross-entropy targets invalid.
    model = build_ann(input_dim, num_classes=len(le.classes_))

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:

# Train for 5 epochs with batches of 64, evaluating on the validation split after each epoch.
history = model.fit(
    x=X_train_final,
    y=y_train_encoded,
    batch_size=64,
    epochs=5,
    validation_data=(X_val_final, y_val_encoded),
    verbose=1,
)

In [None]:
from sklearn.metrics import classification_report ,accuracy_score

In [None]:
# Per-class probabilities for the test set, then the most probable class id per row.
y_pred = model.predict(X_test_final)
y_pred_class = y_pred.argmax(axis=1)

In [None]:
# classification_report returns a multi-line string; as a bare cell expression it
# is shown as a repr with escaped "\n", so print it. `labels` + `target_names`
# make the rows show the topic names from the fitted encoder instead of integer
# ids, and keep the two aligned even if a class is absent from the test split.
print(
    classification_report(
        y_test_encoded,
        y_pred_class,
        labels=np.arange(len(le.classes_)),
        target_names=le.classes_,
    )
)

In [None]:
# Overall fraction of test documents classified correctly.
accuracy_score(y_true=y_test_encoded, y_pred=y_pred_class)