# Medical Transcription Classification

Dataset Used - [Medical Transcriptions (Kaggle)](https://www.kaggle.com/datasets/tboyle10/medicaltranscriptions)

In [1]:
import re

import numpy as np
import spacy
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# One-time setup: uncomment the next line to download the spaCy model.
# !python -m spacy download en_core_web_md
# Shared spaCy pipeline; the cells below use it for token filtering,
# lemmatisation and document vectors.
nlp = spacy.load("en_core_web_md")

## Data Cleaning

In [None]:
# Load the raw dataset and preview the first rows.
df = pd.read_csv("../dataset/mtsamples.csv")
df.head()

In [None]:
# Report dataset size and the number of distinct label values
# (a missing label, if any, is counted as its own value).
print("Rows:", df.shape[0])
print("Unique Labels:", df["medical_specialty"].nunique(dropna=False))

In [None]:
# Merge the description and the transcription into one text field,
# strip surrounding whitespace from the labels, then drop the columns that
# are no longer needed (the first column is dropped by position).
df["description"] = df["description"] + " " + df["transcription"]
df["medical_specialty"] = df["medical_specialty"].str.strip()
df = df.drop(
    columns=[df.columns[0], "sample_name", "transcription", "keywords"],
)
df.head()

In [None]:
# Flag every row with at least one missing field, report how many there are,
# then drop those rows.
na_rows = df.isna().any(axis="columns")
print("Null values:", na_rows.sum())
df = df.dropna(axis="index", how="any")
print("Rows after removing null:", df.shape[0])

In [None]:
# Checkpoint the cleaned frame (without the index column).
df.to_csv("../dataset/cleaned.csv", index=False)

## Data preprocessing

In [None]:
def preprocess(text):
    """Return *text* reduced to space-separated lemmas.

    Every ".," sequence is replaced by a space, the result is run through
    the module-level spaCy pipeline ``nlp``, and tokens flagged as stop
    words, punctuation or number-like are discarded. The lemmas of the
    remaining tokens are joined with single spaces; runs of spaces are
    collapsed to one.
    """
    doc = nlp(text.replace(".,", " "))
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not token.is_stop and not token.is_punct and not token.like_num
    ]
    return re.sub(" +", " ", " ".join(kept_lemmas))

# Run preprocess() over every description, one row at a time.
df["description"] = df["description"].apply(preprocess)
df.head()

In [None]:
# Checkpoint the preprocessed frame so later sections can reload it.
df.to_csv("../dataset/preprocessed.csv", index=False)

## Feature Engineering

In [None]:
# Start feature engineering from the preprocessed checkpoint.
df = pd.read_csv("../dataset/preprocessed.csv")
df.head()

In [None]:
# Represent each preprocessed description by its spaCy document vector.
df["vectors"] = df["description"].apply(lambda text: nlp(text).vector)
df.head()

In [None]:
# Labels are numbered in order of first appearance. The same order is written
# to label_map.csv (one label per line) so ids can be mapped back to names.
label_map = df["medical_specialty"].unique()
np.savetxt("../dataset/label_map.csv", label_map, fmt="%s", delimiter=",")
label_map = label_map.tolist()

# Position of each label in label_map, looked up per row.
label_ids = {label: idx for idx, label in enumerate(label_map)}
df["labels"] = df["medical_specialty"].map(label_ids)
print(label_map)
df.head()

In [None]:
# Stack the per-row vectors into one array (one entry per row), take the
# labels as a flat array, and persist both for the training section.
X = np.stack(df["vectors"].tolist())
Y = df["labels"].to_numpy()
np.savetxt("../dataset/X.csv", X, delimiter=",")
np.savetxt("../dataset/Y.csv", Y, delimiter=",")

## Exploratory Data Analysis

In [None]:
# Reload the preprocessed data and the saved label names for the analysis.
df = pd.read_csv("../dataset/preprocessed.csv")
label_map = np.loadtxt("../dataset/label_map.csv", delimiter=",", dtype=str)
df.head()

In [None]:
# Checking class imbalance

# Number of rows per label, in label_map order.
label_counts = [int((df.medical_specialty == s).sum()) for s in label_map]
print(dict(zip(label_map, label_counts)))

plt.figure(figsize=(9, 9))
# `explode` needs exactly one offset per wedge, so size it from the data
# instead of hard-coding the number of classes.
plt.pie(
    label_counts,
    labels=label_map,
    explode=[0.1] * len(label_counts),
    autopct="%.1f%%",
)
plt.title("Class Distribution")
plt.show()

## Model Training

- The classes are imbalanced; check whether balancing them improves the results.

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [None]:
# Reload the saved features and labels. np.loadtxt's default dtype is float,
# so the label ids come back as floats.
X = np.loadtxt("../dataset/X.csv", delimiter=",")
Y = np.loadtxt("../dataset/Y.csv", delimiter=",")
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2385)
print("Training Size:", X_train.shape)
print("Test Size:", X_test.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same transform
# to both splits so any later evaluation on X_test sees features on the same
# scale the models were trained on.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Candidate estimators and the hyper-parameter grid searched for each one.
# Each entry: display name -> {"model": estimator, "params": grid}.
models = {
    "KNN": {
        "model": KNeighborsClassifier(),
        "params": {
            "n_neighbors": [5, 10, 20, 100],
            "weights": ["uniform", "distance"],
        },
    },
    "Random Forest": {
        "model": RandomForestClassifier(random_state=2806),
        "params": {
            "n_estimators": [10, 100],
            "max_depth": [100, 200, None],
        },
    },
    "SVM": {
        "model": SVC(random_state=3483),
        "params": {
            "C": [0.001, 0.01, 0.1, 1],
            "max_iter": [100, 500, 1000, 10000, -1],
        },
    },
}

In [None]:
def GridSearch(models):
    """Grid-search every candidate model and collect its best result.

    Parameters
    ----------
    models : dict
        Maps a display name to a dict with the keys "model" (the estimator)
        and "params" (the parameter grid handed to GridSearchCV).

    Returns
    -------
    dict
        Three parallel lists under the keys "model", "best score" and
        "best params", with one entry per model in iteration order.

    Notes
    -----
    Each search is fitted on the module-level ``X_train`` / ``Y_train``.
    """
    names, best_scores, best_params = [], [], []
    for name, spec in models.items():
        search = GridSearchCV(
            spec["model"], spec["params"], verbose=2, n_jobs=-1
        )
        search.fit(X_train, Y_train)
        names.append(name)
        best_scores.append(search.best_score_)
        best_params.append(search.best_params_)
    return {
        "model": names,
        "best score": best_scores,
        "best params": best_params,
    }

# Run the search for every candidate defined in `models`.
model_scores = GridSearch(models)
    

In [None]:
# Show the best score and parameters per model as a table.
pd.DataFrame(model_scores)