In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid
import itertools

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Unsupervised Model - NMF

# Part 1: Data Loading 

Read the CSV files into Pandas DataFrames so we can explore and process the data.

In [None]:
train = pd.read_csv('/kaggle/input/learn-ai-bbc/BBC News Train.csv')
test = pd.read_csv('/kaggle/input/learn-ai-bbc/BBC News Test.csv')
solution = pd.read_csv('/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv')

# Part 2: Exploratory Data Analysis (EDA)

Understand what columns we have, the shape of the data, and what the labels look like. This helps guide all further steps like preprocessing and modeling.

**Step 2.1: Inspect the Data**

* train.shape shows (number of rows, number of columns) → how many samples and what features
* train.columns shows the names of the columns, so you know what to work with
* train.head() shows the first 5 rows of the training data

In [None]:
print("Shape of training data:", train.shape)
print("Columns in training data:", train.columns)
train.head()

**Step 2.2: Class Distribution Analysis**

Understand how many samples we have for each news category (label).
If some categories have much more data than others, this could affect how the model learns — this is called class imbalance.

* train['Category'] accesses the column of labels.
* .value_counts() counts how many times each unique label appears.

In [None]:
train['Category'].value_counts()

**Step 2.3: Visualize the class distribution**

A bar chart gives you an immediate visual of whether some categories dominate.
If one category appears way more often than others, your model might become biased toward it.



In [None]:
train['Category'].value_counts().plot(kind='bar', figsize=(8,5), title='Category Distribution')
plt.xlabel('Category')
plt.ylabel('Count')
plt.grid(axis='y')
plt.show()

# Part 3: Data Cleaning

Check if there are any duplicates and if they are duplicated across categories (i.e. having multiple categories) or just duplicated in a single category.

Then, remove the duplicates.

**Step 3.1: Find Pure Duplicates (Same Text + Same Category)**

Find how many rows are exact duplicates — same news text and same category.

* train.duplicated() returns a boolean Series: True for rows that are duplicates of previous rows.
* .sum() counts how many True values there are — total duplicate rows.

This helps us know if there are exact repeated examples that add no new info.

In [None]:
duplicate_texts = (train.duplicated(subset=["Text", "Category"], keep=False)).sum()
print(f"Total duplicate texts (same text + same category): {duplicate_texts}")

**Step 3.2: Remove Pure Duplicates**

If any were found, remove these pure duplicate rows from the DataFrame so each unique example only appears once.

In [None]:
train = train.drop_duplicates(subset=["Text", "Category"])
duplicate_texts = (train.duplicated(subset=["Text", "Category"], keep=False)).sum()
print(f"Total duplicate texts (same text + same category): {duplicate_texts}")

**Step 3.3: Find Conflict Duplicates (Same Text, Different Category)**

Identify news articles that appear more than once but with different categories, which might confuse the model.

In [None]:
duplicates_across_categories = train.groupby("Text")["Category"].nunique()
num_multi_category_texts = (duplicates_across_categories > 1).sum()
print(f"Total texts that appear in more than one category: {num_multi_category_texts}")

# Part 4: Feature Extraction & Modeling

**Step 4.1: Convert text data into numerical features**

Text cannot be directly used by machine learning models — we need to transform it into numbers.

We’ll start with a commonly used method: TF-IDF (Term Frequency–Inverse Document Frequency).

* TF tells how often a word appears in a document.

* IDF down-weights common words (like "the", "is", "and") that are less useful for classification.

It gives us a sparse numerical matrix, which we can feed into ML models.

parameters:
* stop_words='english' filters common english words
* max_df=0.95 sets the threshhold to if 95% of documents have this word, ignore it
* min_df sets the threshhold of if 2 or more rows have this word , ignore it

In [None]:
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
X_train = vectorizer.fit_transform(train['Text'])

**Step 4.2: Building and Training Models**



NMF is commonly used for topic modeling. It decomposes the document-word matrix into:

Document-topic matrix (how strongly each doc belongs to each topic)

Topic-word matrix (what words are important in each topic)

* NMF = Non-negative Matrix Factorization.

* n_components=5: tells NMF to find 5 topics/clusters. We choose 5 because the BBC dataset has 5 categories.

* .fit(X_train): learns the topics (hidden patterns) from the TF-IDF matrix.

In [None]:
model = NMF(n_components=5).fit(X_train)
y_pred_train = model.transform(X_train)
y_pred_train_as_int = y_pred_train.argmax(axis=1)

print(model)

In [None]:
def label_permute_compare(ytdf,yp,n=5):
    """
    ytdf: labels dataframe object
    yp: clustering label prediction output
    Returns permuted label order and accuracy. 
    Example output: (3, 4, 1, 2, 0), 0.74 
    """
    label_encoder = LabelEncoder()
    y_true = label_encoder.fit_transform(ytdf.values.ravel())
    
    yp = np.array(yp)
    
    best_acc = 0
    best_perm = None
    
    for perm in itertools.permutations(range(n)):
        mapped_yp = [perm[label] for label in yp]
        acc = accuracy_score(y_true, mapped_yp)
        if acc > best_acc:
            best_acc = acc
            best_perm = perm
    return best_perm, best_acc

Compare Predictions to True Labels

In [None]:
perm, acc = label_permute_compare(train[['Category']], y_pred_train_as_int, n=5)

print("Accuracy for the train data: ", acc)
print("Best permutation:", perm)

Display the Confusion Matrix for the train data 

In [None]:
label_encoder = LabelEncoder()
y_true_train = label_encoder.fit_transform(train['Category'].values.ravel())

pred_map = [perm[label] for label in y_pred_train_as_int]

cm = confusion_matrix(y_true_train, pred_map)
display = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
fig, ax = plt.subplots(figsize=(8, 6))
display.plot(xticks_rotation=45, cmap="Reds", ax=ax)
plt.show()

# Part 5: Use the Trained Model to Predict the Test Data

**Step 5.1: Transform the Test Data**

In [None]:
X_test = vectorizer.transform(test['Text'])

**Step 5.2: Use NMF Model to Predict Topics on Test Data**

In [None]:
y_pred_test = model.transform(X_test)

**Step 5.3: Pick the Most Likely Topic per Article**

In [None]:
y_pred_test_as_int = y_pred_test.argmax(axis=1)

**Step 5.4: Map NMF Topic Numbers to Actual Category Names**

In [None]:
mapped_preds_test = [perm[label] for label in y_pred_test_as_int]
predicted_labels_test = label_encoder.classes_[mapped_preds_test]

Create the Submission File

In [None]:
#submission = pd.DataFrame({
#    'ArticleId': test['ArticleId'],
#    'Category': predicted_labels_test
#})
#submission.to_csv('submission.csv', index=False)

Unsupervised Model - NMF accuracy score: 0.92380

# Part 6: Changing Hyperparameter

We can use ParameterGrid to explore different parameter combinations and identify those that yield the highest training accuracy. From the results, we’ll select the top 5 configurations and evaluate their performance on the test set by manually submitting the predictions to see if they lead to improved test accuracy.

In [None]:
results = []

param_grid = {
    'init': ['random', 'nndsvd'],
    'solver': ['cd', 'mu'],
    'beta_loss': ['frobenius', 'kullback-leibler'],
    'alpha': [0, 0.05, 0.1],
}

for params in ParameterGrid(param_grid):
    
    model_grid = NMF(n_components=5, init=params['init'], solver=params['solver'], alpha_W=params['alpha'], alpha_H=params['alpha'], max_iter=500).fit(X_train)
    y_pred_grid = model_grid.transform(X_train)
    y_pred_as_int_grid = y_pred_grid.argmax(axis=1)

    perm_grid, acc_grid = label_permute_compare(train[['Category']], y_pred_as_int_grid, n=5)

    results.append({
        'init': params['init'],
        'solver': params['solver'],
        'beta_loss': params['beta_loss'],
        'accuracy': acc_grid,
        'test accuracy': None,
        'perm': perm_grid,
        'model': model_grid
    })

results_df = pd.DataFrame(results)

# Display the table
results_df.sort_values(by='accuracy', ascending=False)

In [None]:
results_df.loc[2, 'test accuracy'] = 0.92230
results_df.loc[6, 'test accuracy'] = 0.92230
results_df.loc[5, 'test accuracy'] = 0.90384
results_df.loc[0, 'test accuracy'] = 0.91092
results_df.loc[1, 'test accuracy'] = 0.67029

results_df.sort_values(by='test accuracy', ascending=False).head(5)

Based on the table above, the best is nndsvd, with cd, alpha_H 0, and alpha_W 0 with a test score of 0.9223. 

The result is not better than the original setting accuract score 0.92380.

# Supervised Model - Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression


# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['Category'])

# Train logistic regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict on test data
y_test_pred = model.predict(X_test)
predicted_labels = label_encoder.inverse_transform(y_test_pred)

# Create submission file
# submission = pd.DataFrame({
#    'ArticleId': test['ArticleId'],
#    'Category': predicted_labels
#})

#submission.to_csv('logreg_submission.csv', index=False)

Supervised Model - Logistic Regression accuracy score: 0.98231

In [None]:
# os.remove("/kaggle/working/svm_submission.csv")

In [64]:
from sklearn.svm import SVC

# Encode labels
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train['Category'])

# Train SVM classifier
model = SVC(kernel='linear', probability=True)  
model.fit(X_train, y_train)

# Predict on test data
y_test_pred = model.predict(X_test)

# Convert numeric predictions back to original category labels
predicted_labels = label_encoder.inverse_transform(y_test_pred)

# Create submission DataFrame
#submission = pd.DataFrame({
#    'ArticleId': test['ArticleId'],
#    'Category': predicted_labels
#})

# Save to CSV
#submission.to_csv('svm_submission.csv', index=False)

Supervised Model - SVM classifier accuracy score: 0.98095

Finally, to evaluate how supervised learning algorithms perform when the training data is randomly split. We'll use train_test_split to divide the training set based on a specified fraction. Each model will be trained on this split subset and then used to make predictions on the full original training set to assess accuracy.

In [66]:
from sklearn.model_selection import train_test_split

for frac in [0.1, 0.2, 0.5, 0.9]:
    X_train_frac, _, y_train_frac, _ = train_test_split(X_train, y_train, train_size=frac)

    model_log = LogisticRegression(max_iter=1000)
    model_log.fit(X_train_frac, y_train_frac)
    y_pred_log = model_log.predict(X_train)
    perm_log, acc_log = label_permute_compare(train[['Category']], y_pred_log, n=5)
    print(f"LogReg ({int(frac*100)}% train): Accuracy = {acc_log:.3f}")

    model_svm = SVC()
    model_svm.fit(X_train_frac, y_train_frac)
    y_pred_svm = model_svm.predict(X_train)
    perm_svm, acc_svm = label_permute_compare(train[['Category']], y_pred_svm, n=5)
    print(f"SVM    ({int(frac*100)}% train): Accuracy = {acc_svm:.3f}")
    

LogReg (10% train): Accuracy = 0.777
SVM    (10% train): Accuracy = 0.679
LogReg (20% train): Accuracy = 0.894
SVM    (20% train): Accuracy = 0.860
LogReg (50% train): Accuracy = 0.978
SVM    (50% train): Accuracy = 0.974
LogReg (90% train): Accuracy = 0.994
SVM    (90% train): Accuracy = 0.996


We observe that using smaller subsets of data generally results in lower accuracy. However, SVM maintains consistently high accuracy across different data proportions. In contrast, Logistic Regression needs at least 20% of the data to achieve accuracy in the high 0.9x range. This suggests that supervised learning models, particularly SVM, have the potential to perform well even with reduced training data.

# Summary: Comparison between Unsupervised and Supervised model



**Ranking from highest accuracy score:**

1. Supervised Model - Logistic Regression accuracy score: 0.98231
2. Supervised Model - SVM classifier accuracy score: 0.98095
3. Unsupervised Model - NMF accuracy score: 0.92380

**Conclusion:**

The supervised models, Logistic Regression and SVM, both achieved very high accuracy scores above 98%, demonstrating their strong performance in classifying the labeled data. Logistic Regression slightly outperformed SVM by a small margin. In contrast, the unsupervised model, NMF, showed a lower accuracy around 92%, which is expected since it does not use label information during training. 

Overall, these results highlight the advantage of supervised learning methods when labeled data is available, while unsupervised methods like NMF can still provide useful, though less accurate, topic grouping without supervision.