In [19]:
# BBC News Classification Kaggle Mini-Project
## 1. Exploratory Data Analysis
# Analyze the dataset to understand category distribution, text length, and common words.

import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
import nltk
from nltk.tokenize import word_tokenize
import os

# Add local nltk_data to NLTK path.
# Guarded so that re-running this cell does not append the same directory again.
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Function to find datasets
def find_dataset(filename):
    """Return the first existing candidate path for ``filename``.

    Candidates are tried in this order, relative to the working directory:
    the ``data`` subfolder, the current directory, the ``datasets`` subfolder.

    Raises:
        FileNotFoundError: if none of the candidate paths exists.
    """
    candidates = [
        os.path.join('data', filename),      # data subfolder (primary)
        filename,                            # current directory
        os.path.join('datasets', filename),  # datasets subfolder
    ]
    located = next((p for p in candidates if os.path.exists(p)), None)
    if located is None:
        raise FileNotFoundError(f"Dataset '{filename}' not found in {os.getcwd()}. Checked paths: {candidates}. Please place the file in the 'data' folder.")
    return located

# Load the dataset
try:
    dataset_path = find_dataset('BBC News Train.csv')
    train_df = pd.read_csv(dataset_path)
except FileNotFoundError as e:
    print(e)
    print("Please ensure 'BBC News Train.csv' is in the 'data' folder.")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell, so it is not a reliable way to stop a notebook.
    # Re-raising halts the cell (and Run All) and keeps the traceback.
    raise

# Basic info
print('Dataset Info:')
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
train_df.info()
print('\nMissing Values:')
print(train_df.isnull().sum())

# Category distribution
print('\nCategory Distribution:')
category_counts = train_df['Category'].value_counts()
print(category_counts)
# Bar chart of articles per category, written to disk and then closed so the
# figure does not stay open in the pyplot state machine.
plt.figure(figsize=(8, 6))
plt.bar(category_counts.index, category_counts.values)
plt.title('Distribution of Article Categories')
plt.xlabel('Category')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.savefig('category_distribution.png')
plt.close()

# Article length analysis
try:
    # Number of word_tokenize tokens per article, stored as a new column.
    train_df['text_length'] = train_df['Text'].apply(lambda x: len(word_tokenize(x)))
    print('\nText Length Statistics:')
    print(train_df['text_length'].describe())
    plt.figure(figsize=(8, 6))
    plt.hist(train_df['text_length'], bins=50, edgecolor='black')
    plt.title('Distribution of Article Text Lengths')
    plt.xlabel('Number of Words')
    plt.ylabel('Frequency')
    plt.savefig('text_length_distribution.png')
    plt.close()
except LookupError as e:
    print(f'Error in text length analysis (NLTK punkt missing): {e}')
    print("Please ensure 'english.pickle' is in nltk_data/tokenizers/punkt/PY3/. Download from: https://github.com/nltk/nltk_data/raw/gh-pages/packages/tokenizers/punkt.zip")
    print("Verify folder structure with:")
    print("import os")
    print("print(os.listdir('nltk_data/tokenizers/punkt/PY3'))")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell. Re-raising stops the cell and keeps the traceback.
    raise

# Most common words
try:
    all_text = ' '.join(train_df['Text'].values)
    # Keep alphabetic tokens only so punctuation marks are not counted as
    # "words" (the same isalpha() filter the supervised preprocessing applies).
    tokens = [t for t in word_tokenize(all_text.lower()) if t.isalpha()]
    word_counts = Counter(tokens)
    common_words = word_counts.most_common(20)
    # Transpose [(word, count), ...] into parallel sequences for plotting.
    words, counts = zip(*common_words)
    plt.figure(figsize=(10, 6))
    plt.barh(words, counts)
    plt.title('Top 20 Most Common Words')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.savefig('common_words.png')
    plt.close()
except LookupError as e:
    print(f'Error in common words analysis (NLTK punkt missing): {e}')
    print("Please ensure 'english.pickle' is in nltk_data/tokenizers/punkt/PY3/. Download from: https://github.com/nltk/nltk_data/raw/gh-pages/packages/tokenizers/punkt.zip")
    print("Verify folder structure with:")
    print("import os")
    print("print(os.listdir('nltk_data/tokenizers/punkt/PY3'))")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell. Re-raising stops the cell and keeps the traceback.
    raise

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 23.4+ KB
None

Missing Values:
ArticleId    0
Text         0
Category     0
dtype: int64

Category Distribution:
Category
sport            346
business         336
politics         274
entertainment    273
tech             261
Name: count, dtype: int64

Text Length Statistics:
count    1490.000000
mean      406.687248
std       221.932994
min        95.000000
25%       266.000000
50%       356.500000
75%       494.000000
max      3496.000000
Name: text_length, dtype: float64


In [20]:
## 2. Supervised Learning
# Preprocess text, convert to TF-IDF features, and train Logistic Regression and Naive Bayes classifiers.

import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Add local nltk_data to NLTK path.
# Guarded so that re-running this cell does not append the same directory again.
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Text preprocessing
try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    print("Error: NLTK 'stopwords' not found. Download from: https://github.com/nltk/nltk_data/raw/gh-pages/packages/corpora/stopwords.zip and place in nltk_data/corpora/stopwords/")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell. Re-raising stops the cell and keeps the traceback.
    raise

try:
    lemmatizer = WordNetLemmatizer()
    # NLTK loads corpora lazily, so constructing the lemmatizer is not expected
    # to read 'wordnet'. Lemmatize one token here so a missing corpus is reported
    # by this handler instead of surfacing later inside preprocess_text.
    lemmatizer.lemmatize('articles')
except LookupError:
    print("Error: NLTK 'wordnet' not found. Download from: https://github.com/nltk/nltk_data/raw/gh-pages/packages/corpora/wordnet.zip and place in nltk_data/corpora/wordnet/")
    raise

def preprocess_text(text):
    """Normalise one article into a space-separated string of lemmas.

    Steps: lower-case, tokenize with ``word_tokenize``, keep alphabetic tokens
    that are not in the module-level ``stop_words`` set, lemmatize each token
    with the module-level ``lemmatizer``, then join with single spaces.

    Args:
        text: raw article text (a ``str``).

    Returns:
        The cleaned text as a single string.

    Raises:
        LookupError: re-raised after printing guidance when a required NLTK
            resource is missing.
    """
    try:
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
        return ' '.join(tokens)
    except LookupError as e:
        print(f'Error in text preprocessing (NLTK punkt missing): {e}')
        print("Please ensure 'english.pickle' is in nltk_data/tokenizers/punkt/PY3/")
        # Re-raise rather than calling exit(1): this function runs once per row
        # via Series.apply, and exit() targets the interpreter/kernel session
        # rather than the running cell.
        raise

# Apply preprocessing
try:
    train_df['processed_text'] = train_df['Text'].apply(preprocess_text)
except NameError:
    print("Error: 'train_df' not defined. Please run the EDA section first to load 'BBC News Train.csv'.")
    # Re-raise rather than calling exit(1). The traceback shows which name was
    # actually missing, since a NameError raised inside preprocess_text is
    # caught here too. exit() also targets the kernel session, not this cell.
    raise

# TF-IDF Vectorization
# NOTE(review): the vectorizer is fitted on every row before the split below, so
# the validation articles also contribute to the vocabulary and IDF weights.
# X (all rows) is reused by the topic-modelling section, so it is kept as is.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X = tfidf.fit_transform(train_df['processed_text'])
y = train_df['Category']

# Split data (fixed seed so the validation set is reproducible)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Logistic Regression
# `multi_class` is no longer passed: the parameter is deprecated in recent
# scikit-learn releases, and the default lbfgs solver already fits a multinomial
# model when the target has more than two classes.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, y_train)
lr_pred = lr_model.predict(X_val)
print('Logistic Regression Results:')
print(f'Accuracy: {accuracy_score(y_val, lr_pred):.4f}')
print(classification_report(y_val, lr_pred))

# Train Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_val)
print('\nNaive Bayes Results:')
print(f'Accuracy: {accuracy_score(y_val, nb_pred):.4f}')
print(classification_report(y_val, nb_pred))

# Save models and vectorizer
# Each object is pickled to its own file in the working directory; any failure
# is reported but does not stop the cell.
try:
    artifacts = (
        ('tfidf_vectorizer.pkl', tfidf),
        ('lr_model.pkl', lr_model),
        ('nb_model.pkl', nb_model),
    )
    for artifact_name, artifact in artifacts:
        with open(artifact_name, 'wb') as handle:
            pickle.dump(artifact, handle)
    print("Saved: tfidf_vectorizer.pkl, lr_model.pkl, nb_model.pkl")
except Exception as err:
    print(f"Error saving models: {err}")



Logistic Regression Results:
Accuracy: 0.9732
               precision    recall  f1-score   support

     business       0.97      0.97      0.97        75
entertainment       0.98      1.00      0.99        46
     politics       0.96      0.95      0.95        56
        sport       0.97      1.00      0.98        63
         tech       0.98      0.95      0.96        58

     accuracy                           0.97       298
    macro avg       0.97      0.97      0.97       298
 weighted avg       0.97      0.97      0.97       298


Naive Bayes Results:
Accuracy: 0.9765
               precision    recall  f1-score   support

     business       0.97      0.97      0.97        75
entertainment       1.00      1.00      1.00        46
     politics       0.95      0.95      0.95        56
        sport       0.98      1.00      0.99        63
         tech       0.98      0.97      0.97        58

     accuracy                           0.98       298
    macro avg       0.98      

In [21]:
## 3. Unsupervised Learning (Topic Modeling)
# Apply NMF to discover latent topics in the articles.

import numpy as np
from sklearn.decomposition import NMF

# Assume X and tfidf from Supervised Learning
try:
    # Apply NMF
    n_topics = 5  # Matches number of categories
    nmf = NMF(n_components=n_topics, random_state=42)
    W = nmf.fit_transform(X)  # Document-topic matrix
    H = nmf.components_      # Topic-term matrix

    # Get top words per topic.
    # argsort() is ascending, so each printed list runs from the 10th-largest
    # weight up to the largest one (the last word is the strongest term).
    feature_names = tfidf.get_feature_names_out()
    for topic_idx, topic in enumerate(H):
        top_words = [feature_names[i] for i in topic.argsort()[-10:]]
        print(f'Topic {topic_idx + 1}: {", ".join(top_words)}')

    # Assign dominant topic to each article (0-based index of the largest
    # weight in that article's row of W; the printout above is 1-based).
    train_df['dominant_topic'] = np.argmax(W, axis=1)
    print('\nArticles per Topic:')
    print(train_df['dominant_topic'].value_counts())

    # Save topic assignments
    try:
        train_df[['ArticleId', 'Category', 'dominant_topic']].to_csv('topic_assignments.csv', index=False)
        print("Saved: topic_assignments.csv")
    except Exception as e:
        print(f"Error saving topic assignments: {e}")
except NameError:
    print("Error: 'X', 'tfidf', or 'train_df' not defined. Please run the Supervised Learning section first.")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell. Re-raising stops the cell and shows which name
    # was actually missing.
    raise

Topic 1: director, nomination, actress, festival, star, oscar, actor, best, award, film
Topic 2: tax, mr blair, said, tory, brown, party, blair, election, labour, mr
Topic 3: gadget, digital, user, said, service, music, technology, people, phone, mobile
Topic 4: side, said, cup, ireland, wale, match, player, win, england, game
Topic 5: economic, price, sale, bank, year, market, rate, economy, said, growth

Articles per Topic:
dominant_topic
3    399
4    340
2    295
1    266
0    190
Name: count, dtype: int64
Saved: topic_assignments.csv


In [22]:
## 4. Kaggle Submission
# Generate predictions for the test set using the trained model.

import pandas as pd
import pickle
import os

# Add local nltk_data to NLTK path.
# This cell re-imports everything else it needs, so import nltk here as well
# instead of relying on an earlier cell having done it.
import nltk

# Guarded so that re-running this cell does not append the same directory again.
nltk_data_dir = os.path.join(os.getcwd(), 'nltk_data')
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)

# Function to find datasets
def find_dataset(filename):
    """Locate ``filename`` among the known dataset locations.

    Looks in ``data/``, then the current directory, then ``datasets/`` and
    returns the first path that exists.

    Raises:
        FileNotFoundError: if the file is in none of those locations.
    """
    possible_paths = [
        os.path.join('data', filename),
        filename,
        os.path.join('datasets', filename),
    ]
    existing = [candidate for candidate in possible_paths if os.path.exists(candidate)]
    if existing:
        return existing[0]
    raise FileNotFoundError(f"Dataset '{filename}' not found in {os.getcwd()}. Checked paths: {possible_paths}. Please place the file in the 'data' folder.")

# Load test data
try:
    test_dataset_path = find_dataset('BBC News Test.csv')
    test_df = pd.read_csv(test_dataset_path)
except FileNotFoundError as e:
    print(e)
    print("Please ensure 'BBC News Test.csv' is in the 'data' folder.")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell. Re-raising stops the cell and keeps the traceback.
    raise

# Preprocess test data (same steps as preprocess_text from Supervised Learning)
def _get_nltk_resources():
    """Return ``(stop_words, lemmatizer)``, building them on the first call only."""
    cached = getattr(_get_nltk_resources, '_cached', None)
    if cached is None:
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer
        cached = (set(stopwords.words('english')), WordNetLemmatizer())
        _get_nltk_resources._cached = cached
    return cached


def preprocess_text(text):
    """Normalise one article into a space-separated string of lemmas.

    Steps: lower-case, tokenize with ``word_tokenize``, keep alphabetic tokens
    that are not English stop words, lemmatize each token, join with spaces.

    The stop-word set and the lemmatizer are created once and reused. Building
    them inside this function would repeat that work for every row when it is
    used with ``Series.apply``.

    Args:
        text: raw article text (a ``str``).

    Returns:
        The cleaned text as a single string.

    Raises:
        LookupError: re-raised after printing guidance when a required NLTK
            resource is missing.
    """
    try:
        from nltk.tokenize import word_tokenize
        stop_words, lemmatizer = _get_nltk_resources()
        tokens = word_tokenize(text.lower())
        tokens = [t for t in tokens if t.isalpha() and t not in stop_words]
        tokens = [lemmatizer.lemmatize(t) for t in tokens]
        return ' '.join(tokens)
    except LookupError as e:
        print(f'Error in text preprocessing: {e}')
        print("Ensure 'punkt', 'stopwords', and 'wordnet' are in nltk_data/")
        # Re-raise rather than calling exit(1): exit() targets the
        # interpreter/kernel session rather than the running cell.
        raise

test_df['processed_text'] = test_df['Text'].apply(preprocess_text)

# Load vectorizer and model
# NOTE: pickle.load can execute arbitrary code from the file. These two files are
# the ones written by the Supervised Learning section; do not point this at
# pickles from an untrusted source.
try:
    with open('tfidf_vectorizer.pkl', 'rb') as f:
        tfidf = pickle.load(f)
    with open('lr_model.pkl', 'rb') as f:
        lr_model = pickle.load(f)
except FileNotFoundError as e:
    print(f'Error: Model or vectorizer file not found: {e}')
    print("Please run the Supervised Learning section to generate 'tfidf_vectorizer.pkl' and 'lr_model.pkl'.")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell. Re-raising stops the cell and keeps the traceback.
    raise

# Transform test data with the already-fitted vectorizer (transform, not fit)
X_test = tfidf.transform(test_df['processed_text'])

# Predict
predictions = lr_model.predict(X_test)

# Create submission file
# Two columns: the test ArticleId values and the predicted Category labels.
try:
    submission_columns = {
        'ArticleId': test_df['ArticleId'],
        'Category': predictions,
    }
    submission_df = pd.DataFrame(submission_columns)
    submission_df.to_csv('submission.csv', index=False)
    print('Submission file created: submission.csv')
except Exception as err:
    print(f"Error saving submission file: {err}")

Submission file created: submission.csv


In [23]:
## 5. Comparison of Supervised and Unsupervised Learning
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Load topic assignments
try:
    topic_df = pd.read_csv('topic_assignments.csv')
except FileNotFoundError:
    print("Error: 'topic_assignments.csv' not found. Please run Unsupervised Learning section.")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell. Re-raising stops the cell and keeps the traceback.
    raise

# Map NMF topics to categories (based on Section 3 output)
# The position in this list is the 0-based dominant_topic index. The mapping is
# hand-written from the printed topic words, so it must be revisited if the
# topics change.
topic_to_category = dict(enumerate([
    'entertainment',
    'politics',
    'tech',
    'sport',
    'business',
]))

# Assign predicted categories from NMF
nmf_labels = topic_df['dominant_topic'].map(topic_to_category)
topic_df['nmf_predicted_category'] = nmf_labels

# Calculate NMF accuracy: share of rows whose mapped label equals Category
is_match = topic_df['nmf_predicted_category'] == topic_df['Category']
nmf_accuracy = is_match.mean()
print(f'NMF Topic Modeling Accuracy: {nmf_accuracy:.4f}')

# Confusion matrix for NMF (rows: true category, columns: NMF-mapped category)
labels = ['business', 'entertainment', 'politics', 'sport', 'tech']
cm_nmf = confusion_matrix(
    topic_df['Category'],
    topic_df['nmf_predicted_category'],
    labels=labels,
)

# Print confusion matrix with category labels as comments
print('\nNMF Confusion Matrix:')
for label, row in zip(labels, cm_nmf):
    cells = ' '.join(f'{x:>3}' for x in row)
    print(f' [{cells}]  # {label.capitalize()}')

# Plot confusion matrix
plt.figure(figsize=(8, 6))
plt.imshow(cm_nmf, interpolation='nearest', cmap='Blues')
plt.title('NMF Confusion Matrix')
plt.colorbar()
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels, rotation=45)
plt.yticks(tick_marks, labels)
# Write each count into its cell; x is the column index and y the row index.
for i, matrix_row in enumerate(cm_nmf):
    for j, count in enumerate(matrix_row):
        plt.text(j, i, count, ha='center', va='center')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('nmf_confusion_matrix.png')
plt.close()

# Comparison analysis
# Only the NMF accuracy is filled in at run time. The supervised accuracies in
# the text are hard-coded and must be updated by hand if the models are retrained.
comparison_template = """
### Comparison Analysis
- **Supervised Learning (Logistic Regression, Naive Bayes)**:
  - **Accuracy**: Logistic Regression (97.32%), Naive Bayes (97.65%) on validation set.
  - **Strengths**: High accuracy, direct category prediction, robust to text variations.
  - **Weaknesses**: Requires labeled data, less interpretable feature importance.
- **Unsupervised Learning (NMF Topic Modeling)**:
  - **Accuracy**: ~{:.4f} (based on topic-to-category mapping).
  - **Strengths**: Discovers latent topics without labels, interpretable topic words (e.g., 'film' for entertainment).
  - **Weaknesses**: Lower accuracy, topic-to-category mapping is subjective, uneven topic distribution.
- **Key Differences**:
  - Supervised models excel in classification tasks with labeled data.
  - NMF is better for exploratory analysis, identifying themes without supervision.
- **Conclusion**: Supervised learning outperforms NMF for classification due to direct label prediction, but NMF provides valuable insights into article themes.
"""
print(comparison_template.format(nmf_accuracy))

NMF Topic Modeling Accuracy: 0.8906

NMF Confusion Matrix:
 [312   0  11   3  10]  # Business
 [  7 187   7  29  43]  # Entertainment
 [ 17   0 245   8   4]  # Politics
 [  0   1   0 345   0]  # Sport
 [  4   2   3  14 238]  # Tech

### Comparison Analysis
- **Supervised Learning (Logistic Regression, Naive Bayes)**:
  - **Accuracy**: Logistic Regression (97.32%), Naive Bayes (97.65%) on validation set.
  - **Strengths**: High accuracy, direct category prediction, robust to text variations.
  - **Weaknesses**: Requires labeled data, less interpretable feature importance.
- **Unsupervised Learning (NMF Topic Modeling)**:
  - **Accuracy**: ~0.8906 (based on topic-to-category mapping).
  - **Strengths**: Discovers latent topics without labels, interpretable topic words (e.g., 'film' for entertainment).
  - **Weaknesses**: Lower accuracy, topic-to-category mapping is subjective, uneven topic distribution.
- **Key Differences**:
  - Supervised models excel in classification tasks with label

In [24]:
## 6. Local Evaluation
# Compare submission.csv with BBC News Sample Solution.csv to estimate performance.

import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
import os

# Function to find datasets
def find_dataset(filename):
    """Return the path of ``filename`` from the first location where it exists.

    Search order: ``data/<filename>``, ``<filename>`` in the current directory,
    ``datasets/<filename>``.

    Raises:
        FileNotFoundError: if the file exists in none of the locations.
    """
    # None stands for "no subfolder", i.e. the bare filename.
    for folder in ('data', None, 'datasets'):
        candidate = filename if folder is None else os.path.join(folder, filename)
        if os.path.exists(candidate):
            return candidate
    raise FileNotFoundError(f"Dataset '{filename}' not found in {os.getcwd()}.")

# Load submission and sample solution
try:
    submission_df = pd.read_csv('submission.csv')
    sample_solution_path = find_dataset('BBC News Sample Solution.csv')
    sample_solution_df = pd.read_csv(sample_solution_path)
except FileNotFoundError as e:
    print(f'Error: {e}')
    print("Please ensure 'submission.csv' and 'BBC News Sample Solution.csv' are available.")
    # Re-raise rather than calling exit(1): exit() targets the interpreter/kernel
    # session, not this cell. Re-raising stops the cell and keeps the traceback.
    raise

# Merge and compare predictions
# NOTE(review): 'Category_true' is whatever the sample solution file contains.
# The recorded run scored ~0.19 with an identical support of 147 for every
# class, which is about chance level for five classes. This suggests the file
# holds placeholder labels rather than ground truth. Confirm before reading the
# numbers below as model performance.
merged_df = submission_df.merge(sample_solution_df, on='ArticleId', suffixes=('_pred', '_true'))

# The merge is an inner join, so ArticleIds missing from either file are dropped
# silently. Report a size change so the score is not computed on a subset
# without notice.
if len(merged_df) != len(submission_df):
    print(f'Warning: merged {len(merged_df)} rows from {len(submission_df)} submission rows; ArticleId values do not match one-to-one.')
if merged_df.empty:
    raise ValueError("No ArticleId values are shared between 'submission.csv' and 'BBC News Sample Solution.csv'.")

accuracy = (merged_df['Category_pred'] == merged_df['Category_true']).mean()
print(f'Local Evaluation Accuracy: {accuracy:.4f}')

# Detailed classification report
print('\nClassification Report:')
print(classification_report(merged_df['Category_true'], merged_df['Category_pred']))

# Save evaluation results
try:
    merged_df.to_csv('submission_evaluation.csv', index=False)
    print("Saved: submission_evaluation.csv")
except Exception as e:
    print(f"Error saving evaluation: {e}")

Local Evaluation Accuracy: 0.1918

Classification Report:
               precision    recall  f1-score   support

     business       0.20      0.24      0.22       147
entertainment       0.21      0.16      0.18       147
     politics       0.16      0.16      0.16       147
        sport       0.20      0.23      0.22       147
         tech       0.18      0.16      0.17       147

     accuracy                           0.19       735
    macro avg       0.19      0.19      0.19       735
 weighted avg       0.19      0.19      0.19       735

Saved: submission_evaluation.csv
