## Collecting Data

In [1]:
# Load the BBC news dataset (downloaded from Kaggle) and take a first look at it
import pandas as pd

# The file is tab-separated even though it carries a .csv extension
df = pd.read_csv(filepath_or_buffer="bbc-news-data.csv", sep="\t")

# Show the first rows, then the column index
for preview in (df.head(), df.columns):
    print(preview)

  from pandas.core import (


   category filename                              title  \
0  business  001.txt  Ad sales boost Time Warner profit   
1  business  002.txt   Dollar gains on Greenspan speech   
2  business  003.txt  Yukos unit buyer faces loan claim   
3  business  004.txt  High fuel prices hit BA's profits   
4  business  005.txt  Pernod takeover talk lifts Domecq   

                                             content  
0   Quarterly profits at US media giant TimeWarne...  
1   The dollar has hit its highest level against ...  
2   The owners of embattled Russian oil giant Yuk...  
3   British Airways has blamed high fuel prices f...  
4   Shares in UK drinks and food firm Allied Dome...  
Index(['category', 'filename', 'title', 'content'], dtype='object')


In [2]:
# Keep only the two categories of interest: sport and politics
category = df["category"]
sports_df = df.loc[category == "sport"]
politics_df = df.loc[category == "politics"]

# Report how many articles each subset holds
for heading, subset in (
    ("Sports articles:", sports_df),
    ("Politics articles:", politics_df),
):
    print(heading, len(subset))


Sports articles: 511
Politics articles: 417


In [3]:
import os

sports_texts = sports_df["content"].tolist()
politics_texts = politics_df["content"].tolist()


def _write_one_article_per_line(path, articles):
    """Write each article to ``path`` as a single line of UTF-8 text.

    Parameters
    ----------
    path : str
        Destination file; its parent directory is created if missing.
    articles : iterable of str
        Article bodies to write, one per output line.

    Line breaks inside an article are replaced with spaces so that every
    line of the output file corresponds to exactly one article.
    """
    # Create the target directory up front; open(..., "w") does not create
    # missing directories and would raise FileNotFoundError otherwise.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, "w", encoding="utf-8") as f:
        for article in articles:
            # Flatten "\r\n" and bare "\r" as well as "\n": a text-mode read
            # treats all three as line endings, so any of them left in place
            # would split one article across several lines when read back.
            flat = (
                article.replace("\r\n", " ")
                .replace("\r", " ")
                .replace("\n", " ")
            )
            f.write(flat + "\n")


_write_one_article_per_line("data/sports.txt", sports_texts)
_write_one_article_per_line("data/politics.txt", politics_texts)

print("Files created successfully.")


Files created successfully.


In [4]:
# Load the per-class text files back into memory

sports_file = "data/sports.txt"
politics_file = "data/politics.txt"

# Label convention:
# 1 = Sports
# 0 = Politics

document_lines = []
labels = []

# Sports documents are read first, then politics; blank lines are skipped
for path, label in ((sports_file, 1), (politics_file, 0)):
    with open(path, "r", encoding="utf-8") as f:
        stripped = (raw.strip() for raw in f)
        kept = [text for text in stripped if text]
    document_lines.extend(kept)
    labels.extend([label] * len(kept))

print("Total lines:", len(document_lines))
print("Sports lines:", labels.count(1))
print("Politics lines:", labels.count(0))


Total lines: 928
Sports lines: 511
Politics lines: 417


In [5]:
#The dataset contains 511 sports articles and 417 politics articles, resulting in a mildly imbalanced distribution (~55% sports, ~45% politics). Stratified train-test splitting was used to preserve class proportions.

## Train-test split

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for testing. Stratifying on the labels keeps
# the class balance the same in both partitions; the fixed seed makes the
# split repeatable.
split = train_test_split(
    document_lines,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)
X_train, X_test, y_train, y_test = split

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))


Training samples: 742
Testing samples: 186


In [7]:
# TF-IDF representation was chosen as it provides better weighting of informative terms compared to raw frequency counts.

## Using TF-IDF feature representation

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram-only TF-IDF features
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))

# The vectorizer is fitted on the training documents only; the test documents
# are transformed with that already-fitted vocabulary.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Shape is (training samples, vocabulary size)
train_shape = X_train_tfidf.shape
print("TF-IDF feature shape:", train_shape)


TF-IDF feature shape: (742, 15219)


## Model 1: Naive Bayes

In [9]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Fit a multinomial Naive Bayes classifier on the unigram TF-IDF features
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

# Score it on the held-out test documents
y_pred_nb = nb_model.predict(X_test_tfidf)
nb_accuracy = accuracy_score(y_test, y_pred_nb)
nb_report = classification_report(y_test, y_pred_nb)

print("Naive Bayes Accuracy:", nb_accuracy)
print("\nClassification Report:\n")
print(nb_report)


Naive Bayes Accuracy: 0.9946236559139785

Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.99      0.99        84
           1       0.99      1.00      1.00       102

    accuracy                           0.99       186
   macro avg       1.00      0.99      0.99       186
weighted avg       0.99      0.99      0.99       186



## Model 2: Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression

# Fit logistic regression on the unigram TF-IDF features, with the solver's
# iteration cap set to 1000
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Score it on the held-out test documents
y_pred_lr = lr_model.predict(X_test_tfidf)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)

print("Logistic Regression Accuracy:", lr_accuracy)
print("\nClassification Report:\n")
print(lr_report)


Logistic Regression Accuracy: 0.9946236559139785

Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.99      0.99        84
           1       0.99      1.00      1.00       102

    accuracy                           0.99       186
   macro avg       1.00      0.99      0.99       186
weighted avg       0.99      0.99      0.99       186



## Model 3: Linear SVM

In [11]:
from sklearn.svm import LinearSVC

# Linear SVM on the unigram TF-IDF features.
# random_state is pinned (same seed as the train/test split) so the
# solver's data shuffling is repeatable and a Restart & Run All reproduces
# the same fitted model.
svm_model = LinearSVC(random_state=42)
svm_model.fit(X_train_tfidf, y_train)

# Evaluate on the held-out test documents
y_pred_svm = svm_model.predict(X_test_tfidf)

print("Linear SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_svm))


Linear SVM Accuracy: 1.0

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        84
           1       1.00      1.00      1.00       102

    accuracy                           1.00       186
   macro avg       1.00      1.00      1.00       186
weighted avg       1.00      1.00      1.00       186





## Model 4: Bi-grams + SVM 
(TF-IDF over unigrams + bigrams, instead of the unigram-only TF-IDF used above, for feature extraction)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over both single words (unigrams) and two-word sequences (bigrams)
tfidf_bigram = TfidfVectorizer(ngram_range=(1, 2))

# The vectorizer is fitted on the training documents only; the test documents
# are transformed with that already-fitted vocabulary.
X_train_bigram = tfidf_bigram.fit_transform(X_train)
X_test_bigram = tfidf_bigram.transform(X_test)

bigram_shape = X_train_bigram.shape
print("TF-IDF (1,2) feature shape:", bigram_shape)


TF-IDF (1,2) feature shape: (742, 142149)


In [13]:
# The vocabulary is much larger now because it also includes two-word phrases (bigrams)

In [14]:
# Training a linear SVM on the unigram + bigram TF-IDF features
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

# random_state is pinned (same seed as the train/test split) so the
# solver's data shuffling is repeatable and a Restart & Run All reproduces
# the same fitted model.
svm_bigram = LinearSVC(random_state=42)
svm_bigram.fit(X_train_bigram, y_train)

# Evaluate on the held-out test documents
y_pred_bigram = svm_bigram.predict(X_test_bigram)

print("Linear SVM (TF-IDF 1,2) Accuracy:", accuracy_score(y_test, y_pred_bigram))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred_bigram))


Linear SVM (TF-IDF 1,2) Accuracy: 1.0

Classification Report:

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        84
           1       1.00      1.00      1.00       102

    accuracy                           1.00       186
   macro avg       1.00      1.00      1.00       186
weighted avg       1.00      1.00      1.00       186



