In [1]:
!pip install -q datasets nltk scikit-learn gensim

from datasets import load_dataset
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from gensim.models import KeyedVectors

import warnings

# Suppress every warning category so library chatter stays out of cell output.
warnings.filterwarnings("ignore")

# Fetch the NLTK corpora used by the stop-word filter and the lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

# Load the TREC dataset and grab its two splits.
# NOTE(review): trust_remote_code=True lets the dataset's loading script run
# locally - confirm this is acceptable for the environment.
dataset = load_dataset("trec", trust_remote_code=True)
train_data, test_data = dataset['train'], dataset['test']

# Read the question text and coarse label as whole columns instead of
# walking the splits row by row.
X_train, y_train = list(train_data['text']), list(train_data['coarse_label'])
X_test, y_test = list(test_data['text']), list(test_data['coarse_label'])


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aiklavya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aiklavya\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Shared NLP helpers, built once at module level and read by preprocess().
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
# Held as a set so the per-token membership test is a hash lookup.
stop_words = {*stopwords.words('english')}

def preprocess(texts, method='none', remove_stopwords=True):
    """Lower-case, filter and optionally normalize a collection of texts.

    Each text is lower-cased and split on whitespace. Tokens that are not
    purely alphabetic are dropped, and (by default) so are tokens found in
    the module-level ``stop_words`` set. The surviving tokens are then
    stemmed or lemmatized according to ``method`` and re-joined with single
    spaces.

    Args:
        texts: Iterable of strings to process.
        method: ``'none'`` (no normalization), ``'stem'`` (module-level
            ``stemmer``) or ``'lemma'`` (module-level ``lemmatizer``).
        remove_stopwords: When True (the default, matching the original
            behavior) tokens in ``stop_words`` are discarded. Pass False to
            keep them; NLTK's English stop list includes question words
            such as "what" and "who", which can matter for question
            classification.

    Returns:
        A list of processed strings, one per input text, in input order.

    Raises:
        ValueError: If ``method`` is not one of the supported values.
    """
    # Fail loudly on a misspelled method instead of silently skipping
    # normalization; this matches how vectorize() and classify() behave.
    if method not in ('none', 'stem', 'lemma'):
        raise ValueError("Unsupported preprocessing method")

    processed_texts = []
    for text in texts:
        # The stop-word lookup is short-circuited when filtering is disabled.
        words = [
            w for w in text.lower().split()
            if w.isalpha() and not (remove_stopwords and w in stop_words)
        ]
        if method == 'stem':
            words = [stemmer.stem(w) for w in words]
        elif method == 'lemma':
            words = [lemmatizer.lemmatize(w) for w in words]
        processed_texts.append(' '.join(words))
    return processed_texts


In [3]:
def vectorize(train, test, method='bow'):
    """Turn train/test texts into feature matrices.

    The vectorizer is fitted on ``train`` only and then applied to ``test``.

    Args:
        train: Training texts; used for both fitting and transforming.
        test: Test texts; transformed with the fitted vectorizer.
        method: ``'bow'`` for ``CountVectorizer`` or ``'tfidf'`` for
            ``TfidfVectorizer``.

    Returns:
        A ``(train_matrix, test_matrix)`` tuple.

    Raises:
        ValueError: If ``method`` is neither ``'bow'`` nor ``'tfidf'``.
    """
    # Reject unknown methods up front.
    if method not in ('bow', 'tfidf'):
        raise ValueError("Unsupported vectorization method")

    vectorizer = CountVectorizer() if method == 'bow' else TfidfVectorizer()

    train_matrix = vectorizer.fit_transform(train)
    test_matrix = vectorizer.transform(test)
    return train_matrix, test_matrix


In [4]:
def classify(X_train_vec, y_train, X_test_vec, y_test, model_type='logreg', random_state=42):
    """Fit one classifier and return its accuracy on the test split.

    Args:
        X_train_vec: Training feature matrix.
        y_train: Training labels.
        X_test_vec: Test feature matrix.
        y_test: Test labels used for scoring.
        model_type: ``'logreg'``, ``'dt'``, ``'rf'`` or ``'svm'``.
        random_state: Seed passed to the decision tree and random forest so
            that repeated runs give the same accuracy. Pass ``None`` to get
            the previous unseeded behavior.

    Returns:
        The accuracy of the fitted model's predictions on ``X_test_vec``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values.
    """
    if model_type == 'logreg':
        model = LogisticRegression(max_iter=1000)
    elif model_type == 'dt':
        # Seeded: an unseeded tree gives run-to-run differences in accuracy.
        model = DecisionTreeClassifier(random_state=random_state)
    elif model_type == 'rf':
        # Seeded for the same reason (bootstrap samples and feature subsets).
        model = RandomForestClassifier(random_state=random_state)
    elif model_type == 'svm':
        model = SVC()
    else:
        raise ValueError("Unsupported model type")

    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)
    acc = accuracy_score(y_test, preds)
    return acc


In [5]:
# Settings to sweep at each pipeline stage; every combination is evaluated.
preprocess_options = ['none', 'stem', 'lemma']
vectorizer_options = ['bow', 'tfidf']
classifier_options = ['logreg', 'dt', 'rf', 'svm']

# One record per (preprocessing, vectorizer, classifier) combination.
results = []

for prep in preprocess_options:
    # Preprocess both splits once per setting and reuse them below.
    X_train_prep, X_test_prep = (preprocess(split, prep) for split in (X_train, X_test))

    for vec in vectorizer_options:
        # Vectorize once per (prep, vec) pair; all classifiers share the matrices.
        X_train_vec, X_test_vec = vectorize(X_train_prep, X_test_prep, vec)

        for clf in classifier_options:
            acc = classify(X_train_vec, y_train, X_test_vec, y_test, clf)
            results.append(dict(Preprocessing=prep, Vectorizer=vec, Classifier=clf, Accuracy=acc))
            print(f'{prep} | {vec} | {clf} => Accuracy: {acc:.4f}')


none | bow | logreg => Accuracy: 0.7560
none | bow | dt => Accuracy: 0.7180
none | bow | rf => Accuracy: 0.7160
none | bow | svm => Accuracy: 0.7300
none | tfidf | logreg => Accuracy: 0.7600
none | tfidf | dt => Accuracy: 0.7240
none | tfidf | rf => Accuracy: 0.7340
none | tfidf | svm => Accuracy: 0.7320
stem | bow | logreg => Accuracy: 0.7520
stem | bow | dt => Accuracy: 0.7160
stem | bow | rf => Accuracy: 0.7260
stem | bow | svm => Accuracy: 0.7300
stem | tfidf | logreg => Accuracy: 0.7500
stem | tfidf | dt => Accuracy: 0.6880
stem | tfidf | rf => Accuracy: 0.7480
stem | tfidf | svm => Accuracy: 0.7280
lemma | bow | logreg => Accuracy: 0.7540
lemma | bow | dt => Accuracy: 0.7300
lemma | bow | rf => Accuracy: 0.7100
lemma | bow | svm => Accuracy: 0.7160
lemma | tfidf | logreg => Accuracy: 0.7480
lemma | tfidf | dt => Accuracy: 0.7260
lemma | tfidf | rf => Accuracy: 0.7420
lemma | tfidf | svm => Accuracy: 0.7260


In [12]:
# Tabulate every run and rank the configurations from most to least accurate.
df_results = pd.DataFrame(results).sort_values(by='Accuracy', ascending=False)
df_results


Unnamed: 0,Preprocessing,Vectorizer,Classifier,Accuracy
4,none,tfidf,logreg,0.76
0,none,bow,logreg,0.756
16,lemma,bow,logreg,0.754
8,stem,bow,logreg,0.752
12,stem,tfidf,logreg,0.75
14,stem,tfidf,rf,0.748
20,lemma,tfidf,logreg,0.748
22,lemma,tfidf,rf,0.742
6,none,tfidf,rf,0.734
7,none,tfidf,svm,0.732
