In [151]:
#Necessary import 
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import spacy
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from collections import Counter
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
import re
from concurrent.futures import ProcessPoolExecutor
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/bishi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/bishi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/bishi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [101]:
newsgroups_data = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))

In [102]:
df = pd.DataFrame({'text': newsgroups_data.data, 'target_id': newsgroups_data.target})
df['target_name'] = df['target_id'].apply(lambda x: newsgroups_data.target_names[x])

In [103]:
#Display the number of samples 
df.shape

(18846, 3)

In [104]:
#Show the 20 categories names
df['target_name'].unique()

array(['rec.sport.hockey', 'comp.sys.ibm.pc.hardware',
       'talk.politics.mideast', 'comp.sys.mac.hardware',
       'sci.electronics', 'talk.religion.misc', 'sci.crypt', 'sci.med',
       'alt.atheism', 'rec.motorcycles', 'rec.autos', 'comp.windows.x',
       'comp.graphics', 'sci.space', 'talk.politics.guns', 'misc.forsale',
       'rec.sport.baseball', 'talk.politics.misc',
       'comp.os.ms-windows.misc', 'soc.religion.christian'], dtype=object)

In [105]:
df.target_name.unique()

array(['rec.sport.hockey', 'comp.sys.ibm.pc.hardware',
       'talk.politics.mideast', 'comp.sys.mac.hardware',
       'sci.electronics', 'talk.religion.misc', 'sci.crypt', 'sci.med',
       'alt.atheism', 'rec.motorcycles', 'rec.autos', 'comp.windows.x',
       'comp.graphics', 'sci.space', 'talk.politics.guns', 'misc.forsale',
       'rec.sport.baseball', 'talk.politics.misc',
       'comp.os.ms-windows.misc', 'soc.religion.christian'], dtype=object)

In [106]:
#display the first 5 lines from the dataset
df.head()

Unnamed: 0,text,target_id,target_name
0,\n\nI am sure some bashers of Pens fans are pr...,10,rec.sport.hockey
1,My brother is in the market for a high-perform...,3,comp.sys.ibm.pc.hardware
2,\n\n\n\n\tFinally you said what you dream abou...,17,talk.politics.mideast
3,\nThink!\n\nIt's the SCSI card doing the DMA t...,3,comp.sys.ibm.pc.hardware
4,1) I have an old Jasmine drive which I cann...,4,comp.sys.mac.hardware


First proceeding

Data Preprocessing

In [107]:
X = df.text
y = df.target_id
X_train, y_train, X_test, y_test = train_test_split(X, y, test_size=0.3, train_size=0.7, random_state=48)

Lowercasing

In [108]:
df['text'] = df['text'].str.lower()

In [109]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

Removing punctuation, special characters

In [110]:
df['text'] = df['text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

Tokenization

In [111]:
df['tokens'] = df['text'].apply(nltk.word_tokenize)

Stop word removal

In [112]:
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(lambda tokens: [t for t in tokens if t not in stop_words])

Stemming/Lemmatization

In [113]:
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].apply(lambda tokens: [lemmatizer.lemmatize(t) for t in tokens])

In [114]:
df['text_clean'] = df['tokens'].apply(lambda tokens: ' '.join(tokens))

Vectorization

In [115]:
X_train, X_test, y_train, y_test = train_test_split(df['text_clean'], df['target_id'], test_size=0.3, random_state=42, stratify=df['target_id'])

In [116]:
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train) 
X_test = vectorizer.transform(X_test) 

Training

In [117]:
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

Evaluation

In [118]:
print(classification_report(y_test, y_pred, target_names=newsgroups_data.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.69      0.37      0.48       240
           comp.graphics       0.53      0.74      0.62       292
 comp.os.ms-windows.misc       0.94      0.22      0.36       296
comp.sys.ibm.pc.hardware       0.62      0.73      0.67       295
   comp.sys.mac.hardware       0.85      0.65      0.74       289
          comp.windows.x       0.62      0.85      0.72       296
            misc.forsale       0.88      0.65      0.75       293
               rec.autos       0.82      0.71      0.76       297
         rec.motorcycles       0.95      0.58      0.72       299
      rec.sport.baseball       0.93      0.74      0.82       298
        rec.sport.hockey       0.57      0.89      0.69       300
               sci.crypt       0.71      0.77      0.74       297
         sci.electronics       0.76      0.61      0.68       295
                 sci.med       0.87      0.82      0.84       297
         

We notice that with minimal text preprocessing, the model performed well on some categories and poorly on some others. 
We can inspect those categories more and try to do more preprocessing to clean the text and extract more insights from it. 

In [119]:
low_acc = ['alt.atheism','comp.graphics', 'comp.sys.ibm.pc.hardware', 'comp.windows.x', 'rec.autos', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc']
high_acc = ['comp.os.ms-windows.misc', 'comp.sys.mac.hardware', 'misc.forsale', 'rec.motorcycles', 'rec.sport.baseball', 'talk.religion.misc']

In [120]:
df_low = df[df['target_name'].isin(low_acc)]
print(df_low['text'].sample(10, random_state=42).values)

['keith\n\ni had a problem getting 256 colors i was stuck with 16 even though\nthe flexstuff said i was at 1024256  i solved it by entering\nthe advanced window on the flex program pannel and changing the\ncolor palette  sorry for the vaugeness i hope it helps some\n\nbtw i have a gw200066v and 1m ati gup'
 'and the lords servant must not quarrel instead he must be kind to everyone\nable to teach not resentful those who oppose him he must gently instruct in\nthe hope that god will grant them repentance leading them to a knowledge of the\ntruth and that they will come to their senses and escape from the trap of the\ndevil who has taken them captive to do his will \niitimothy 22426\n'
 '\nso you think a 93 mustang cobra can match the performance of a new z28\ninteresting belief \n\ncraig\n\nwho neither owns nor wants to own any gm or ford product'
 '\n\n\n\nthe fact that she was wearing a miniskirt with no underwear was\npresented as evidence that she was a prostitute and the court\nappa

In [121]:
df_high = df[df['target_name'].isin(high_acc)]
print(df_high['text'].sample(10, random_state=42).values)

['\ni dont think ms has anything to brag about when it comes to following\ndpmi but then consistency is the hobgoblin etc i suppose'
 '\n\n'
 'hello\n\n\twho can tell me   where can i find the pd or shareware   \nwhich can capture windows 31s output of printer mananger\n\n\ti want to capture the output of hp laser jet iii\n\n\tthough the postscript can setup to print to filebut hp cant\n\n\ti try doss redirect programbut they cant work in windows 31\n\n\t\tthankx for any help\n\n\n internet address u7911093ccnctuedutw\n\n    english name erik wang\n    chinese name wang jyhshyang'
 'mitsbishi laptop mp 286l\n\n28612 1286 mhz switchable\n2m ram installed\nbacklit cga ext cga mga\n20m 35hh hdd144m 35 fdd\n2 com1 lpt ports\ncomplete manual set\nbuilt like a tank\nexcellent cosmetic cond\ndark gray\nused very lightly\n\nproblems\n1hdd stops working\n2lcd sometimes doesnt work ext cagmga works'
 '\n\n\n200 in glassboro new jersey  \n'
 '\nok here are some usefull applications and locations 

In [122]:
low_texts = ' '.join(df_low['text_clean'].values)
low_words = Counter(low_texts.split())
print(low_words.most_common(20))

[('one', 7277), ('would', 7048), ('x', 6130), ('people', 5066), ('1', 4911), ('dont', 4374), ('know', 4259), ('like', 4219), ('get', 4054), ('0', 3996), ('2', 3929), ('time', 3802), ('also', 3732), ('think', 3537), ('use', 3449), ('u', 3109), ('say', 2980), ('file', 2956), ('make', 2900), ('could', 2879)]


In [123]:
high_texts = ' '.join(df_high['text_clean'].values)
high_words = Counter(high_texts.split())
print(high_words.most_common(20))

[('maxaxaxaxaxaxaxaxaxaxaxaxaxaxax', 3307), ('one', 2296), ('1', 2104), ('would', 1957), ('get', 1576), ('like', 1541), ('2', 1540), ('dont', 1431), ('know', 1379), ('window', 1308), ('time', 1143), ('new', 1116), ('good', 1107), ('think', 1102), ('also', 1077), ('im', 1066), ('year', 1048), ('use', 1004), ('file', 987), ('problem', 986)]


We notice that words that frequently appear in both categories are not very informative about the actual category

We can do another analysis of the most frequent words in each category

In [124]:
cats = ['alt.atheism','comp.graphics', 'comp.sys.ibm.pc.hardware', 'comp.windows.x', 'rec.autos', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc','comp.os.ms-windows.misc', 'comp.sys.mac.hardware', 'misc.forsale', 'rec.motorcycles', 'rec.sport.baseball', 'talk.religion.misc']

for cat in cats: 
    df_ = df[df['target_name']== cat]
    texts = ' '.join(df_['text_clean'].values)
    words = Counter(texts.split())
    if cat in low_acc:
        acc = 'low_acc'
    else: 
        acc = 'high acc'
    print(f'the {acc} category: {cat}\n the most common words: {words.most_common(20)}')

the low_acc category: alt.atheism
 the most common words: [('one', 692), ('god', 692), ('would', 496), ('people', 495), ('dont', 435), ('say', 400), ('think', 374), ('atheist', 363), ('religion', 303), ('belief', 297), ('know', 297), ('make', 293), ('believe', 282), ('like', 273), ('argument', 273), ('thing', 272), ('time', 271), ('many', 268), ('even', 244), ('way', 241)]
the low_acc category: comp.graphics
 the most common words: [('image', 1599), ('file', 1038), ('jpeg', 670), ('format', 627), ('program', 608), ('graphic', 592), ('also', 499), ('available', 454), ('system', 443), ('software', 443), ('data', 439), ('would', 438), ('use', 437), ('color', 416), ('one', 409), ('version', 393), ('get', 373), ('display', 358), ('like', 346), ('gif', 339)]
the low_acc category: comp.sys.ibm.pc.hardware
 the most common words: [('drive', 978), ('card', 532), ('one', 437), ('system', 419), ('would', 384), ('disk', 381), ('scsi', 372), ('controller', 355), ('problem', 352), ('use', 347), ('ge

Most of these categories have highly relevant keywords to the catogory's topics, which is a good sign, however there are some generic words that appear in every category and do not carry strong topical information, we can remove them.

In [125]:
stop_words_ext = set(stopwords.words('english'))

extra_stopwords = ["one", "would", "people", "say", "get", "know", "like", "use", "think", "could", "also", "year", "make", "dont", "time", "good", "many", "way"]
stop_words_ext.update(extra_stopwords)

df['text_clean'] = df['text_clean'].apply(lambda text: ' '.join([w for w in text.split() if w not in stop_words_ext]))

We can go the extra mile and do more specific analysis by calculating the ratio of verbs and nouns in each category, this way we can note if certain categories have noticeably higher noun or verb ratios. 
We can also use these as extra features, but it's not possible with Naive Bayes classifier. 

In [127]:
nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])

def nouns_ratio(text):
    doc = nlp(text)
    total = len(doc)
    num_nv = sum(1 for token in doc if token.pos_ in ['VERB'])
    return num_nv / total if total > 0 else 0
def verbs_ratio(text):
    doc = nlp(text)
    total = len(doc)
    num_nv = sum(1 for token in doc if token.pos_ in ['VERB'])
    return num_nv / total if total > 0 else 0
df['nouns_ratio'] = df['text_clean'].apply(nouns_ratio)
df['verbs_ratio'] = df['text_clean'].apply(verbs_ratio)

In [128]:
noun_ratio_per_cat = df.groupby('target_name')['nouns_ratio'].mean().sort_values()
verb_ratio_per_cat = df.groupby('target_name')['verbs_ratio'].mean().sort_values()

print("Noun ratios by category:\n", noun_ratio_per_cat)
print("\nVerb ratios by category:\n", verb_ratio_per_cat)

Noun ratios by category:
 target_name
misc.forsale                0.136816
comp.sys.mac.hardware       0.171935
sci.electronics             0.172546
rec.sport.hockey            0.174089
comp.os.ms-windows.misc     0.174563
rec.sport.baseball          0.174691
comp.sys.ibm.pc.hardware    0.175830
sci.space                   0.175959
rec.autos                   0.178305
comp.graphics               0.178542
sci.crypt                   0.187456
sci.med                     0.188866
comp.windows.x              0.191285
talk.politics.misc          0.192612
rec.motorcycles             0.193686
talk.politics.mideast       0.198095
talk.religion.misc          0.202052
talk.politics.guns          0.203956
soc.religion.christian      0.207637
alt.atheism                 0.207963
Name: nouns_ratio, dtype: float64

Verb ratios by category:
 target_name
misc.forsale                0.136816
comp.sys.mac.hardware       0.171935
sci.electronics             0.172546
rec.sport.hockey            0.174089
c

We notice that these ratios are small, this means there are too many other words with different POS tags that are frequent in each category.
We can try to keep only three POS tags : Noun, Verbs and Adjectives, because they are more linguistically rich features. 

In [129]:
nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])
def filter_pos(text, pos_to_keep=['NOUN', 'VERB', 'ADJ']):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if token.pos_ in pos_to_keep]
    return ' '.join(tokens)

df['text_pos_filtered'] = df['text_clean'].apply(lambda text: filter_pos(text, ['NOUN', 'VERB', 'ADJ']))

In [138]:
X_train, X_test, y_train, y_test = train_test_split(df['text_clean'], df['target_id'], test_size=0.3, random_state=42, stratify=df['target_id'])

vectorizer_count = CountVectorizer()
X_train_count = vectorizer_count.fit_transform(X_train)
X_test_count = vectorizer_count.transform(X_test)

vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

In [139]:
# Using CountVectorizer features
clf_count = MultinomialNB()
clf_count.fit(X_train_count, y_train)
y_pred_count = clf_count.predict(X_test_count)


print("CountVectorizer Results with pos filtering:")
print(classification_report(y_test, y_pred_count, target_names=df['target_name'].unique()))

CountVectorizer Results with pos filtering:
                          precision    recall  f1-score   support

        rec.sport.hockey       0.67      0.38      0.48       240
comp.sys.ibm.pc.hardware       0.53      0.74      0.62       292
   talk.politics.mideast       0.90      0.22      0.35       296
   comp.sys.mac.hardware       0.62      0.73      0.67       295
         sci.electronics       0.83      0.66      0.74       289
      talk.religion.misc       0.61      0.85      0.71       296
               sci.crypt       0.87      0.65      0.75       293
                 sci.med       0.83      0.72      0.77       297
             alt.atheism       0.95      0.60      0.73       299
         rec.motorcycles       0.95      0.74      0.83       298
               rec.autos       0.57      0.90      0.70       300
          comp.windows.x       0.73      0.78      0.75       297
           comp.graphics       0.77      0.62      0.69       295
               sci.space       

In [140]:
# Using TfidfVectorizer features
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)


print("TfidfVectorizer Results with pos filtering:")
print(classification_report(y_test, y_pred_tfidf, target_names=df['target_name'].unique()))

TfidfVectorizer Results with pos filtering:
                          precision    recall  f1-score   support

        rec.sport.hockey       0.78      0.28      0.41       240
comp.sys.ibm.pc.hardware       0.70      0.68      0.69       292
   talk.politics.mideast       0.72      0.66      0.68       296
   comp.sys.mac.hardware       0.63      0.77      0.69       295
         sci.electronics       0.85      0.66      0.74       289
      talk.religion.misc       0.81      0.85      0.83       296
               sci.crypt       0.83      0.73      0.78       293
                 sci.med       0.81      0.76      0.79       297
             alt.atheism       0.89      0.70      0.78       299
         rec.motorcycles       0.93      0.80      0.86       298
               rec.autos       0.57      0.94      0.71       300
          comp.windows.x       0.68      0.85      0.75       297
           comp.graphics       0.80      0.71      0.75       295
               sci.space       

In [143]:
X_train, X_test, y_train, y_test = train_test_split(df['text_pos_filtered'], df['target_id'], test_size=0.3, random_state=42, stratify=df['target_id'])

vectorizer_count = CountVectorizer()
X_train_count = vectorizer_count.fit_transform(X_train)
X_test_count = vectorizer_count.transform(X_test)

vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(X_train)
X_test_tfidf = vectorizer_tfidf.transform(X_test)

In [144]:
# Using CountVectorizer features
clf_count = MultinomialNB()
clf_count.fit(X_train_count, y_train)
y_pred_count = clf_count.predict(X_test_count)


print("CountVectorizer Results with pos filtering:")
print(classification_report(y_test, y_pred_count, target_names=df['target_name'].unique()))

CountVectorizer Results with pos filtering:
                          precision    recall  f1-score   support

        rec.sport.hockey       0.59      0.38      0.46       240
comp.sys.ibm.pc.hardware       0.53      0.69      0.60       292
   talk.politics.mideast       0.71      0.25      0.37       296
   comp.sys.mac.hardware       0.57      0.63      0.60       295
         sci.electronics       0.75      0.56      0.64       289
      talk.religion.misc       0.54      0.83      0.65       296
               sci.crypt       0.80      0.59      0.68       293
                 sci.med       0.75      0.69      0.72       297
             alt.atheism       0.88      0.52      0.65       299
         rec.motorcycles       0.91      0.65      0.76       298
               rec.autos       0.53      0.89      0.67       300
          comp.windows.x       0.70      0.73      0.72       297
           comp.graphics       0.72      0.59      0.65       295
               sci.space       

In [None]:
# Using TfidfVectorizer features
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)


print("TfidfVectorizer Results with pos filtering:")
print(classification_report(y_test, y_pred_tfidf, target_names=df['target_name'].unique()))

TfidfVectorizer Results:
                          precision    recall  f1-score   support

        rec.sport.hockey       0.73      0.23      0.34       240
comp.sys.ibm.pc.hardware       0.70      0.68      0.69       292
   talk.politics.mideast       0.67      0.54      0.60       296
   comp.sys.mac.hardware       0.55      0.73      0.63       295
         sci.electronics       0.81      0.56      0.66       289
      talk.religion.misc       0.71      0.84      0.77       296
               sci.crypt       0.77      0.71      0.74       293
                 sci.med       0.78      0.72      0.75       297
             alt.atheism       0.84      0.66      0.74       299
         rec.motorcycles       0.89      0.71      0.79       298
               rec.autos       0.52      0.91      0.66       300
          comp.windows.x       0.70      0.79      0.74       297
           comp.graphics       0.74      0.65      0.69       295
               sci.space       0.87      0.78     

After POS-based filtering and TF-IDF vectorization, model accuracy remained 67%. Certain categories (like sci.space) saw strong F1 improvements, while others (soc.religion.christian) dropped dramatically indicating their reliance on non-content words for context. 
However with the additional preprocessing without the pos filtering step the accuracy seems to increase to 71% using TFidf vectorization.

So we're going to try use the POS filtering only to the categories that benefited from it, and apply other preprocessing to the other categories, and see if we can achieve better accuracy or is this pos filtering useless. 

And since the TFidf vectorization seems to perform better everytime we will proceeed with it for the rest of the time. 

In [176]:
# Categories that benefit from POS filtering
pos_filter_cats = [
    'alt.atheism', 'sci.space', 'talk.religion.misc', 'sci.crypt', 'sci.med'
]

# All other categories use standard cleaning
standard_cats = [cat for cat in df['target_name'].unique() if cat not in pos_filter_cats]

In [177]:
def custom_preprocess(row):
    if row['target_name'] in pos_filter_cats:
        doc = nlp(row['text_clean'])
        tokens = [token.lemma_ for token in doc if token.pos_ in ['NOUN', 'VERB', 'ADJ']]
        return ' '.join(tokens)
    else:
        return row['text_clean']

df['text_hybrid'] = df.apply(custom_preprocess, axis=1)

In [178]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text_hybrid'], df['target_id'], test_size=0.3, random_state=42, stratify=df['target_id']
)

vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [179]:
# Using TfidfVectorizer features
clf_tfidf = MultinomialNB()
clf_tfidf.fit(X_train_tfidf, y_train)
y_pred_tfidf = clf_tfidf.predict(X_test_tfidf)


print("TfidfVectorizer Results with pos filtering:")
print(classification_report(y_test, y_pred_tfidf, target_names=df['target_name'].unique()))

TfidfVectorizer Results with pos filtering:
                          precision    recall  f1-score   support

        rec.sport.hockey       0.57      0.66      0.61       240
comp.sys.ibm.pc.hardware       0.64      0.64      0.64       292
   talk.politics.mideast       0.62      0.60      0.61       296
   comp.sys.mac.hardware       0.60      0.68      0.64       295
         sci.electronics       0.77      0.64      0.70       289
      talk.religion.misc       0.73      0.82      0.78       296
               sci.crypt       0.81      0.75      0.78       293
                 sci.med       0.72      0.69      0.70       297
             alt.atheism       0.76      0.69      0.72       299
         rec.motorcycles       0.86      0.79      0.82       298
               rec.autos       0.53      0.91      0.67       300
          comp.windows.x       0.81      0.85      0.83       297
           comp.graphics       0.73      0.65      0.69       295
               sci.space       

No big improvement. but we'll keep it.

In [180]:
train_pred = clf_tfidf.predict(X_train_tfidf)
train_acc = accuracy_score(y_train, train_pred)
print("Training Accuracy:", train_acc)
test_acc = accuracy_score(y_test, y_pred_tfidf)
print("Test Accuracy:", test_acc)

Training Accuracy: 0.8108702243784112
Test Accuracy: 0.7175451008135834


A 10% gap is relatively good knowing that we have 20 classes and high-cardinality text data. But it still indicates fitting to specific training data noise or details.

Trying to increase the accuracy by reducing the vocabulary size (to reduce noise).

In [181]:
X_train, X_test, y_train, y_test = train_test_split(
    df['text_hybrid'], df['target_id'], test_size=0.3, random_state=42, stratify=df['target_id']
)

vectorizer = TfidfVectorizer(ngram_range=(1,4), max_features=3000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [183]:
clf = MultinomialNB()  
clf.fit(X_train_tfidf, y_train)

train_pred = clf.predict(X_train_tfidf)
train_acc = accuracy_score(y_train, train_pred)
print("Training Accuracy:", train_acc)

y_pred = clf.predict(X_test_tfidf)
test_acc = accuracy_score(y_test, y_pred)
print("Test Accuracy:", test_acc)

Training Accuracy: 0.7833535476046088
Test Accuracy: 0.6957905907322249


Using a hybrid preprocessing pipeline, TfidfVectorizer with n-grams up to 4, and max_features=3000, we got:
Training Accuracy: 78.3%
Test Accuracy: 69.6%
Overfitting Gap: around 8.7%

With optimized preprocessing including general preprocessing, selective POS filtering and n-gram TF-IDF vectorization, the Naive Bayes model reached 78% training and 70% test accuracy. This demonstrates that thoughtful feature engineering, even without complecated models can achieve robust performance and reduce overfitting in multiclass text classification. 
We notice that this approach achieved a good balance between accuracy, generalization, and computational efficiency.