In [7]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adityanarayan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [8]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize


In [9]:
# Load the 20 Newsgroups dataset
categories = ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
              'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles',
              'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med',
              'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast',
              'talk.politics.misc', 'talk.religion.misc']
newsgroups = fetch_20newsgroups(subset='all', categories=categories)

In [10]:
# Preprocess the text
documents = newsgroups.data
labels = newsgroups.target

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(documents, labels, test_size=0.2, random_state=42)

In [12]:
# Tokenize the documents
tokenized_train = [word_tokenize(doc.lower()) for doc in X_train]
tokenized_test = [word_tokenize(doc.lower()) for doc in X_test]

In [14]:
# Train Word2Vec model
model = Word2Vec(sentences=tokenized_train, window=5, min_count=1)

In [15]:
# Prepare document vectors
train_vectors = [np.mean([model.wv[word] for word in doc], axis=0) for doc in tokenized_train]
test_vectors = [np.mean([model.wv[word] for word in doc if word in model.wv], axis=0) for doc in tokenized_test]

In [21]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

In [22]:
# Define a list of classifiers
classifiers = [
    RandomForestClassifier(),
    SVC(kernel='linear'),
    LogisticRegression(),
    XGBClassifier()
]

In [24]:
# Function to print classifier reports
def print_classifier_report(classifier):
    classifier.fit(train_vectors, y_train)
    predictions = classifier.predict(test_vectors)
    report = classification_report(y_test, predictions)
    print(f"Classifier: {classifier.__class__.__name__}")
    print(report)
    print("=" * 80)

In [26]:
print_classifier_report(classifiers[0])

Classifier: RandomForestClassifier
              precision    recall  f1-score   support

           0       0.34      0.42      0.38       151
           1       0.32      0.34      0.33       202
           2       0.37      0.42      0.40       195
           3       0.35      0.43      0.39       183
           4       0.41      0.31      0.35       205
           5       0.44      0.42      0.43       215
           6       0.63      0.71      0.67       193
           7       0.45      0.49      0.47       196
           8       0.36      0.47      0.41       168
           9       0.45      0.42      0.44       211
          10       0.57      0.69      0.62       198
          11       0.53      0.61      0.57       201
          12       0.40      0.30      0.34       202
          13       0.50      0.45      0.47       194
          14       0.49      0.43      0.46       189
          15       0.54      0.70      0.61       202
          16       0.43      0.38      0.41   

In [27]:
print_classifier_report(classifiers[1])

Classifier: SVC
              precision    recall  f1-score   support

           0       0.45      0.62      0.52       151
           1       0.43      0.46      0.45       202
           2       0.57      0.60      0.58       195
           3       0.42      0.49      0.45       183
           4       0.49      0.38      0.43       205
           5       0.62      0.59      0.60       215
           6       0.72      0.74      0.73       193
           7       0.56      0.59      0.57       196
           8       0.60      0.65      0.62       168
           9       0.61      0.65      0.63       211
          10       0.71      0.68      0.70       198
          11       0.73      0.70      0.72       201
          12       0.48      0.43      0.46       202
          13       0.66      0.64      0.65       194
          14       0.67      0.72      0.70       189
          15       0.65      0.81      0.72       202
          16       0.58      0.64      0.61       188
          1

In [28]:
print_classifier_report(classifiers[2])

Classifier: LogisticRegression
              precision    recall  f1-score   support

           0       0.40      0.46      0.43       151
           1       0.46      0.43      0.44       202
           2       0.52      0.59      0.56       195
           3       0.42      0.42      0.42       183
           4       0.51      0.36      0.42       205
           5       0.57      0.58      0.57       215
           6       0.72      0.70      0.71       193
           7       0.54      0.48      0.51       196
           8       0.52      0.67      0.58       168
           9       0.59      0.56      0.57       211
          10       0.63      0.69      0.66       198
          11       0.68      0.69      0.68       201
          12       0.49      0.43      0.46       202
          13       0.56      0.59      0.57       194
          14       0.57      0.71      0.63       189
          15       0.59      0.78      0.67       202
          16       0.54      0.55      0.54       

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [29]:
print_classifier_report(classifiers[3])

Classifier: XGBClassifier
              precision    recall  f1-score   support

           0       0.41      0.50      0.45       151
           1       0.32      0.34      0.33       202
           2       0.46      0.46      0.46       195
           3       0.35      0.42      0.38       183
           4       0.37      0.34      0.36       205
           5       0.56      0.48      0.52       215
           6       0.73      0.67      0.70       193
           7       0.50      0.51      0.50       196
           8       0.52      0.58      0.55       168
           9       0.56      0.49      0.52       211
          10       0.63      0.68      0.65       198
          11       0.69      0.64      0.66       201
          12       0.36      0.39      0.38       202
          13       0.55      0.59      0.57       194
          14       0.57      0.58      0.57       189
          15       0.60      0.70      0.65       202
          16       0.51      0.49      0.50       188
 