# 20 Newsgroups text classification using Word2Vec/GloVe embeddings with machine learning classification models

Importing the necessary packages

In [3]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
import nltk
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Fetch NLTK's "punkt" resource, the tokenizer data used by word_tokenize below.
nltk.download("punkt")

In [None]:
# Restrict the experiment to a handful of newsgroup topics.
categories = [
    "comp.graphics",
    "comp.sys.ibm.pc.hardware",
    "comp.sys.mac.hardware",
    "rec.autos",
    "rec.motorcycles",
    "sci.space",
    "talk.religion.misc",
]
# Load only the posts that belong to the selected categories; every other
# fetch_20newsgroups option is left at its default.
newsgroups_data = fetch_20newsgroups(categories=categories)

Preprocessing the dataset

In [None]:
# Pull the documents, their target labels and the label names out of the
# loaded dataset in one step.
docs, labels, target_names = (
    newsgroups_data.data,
    newsgroups_data.target,
    newsgroups_data.target_names,
)

In [None]:
# Hold out 40% of the documents for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    docs,
    labels,
    test_size=0.4,
    random_state=42,
)

Training Word2Vec embeddings

In [None]:
# Lower-case and tokenize every training document.
X_train_tokenized = [word_tokenize(text.lower()) for text in X_train]
# Fit Word2Vec on the training tokens only. min_count=1 keeps every training
# token in the vocabulary; window=10 sets the context window size.
w2v_model = Word2Vec(
    sentences=X_train_tokenized,
    window=10,
    min_count=1,
)

In [None]:
# Apply the same lower-casing and tokenization to the held-out documents.
X_test_tokenized = [
    word_tokenize(text.lower())
    for text in X_test
]

In [None]:
def _mean_embedding(tokens, keyed_vectors):
    """Average the embeddings of the tokens that have a vector.

    Args:
        tokens: iterable of token strings for one document.
        keyed_vectors: the trained model's word vectors (``w2v_model.wv``).

    Returns:
        The element-wise mean of the known tokens' vectors, or an all-zero
        vector of length ``keyed_vectors.vector_size`` when none of the
        tokens is in the vocabulary (including an empty document).
    """
    known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
    if not known:
        # np.mean over an empty list yields a scalar NaN, which would give this
        # document a different shape from the others and feed NaN to the
        # classifiers; use a zero vector of the right size instead.
        return np.zeros(keyed_vectors.vector_size, dtype=keyed_vectors.vectors.dtype)
    return np.mean(known, axis=0)


# Represent each document as the mean of its word vectors. Train and test go
# through the same helper so both get identical out-of-vocabulary handling.
train_vecs = [_mean_embedding(doc, w2v_model.wv) for doc in X_train_tokenized]
test_vecs = [_mean_embedding(doc, w2v_model.wv) for doc in X_test_tokenized]

In [None]:
def perform_grid_search(classifier, param_grid, X_train, y_train, X_test):
    """Tune a classifier with a grid search and predict on the test set.

    Runs ``GridSearchCV`` (with its default settings) over ``param_grid``,
    then uses the resulting ``best_estimator_`` to label ``X_test``.

    Args:
        classifier: the estimator instance to tune.
        param_grid: mapping of parameter names to the candidate values to try.
        X_train: training feature vectors.
        y_train: training target values.
        X_test: test feature vectors to predict on.

    Returns:
        tuple: ``(y_pred, best_params)`` where ``y_pred`` holds the best
        estimator's predictions for ``X_test`` and ``best_params`` is the
        search's ``best_params_``.
    """
    # fit() returns the search object itself, so the call can be chained.
    search = GridSearchCV(classifier, param_grid).fit(X_train, y_train)
    predictions = search.best_estimator_.predict(X_test)
    return predictions, search.best_params_

In [None]:
# Candidate hyperparameter values explored by the grid search, one grid per
# classifier.
logreg_param_grid = dict(C=[0.1, 1, 5, 10])
svm_param_grid = dict(
    C=[0.1, 1, 5, 10],
    kernel=["linear", "rbf", "poly"],
)
rf_param_grid = dict(n_estimators=[100, 200, 300, 400])

In [None]:
# Tune logistic regression over its C grid and predict on the test embeddings.
logreg_classifier = LogisticRegression()
y_pred_logreg, best_params_logreg = perform_grid_search(
    classifier=logreg_classifier,
    param_grid=logreg_param_grid,
    X_train=train_vecs,
    y_train=y_train,
    X_test=test_vecs,
)

In [None]:
# Tune the SVM over its C/kernel grid and predict on the test embeddings.
svm_classifier = SVC()
y_pred_svm, best_params_svm = perform_grid_search(
    classifier=svm_classifier,
    param_grid=svm_param_grid,
    X_train=train_vecs,
    y_train=y_train,
    X_test=test_vecs,
)

In [None]:
# Tune the random forest over its n_estimators grid and predict on the test
# embeddings.
rf_classifier = RandomForestClassifier()
y_pred_rf, best_params_rf = perform_grid_search(
    classifier=rf_classifier,
    param_grid=rf_param_grid,
    X_train=train_vecs,
    y_train=y_train,
    X_test=test_vecs,
)

Evaluating the models

In [None]:
# Per-class precision/recall/F1 for the tuned logistic regression, followed by
# the hyperparameters the grid search selected.
print("Logistic Regression:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_logreg,
        target_names=target_names,
    )
)
print("Best Parameters:", best_params_logreg)

In [None]:
# Per-class precision/recall/F1 for the tuned SVM, followed by the
# hyperparameters the grid search selected.
print("Support Vector Machine:")
print(
    classification_report(
        y_true=y_test,
        y_pred=y_pred_svm,
        target_names=target_names,
    )
)
print("Best Parameters:", best_params_svm)

In [None]:
# Per-class precision/recall/F1 for the tuned random forest.
print("Random Forest:")
print(classification_report(y_test, y_pred_rf, target_names=target_names))
# Report the selected hyperparameters as well, matching the logistic
# regression and SVM cells; best_params_rf was computed but never shown.
print("Best Parameters:", best_params_rf)