# 20-newsgroups Text Classification using Word2Vec/GloVe and also applying machine learning models (classification models)

importing the necessary packages

In [70]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec

In [86]:
#taking a few topics from the dataset.
categories=['comp.graphics','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware','rec.autos','rec.motorcycles','sci.space']
#loading the categories from the dataset
newsgroups_data=fetch_20newsgroups(categories=categories)

preprocessing the dataset

In [87]:
#using the TfIdF vecotizer
vectorizer=TfidfVectorizer()
X=vectorizer.fit_transform(newsgroups_data.data)
y=newsgroups_data.target

In [88]:
#splitting the dataset
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

Training Word2Vec embeddings

In [89]:
sentences=[doc.split() for doc in newsgroups_data.data]
w2v_model=Word2Vec(sentences,window=10,min_count=1)

In [90]:
#generating the document embeddings using word2vec
def generate_doc_embeddings(docs):
    """to generate the document embeddings

    Args:
        docs ([string]): [contains the documents that are in the dataset ]
    """
    doc_embeddings = []
    if isinstance(docs, np.ndarray):
        for doc in docs:
            words = doc.split()
            embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
            if embeddings:
                doc_embeddings.append(np.mean(embeddings, axis=0))
    else:  # Handling sparse matrices
        for doc in docs:
            words = doc.toarray().tolist()[0]
            embeddings = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
            if embeddings:
                doc_embeddings.append(np.mean(embeddings, axis=0))
            else:
                doc_embeddings.append(np.zeros(100))  # If no words are present in Word2Vec vocabulary, use zero vector
    return np.array(doc_embeddings)

In [91]:
def perform_grid_search(classifier, param_grid, X_train, y_train, X_test):
    """summary for perform_grid_search

    Args:
        classifier: the classifiers being passed
        param_grid:
        X_train: contains the training datat
        y_train: contains the training target values
        X_test:contains the test data

    Returns:
        [type]: [description]
    """
    grid_search=GridSearchCV(classifier, param_grid)
    grid_search.fit(X_train, y_train)
    best_model=grid_search.best_estimator_
    y_pred=best_model.predict(X_test)
    return y_pred, grid_search.best_params_

In [92]:
X_train_w2v=generate_doc_embeddings(X_train)
X_test_w2v=generate_doc_embeddings(X_test)

In [93]:
print("X_train_w2v")
for x in range(len(X_train_w2v)):
    print(X_train_w2v[x])

X_train_w2v
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 

In [94]:
# Define the hyperparameter grids for each classifier
logreg_param_grid={'C': [0.1, 1, 10]}
svm_param_grid={'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
rf_param_grid={'n_estimators': [100, 200, 300]}

In [95]:
logreg_classifier=LogisticRegression()
y_pred_logreg, best_params_logreg=perform_grid_search(logreg_classifier, logreg_param_grid, X_train_w2v, y_train,X_test_w2v)

In [96]:
svm_classifier=SVC()
y_pred_svm,best_params_svm=perform_grid_search(svm_classifier,svm_param_grid,X_train_w2v,y_train,X_test_w2v)

In [97]:
rf_classifier=RandomForestClassifier()
y_pred_rf,best_params_rf=perform_grid_search(rf_classifier,rf_param_grid,X_train_w2v,y_train,X_test_w2v)

evaluating the models

In [98]:
print("Logistic Regression:")
print(classification_report(y_test, y_pred_logreg, target_names=newsgroups_data.target_names))
print("Best Parameters:", best_params_logreg)

Logistic Regression:
                          precision    recall  f1-score   support

           comp.graphics       0.00      0.00      0.00       109
comp.sys.ibm.pc.hardware       0.00      0.00      0.00       123
   comp.sys.mac.hardware       0.00      0.00      0.00       105
               rec.autos       0.00      0.00      0.00       118
         rec.motorcycles       0.17      1.00      0.29       118
               sci.space       0.00      0.00      0.00       135

                accuracy                           0.17       708
               macro avg       0.03      0.17      0.05       708
            weighted avg       0.03      0.17      0.05       708

Best Parameters: {'C': 0.1}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [99]:
print("Support Vector Machine:")
print(classification_report(y_test, y_pred_svm, target_names=newsgroups_data.target_names))
print("Best Parameters:", best_params_svm)

Support Vector Machine:
                          precision    recall  f1-score   support

           comp.graphics       0.00      0.00      0.00       109
comp.sys.ibm.pc.hardware       0.00      0.00      0.00       123
   comp.sys.mac.hardware       0.00      0.00      0.00       105
               rec.autos       0.00      0.00      0.00       118
         rec.motorcycles       0.17      1.00      0.29       118
               sci.space       0.00      0.00      0.00       135

                accuracy                           0.17       708
               macro avg       0.03      0.17      0.05       708
            weighted avg       0.03      0.17      0.05       708

Best Parameters: {'C': 0.1, 'kernel': 'linear'}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [100]:
print("Random Forest:")
print(classification_report(y_test, y_pred_rf, target_names=newsgroups_data.target_names))

Random Forest:
                          precision    recall  f1-score   support

           comp.graphics       0.00      0.00      0.00       109
comp.sys.ibm.pc.hardware       0.00      0.00      0.00       123
   comp.sys.mac.hardware       0.00      0.00      0.00       105
               rec.autos       0.00      0.00      0.00       118
         rec.motorcycles       0.17      1.00      0.29       118
               sci.space       0.00      0.00      0.00       135

                accuracy                           0.17       708
               macro avg       0.03      0.17      0.05       708
            weighted avg       0.03      0.17      0.05       708



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
