# 20-newsgroups Text Classification using Word2Vec/GloVe and also applying machine learning models (classification models)

importing the necessary packages

In [28]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
import nltk
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize

In [29]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
#taking a few topics from the dataset.
categories=['comp.graphics','comp.sys.ibm.pc.hardware','comp.sys.mac.hardware','rec.autos','rec.motorcycles','sci.space','talk.religion.misc']
#loading the categories from the dataset
newsgroups_data=fetch_20newsgroups(categories=categories)

preprocessing the dataset

In [31]:
docs=newsgroups_data.data
labels=newsgroups_data.target
target_names=newsgroups_data.target_names

In [32]:
X_train,X_test,y_train,y_test=train_test_split(docs,labels,test_size=0.35,random_state=42)

Training Word2Vec embeddings

In [33]:
X_train_tokenized=[word_tokenize(doc.lower()) for doc in X_train]
w2v_model=Word2Vec(sentences=X_train_tokenized,window=10,min_count=1)

In [34]:
X_test_tokenized=[word_tokenize(doc.lower()) for doc in X_test]

In [35]:
train_vecs=[np.mean([w2v_model.wv[word] for word in doc],axis=0)for doc in X_train_tokenized]
test_vecs=[np.mean([w2v_model.wv[word] for word in doc if word in w2v_model.wv],axis=0) for doc in X_test_tokenized]

In [36]:
def perform_grid_search(classifier, param_grid, X_train, y_train, X_test):
    """summary for perform_grid_search

    Args:
        classifier: the classifiers being passed
        param_grid:
        X_train: contains the training datat
        y_train: contains the training target values
        X_test:contains the test data

    Returns:
        [type]: [description]
    """
    grid_search=GridSearchCV(classifier, param_grid)
    grid_search.fit(X_train, y_train)
    best_model=grid_search.best_estimator_
    y_pred=best_model.predict(X_test)
    return y_pred, grid_search.best_params_

In [37]:
# Define the hyperparameter grids for each classifier
logreg_param_grid={'C':[0.1,1,5,10]}
svm_param_grid={'C':[0.1,1,5,10], 'kernel': ['linear', 'rbf','poly']}
rf_param_grid={'n_estimators':[100,200,300,400]}

In [38]:
logreg_classifier=LogisticRegression()
y_pred_logreg, best_params_logreg=perform_grid_search(logreg_classifier, logreg_param_grid, train_vecs, y_train,test_vecs)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [39]:
svm_classifier=SVC()
y_pred_svm,best_params_svm=perform_grid_search(svm_classifier,svm_param_grid,train_vecs,y_train,test_vecs)

In [40]:
rf_classifier=RandomForestClassifier()
y_pred_rf,best_params_rf=perform_grid_search(rf_classifier,rf_param_grid,train_vecs,y_train,test_vecs)

evaluating the models

In [41]:
print("Logistic Regression:")
print(classification_report(y_test, y_pred_logreg, target_names=target_names))
print("Best Parameters:", best_params_logreg)

Logistic Regression:
                          precision    recall  f1-score   support

           comp.graphics       0.43      0.47      0.45       187
comp.sys.ibm.pc.hardware       0.45      0.48      0.46       201
   comp.sys.mac.hardware       0.36      0.33      0.34       202
               rec.autos       0.46      0.46      0.46       215
         rec.motorcycles       0.56      0.48      0.52       221
               sci.space       0.58      0.57      0.57       217
      talk.religion.misc       0.56      0.71      0.63       127

                accuracy                           0.49      1370
               macro avg       0.49      0.50      0.49      1370
            weighted avg       0.49      0.49      0.48      1370

Best Parameters: {'C': 5}


In [42]:
print("Support Vector Machine:")
print(classification_report(y_test, y_pred_svm, target_names=target_names))
print("Best Parameters:", best_params_svm)

Support Vector Machine:
                          precision    recall  f1-score   support

           comp.graphics       0.41      0.56      0.47       187
comp.sys.ibm.pc.hardware       0.47      0.52      0.49       201
   comp.sys.mac.hardware       0.42      0.37      0.39       202
               rec.autos       0.50      0.52      0.51       215
         rec.motorcycles       0.60      0.47      0.53       221
               sci.space       0.64      0.59      0.61       217
      talk.religion.misc       0.66      0.65      0.66       127

                accuracy                           0.52      1370
               macro avg       0.53      0.52      0.52      1370
            weighted avg       0.53      0.52      0.52      1370

Best Parameters: {'C': 10, 'kernel': 'linear'}


In [45]:
print("Random Forest:")
print(classification_report(y_test, y_pred_rf, target_names=target_names))
print("Best Parameters: ",best_params_rf)

Random Forest:
                          precision    recall  f1-score   support

           comp.graphics       0.34      0.41      0.37       187
comp.sys.ibm.pc.hardware       0.44      0.49      0.46       201
   comp.sys.mac.hardware       0.36      0.38      0.37       202
               rec.autos       0.39      0.36      0.37       215
         rec.motorcycles       0.49      0.46      0.47       221
               sci.space       0.52      0.41      0.46       217
      talk.religion.misc       0.55      0.57      0.56       127

                accuracy                           0.43      1370
               macro avg       0.44      0.44      0.44      1370
            weighted avg       0.44      0.43      0.43      1370

Best Parameters:  {'n_estimators': 300}
