# Importing all the necessary Libraries

In [None]:
from nltk.corpus import reuters 
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import spacy
import numpy as np

'''
  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
  https://spacy.io/usage/vectors-similarity
'''

'\n  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html\n  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n  https://spacy.io/usage/vectors-similarity\n'

In [None]:
# Fetch the two external resources this notebook needs: the NLTK Reuters
# corpus and the large English spaCy pipeline.
import nltk
import spacy.cli

nltk.download('reuters')
# Use spaCy's Python download helper instead of `!python -m spacy download`:
# a bare `python` on the shell PATH is not guaranteed to be the interpreter
# backing this kernel, in which case the model would be installed somewhere
# `spacy.load` cannot see.
spacy.cli.download("en_core_web_lg")

[nltk_data] Downloading package reuters to /root/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_lg')


In [None]:
# Download the NLTK stopwords package; `stopwords.words('english')` is read
# later when building `stop_words` for clean_text.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
# Download the NLTK 'punkt' package (tokenizer data); clean_text calls
# nltk's word_tokenize further down.
import nltk
nltk.download('punkt')
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Loading Data

In [None]:
# Shared label encoder: train_test_split() fits it on the training categories
# and reuses the same fitted instance to transform the test categories.
mlb = MultiLabelBinarizer()

In [None]:
def collection_stats():
  """Print headline counts for the NLTK Reuters corpus.

  Writes four lines to stdout: the total number of documents, the number
  whose file id starts with "train", the number whose file id starts with
  "test", and the number of categories. Returns None.
  """
  file_ids = reuters.fileids()
  print(len(file_ids), "documents")

  # The split is encoded in the file-id prefix.
  n_train = sum(1 for file_id in file_ids if file_id.startswith("train"))
  print(n_train, "total train documents")

  n_test = sum(1 for file_id in file_ids if file_id.startswith("test"))
  print(n_test, "total test documents")

  print(len(reuters.categories()), "categories")

In [None]:
# Print document / split / category counts for the corpus.
collection_stats()

10788 documents
7769 total train documents
3019 total test documents
90 categories


# Train Test Split of Data

In [None]:
def train_test_split():
  """Split the Reuters corpus along its built-in train/test file-id prefixes.

  Raw document text is used as the feature source, and each document's
  category list is encoded with the module-level `mlb`
  (MultiLabelBinarizer). `mlb` is fitted on the training categories only and
  then reused to transform the test categories.

  Note: this function shares its name with sklearn.model_selection's
  `train_test_split` but takes no arguments and performs no random split.

  Returns:
    A 4-tuple in this order:
      train_texts  -- list of raw text for documents whose id starts with "train"
      test_texts   -- list of raw text for documents whose id starts with "test"
      train_labels -- output of mlb.fit_transform for the training documents
      test_labels  -- output of mlb.transform for the test documents
  """
  documents = reuters.fileids()
  train_docs = [document for document in documents if document.startswith("train")]
  test_docs = [document for document in documents if document.startswith("test")]

  # Previously these locals were named x_train / y_train / x_test / y_test in
  # an order that did not match their contents (e.g. "y_train" held the test
  # texts). The returned order is unchanged; only the names are corrected.
  train_texts = [reuters.raw(doc_id) for doc_id in train_docs]
  test_texts = [reuters.raw(doc_id) for doc_id in test_docs]
  train_labels = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs])
  test_labels = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs])
  return train_texts, test_texts, train_labels, test_labels

In [None]:
# train_test_split() returns (train texts, test texts, train labels, test labels),
# which is the order unpacked here.
x_train, x_test, y_train, y_test = train_test_split()

# Data Preprocessing

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# English stop words as a set; clean_text tests each token for membership in it.
stop_words = set(stopwords.words('english'))

In [None]:
def clean_text(X_train):
    """Normalise a collection of raw documents into cleaned, space-joined text.

    For each document: strip URLs, replace every non-letter character with a
    space, lower-case, tokenize with nltk's `word_tokenize`, drop tokens found
    in the module-level `stop_words` set, and re-join the remaining tokens
    with single spaces.

    Args:
        X_train: iterable of document strings (despite the name, it is used
            for both the train and the test texts).

    Returns:
        A list of cleaned strings, one per input document, in input order.
    """
    ret = []
    for x_pre in X_train:
        # Match both http:// and https:// links.
        x_pre = re.sub(r'https?://\S+', ' ', str(x_pre))
        # Substitute a SPACE, not the empty string: deleting non-letters also
        # deleted the whitespace between words, gluing the whole document
        # into one token and making tokenisation / stop-word removal no-ops.
        x_pre = re.sub('[^a-zA-Z]', ' ', x_pre)
        x_pre = x_pre.lower()
        tokens = word_tokenize(x_pre)
        kept = [item for item in tokens if item not in stop_words]
        ret.append(' '.join(kept))
    return ret
 
  

In [None]:
# Build cleaned copies of the train and test texts.
# NOTE(review): the results are bound to capital-X names (X_train / X_test),
# while the embedding cells further down read the lower-case x_train / x_test.
# In the visible notebook these cleaned texts are therefore never used --
# confirm whether the embeddings were meant to be computed from them.
X_train=clean_text(x_train)
X_test =clean_text(x_test)

# Word2Vec Representation

In [None]:
# Load the spaCy pipeline downloaded earlier; get_word_vectors() reads the
# per-token `.vector` values it provides.
nlp = spacy.load("en_core_web_lg")


In [None]:
def get_word_vectors(sentence):
 """Embed a text as the element-wise SUM of its spaCy token vectors.

 Args:
  sentence: text to run through the module-level `nlp` pipeline.

 Returns:
  A 1-D numpy array holding the sum of `token.vector` over all tokens.
  If the text yields no tokens (e.g. an empty string), a zero vector of
  length `nlp.vocab.vectors_length` is returned instead.
 """
 tokens = nlp(sentence)
 if len(tokens) == 0:
  # np.sum([]) would collapse to the scalar 0.0 and break the
  # (n_documents, n_dims) shape of the stacked feature matrix.
  return np.zeros(nlp.vocab.vectors_length, dtype=np.float32)
 return np.sum([token.vector for token in tokens], axis=0)

## Generate Word2Vec embeddings for training data

In [None]:
# Replace each training text with its summed token-vector embedding.
# NOTE(review): this reads the raw `x_train` texts, not the cleaned `X_train`
# produced by clean_text -- confirm which was intended.
# NOTE(review): the cell rebinds `x_train` from texts to vectors, so running
# it a second time (without re-running the split cell) would pass vectors
# rather than strings to get_word_vectors.
x_train = [get_word_vectors(doc) for doc in x_train]
print(np.shape(x_train))


(7769, 300)


## Generate Word2Vec embeddings for testing data

In [None]:
# Replace each test text with its summed token-vector embedding.
# NOTE(review): this reads the raw `x_test` texts, not the cleaned `X_test`
# produced by clean_text -- confirm which was intended.
# NOTE(review): the cell rebinds `x_test` from texts to vectors, so running
# it a second time (without re-running the split cell) would pass vectors
# rather than strings to get_word_vectors.
x_test = [get_word_vectors(doc) for doc in x_test]
print(np.shape(x_test))

(3019, 300)


# Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

## Fit and Predict Model

In [None]:
# Sanity check: number of training feature rows (compare with len(y_train)).
len(x_train)

7769

In [None]:
# Sanity check: number of training label rows (compare with len(x_train)).
len(y_train)

7769

In [None]:
# Wrap LogisticRegression (newton-cg solver) in OneVsRestClassifier and fit
# it on the embedded training texts and the binarized category matrix.
lr = OneVsRestClassifier(LogisticRegression(solver ='newton-cg'))
lr.fit(x_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='newton-cg', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [None]:
# Predict label indicators for the embedded test documents.
y_pred = lr.predict(x_test)

## Classification report on testing data

In [None]:
# Per-category precision / recall / F1 for the logistic-regression model.
# The report is computed from y_test and predictions on x_test, i.e. the TEST
# split -- the banner previously said "Train".
print("Word2vec Logistic Regression Result on Test")
print(classification_report(
    y_true=y_test,
    y_pred=y_pred,
    # Show Reuters category names instead of bare column indices.
    target_names=list(mlb.classes_),
    # Categories with no predicted samples score 0 without emitting the
    # undefined-metric warning (requires scikit-learn >= 0.22).
    zero_division=0,
))

Word2vec Result word on Train
              precision    recall  f1-score   support

           0       0.94      0.91      0.92       719
           1       0.70      0.30      0.42        23
           2       0.65      0.79      0.71        14
           3       0.63      0.57      0.60        30
           4       0.58      0.61      0.59        18
           5       0.00      0.00      0.00         1
           6       0.94      0.89      0.91        18
           7       1.00      0.50      0.67         2
           8       0.00      0.00      0.00         3
           9       0.79      0.96      0.87        28
          10       0.86      0.67      0.75        18
          11       0.00      0.00      0.00         1
          12       0.64      0.79      0.70        56
          13       0.56      0.45      0.50        20
          14       0.00      0.00      0.00         2
          15       0.66      0.68      0.67        28
          16       0.00      0.00      0.00        

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Naive Bayes Classifier

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

## Fit and Predict Model

In [None]:
# Wrap GaussianNB in OneVsRestClassifier and fit it on the same embedded
# training texts and binarized category matrix used for logistic regression.
gnb = OneVsRestClassifier(GaussianNB())
gnb.fit(x_train, y_train)

OneVsRestClassifier(estimator=GaussianNB(priors=None, var_smoothing=1e-09),
                    n_jobs=None)

In [None]:
# Predict label indicators for the embedded test documents.
# Note: this rebinds `y_pred`, replacing the logistic-regression predictions
# stored under the same name earlier in the notebook.
y_pred = gnb.predict(x_test)

## Classification report on testing data

In [None]:
# Per-category precision / recall / F1 for the Gaussian Naive Bayes model.
# The report is computed from y_test and predictions on x_test, i.e. the TEST
# split -- the banner previously said "Train".
print("Naive Bayes Classifier Result on Test")
print(classification_report(
    y_true=y_test,
    y_pred=y_pred,
    # Show Reuters category names instead of bare column indices.
    target_names=list(mlb.classes_),
    # Categories with no predicted samples score 0 without emitting the
    # undefined-metric warning (requires scikit-learn >= 0.22).
    zero_division=0,
))

Naive Bayes Classifier Result word on Train
              precision    recall  f1-score   support

           0       0.32      0.91      0.47       719
           1       0.02      0.61      0.05        23
           2       0.01      0.79      0.02        14
           3       0.04      0.47      0.08        30
           4       0.06      0.72      0.10        18
           5       0.00      0.00      0.00         1
           6       0.02      0.28      0.03        18
           7       0.00      0.00      0.00         2
           8       0.02      0.67      0.05         3
           9       0.03      0.32      0.05        28
          10       0.04      0.67      0.07        18
          11       0.00      0.00      0.00         1
          12       0.05      0.36      0.09        56
          13       0.04      0.55      0.08        20
          14       0.00      0.00      0.00         2
          15       0.07      0.54      0.12        28
          16       1.00      1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
