# Importing all the necessary libraries

In [5]:
from nltk.corpus import reuters 
from sklearn.preprocessing import MultiLabelBinarizer
import pandas as pd
import numpy as np

'''
  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html
  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
  https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
'''

'\n  https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html\n  https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html\n  https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html\n'

In [6]:
import nltk
nltk.download('reuters')

[nltk_data] Downloading package reuters to /root/nltk_data...


True

In [7]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
import nltk
nltk.download('punkt')
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Loading Data

In [9]:
mlb = MultiLabelBinarizer()

In [10]:
def collection_stats():
  documents = reuters.fileids()
  print(str(len(documents)) + " documents");

  train_docs = list(filter(lambda doc: doc.startswith("train"), documents));
  print(str(len(train_docs)) + " total train documents");
 
  test_docs = list(filter(lambda doc: doc.startswith("test"), documents));
  print(str(len(test_docs)) + " total test documents")

  categories = reuters.categories()

  print(str(len(categories)) + " categories");

In [11]:
collection_stats()

10788 documents
7769 total train documents
3019 total test documents
90 categories


# Train Test Split of Data

In [12]:
def train_test_split():
  documents = reuters.fileids()
  train_docs = [document for document in documents if document.startswith("train")]
  test_docs = [document for document in documents if document.startswith("test")]
  x_train = [reuters.raw(doc_id) for doc_id in train_docs]
  y_train = [reuters.raw(doc_id) for doc_id in test_docs]
  x_test = mlb.fit_transform([reuters.categories(doc_id) for doc_id in train_docs])
  y_test = mlb.transform([reuters.categories(doc_id) for doc_id in test_docs])
  return x_train, y_train, x_test, y_test

In [13]:
x_train, x_test, y_train, y_test = train_test_split()

# Data Preprocessing

In [14]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [15]:
stop_words = set(stopwords.words('english'))

In [62]:
def clean_text(X_train):
  """ function to clean data , input should a list of data like list of training data or test data and it will return cleaned data"""
  ret =[]
  for x_pre in X_train:
    x_pre = re.sub(r'https\S+','', x_pre)

    x_pre = re.sub('[^a-zA-Z]','', x_pre)
    x_pre = str(x_pre).lower()
    x_pre = word_tokenize(x_pre)
    x_pre = [item for item in x_pre if item not in stop_words]
    x_pre = ' '.join(x_pre)
    ret.append(x_pre)
  return ret 
 
 # train_test_split['sentence_clean'] = train_test_split['text'].str.replace('[{}]'.format(string.punctuation), '')
  #train_test_split['sentence_clean'] = train_test_split['sentence_clean'].str.lower()
 # train_data_senti=train_test_split
  
#printing text with sentence_clean
#print(train_test_split)
  

In [63]:
X_train=clean_text(x_train)
X_test =clean_text(x_test)
print(X_test)



In [74]:
print(X_test[0])
print(X_train[0])

bahiacocoareviewshowerscontinuedthroughouttheweekinthebahiacocoazonealleviatingthedroughtsinceearlyjanuaryandimprovingprospectsforthecomingtemporaoalthoughnormalhumiditylevelshavenotbeenrestoredcomissariasmithsaidinitsweeklyreviewthedryperiodmeansthetemporaowillbelatethisyeararrivalsfortheweekendedfebruarywerebagsofkilosmakingacumulativetotalfortheseasonofmlnagainstatthesamestagelastyearagainitseemsthatcocoadeliveredearlieronconsignmentwasincludedinthearrivalsfigurescomissariasmithsaidthereisstillsomedoubtastohowmucholdcropcocoaisstillavailableasharvestinghaspracticallycometoanendwithtotalbahiacropestimatesaroundmlnbagsandsalesstandingatalmostmlnthereareafewhundredthousandbagsstillinthehandsoffarmersmiddlemenexportersandprocessorstherearedoubtsastohowmuchofthiscocoawouldbefitforexportasshippersarenowexperiencingdificultiesinobtainingbahiasuperiorcertificatesinviewofthelowerqualityoverrecentweeksfarmershavesoldagoodpartoftheircocoaheldonconsignmentcomissariasmithsaidspotbeanpricesroseto

# Building TF-IDF representation of text

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [65]:
# generate TFIDF representation of the text using the above imported function

vectorizer = TfidfVectorizer()

x_tf_train = vectorizer.fit_transform(X_train)

x_tf_test = vectorizer.transform(X_test)

In [73]:
print("The shape of X_train term document matrix is:", x_tf_train.shape)
print("The shape of X_test term document matrix is:", x_tf_test.shape)

The shape of X_train term document matrix is: (7769, 7589)
The shape of X_test term document matrix is: (3019, 7589)


# Logistic Regression

In [66]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

## Fit and Predict Model

In [67]:
lr = LogisticRegression()
ovr = OneVsRestClassifier(lr)

In [68]:
ovr.fit(x_tf_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='auto',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=None,
                                                 solver='lbfgs', tol=0.0001,
                                                 verbose=0, warm_start=False),
                    n_jobs=None)

In [70]:
y_pred= ovr.predict(x_tf_train)
print(y_pred)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [72]:
print(y_pred[0])
print(y_test[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]


## Classification report on training data

In [None]:
print("Logistic Regression Result word on Train")
print(classification_report( ovr.predict(x_tf_train) , y_train))

## Classification Report on testing data

In [None]:
print("Logistic Regression Result word on Test")
print(classification_report( ovr.predict(x_tf_test) , y_test))

# Naive Bayes Classifier

In [76]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.multiclass import OneVsRestClassifier

## Fit and Predict Model

In [77]:
nbClassifier =OneVsRestClassifier(MultinomialNB())
nbClassifier.fit(x_tf_train, y_train)


OneVsRestClassifier(estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                            fit_prior=True),
                    n_jobs=None)

In [78]:
y_pred = nbClassifier.predict(x_tf_test)

In [79]:
print(y_pred[0])
print(y_test[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]


## Classification Report on training data

In [80]:
print("Naive Bayes Classifier Result word on Train")
print(classification_report(nbClassifier.predict(x_tf_train) , y_train))

Naive Bayes Classifier Result word on Train
              precision    recall  f1-score   support

           0       0.01      1.00      0.01        10
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Classification report on testing data

In [81]:
print("Naive Bayes Classifier Result word on Test")
print(classification_report(nbClassifier.predict(x_tf_test) , y_test))

Naive Bayes Classifier Result word on Test
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         0
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         0
           5       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         0
           7       0.00      0.00      0.00         0
           8       0.00      0.00      0.00         0
           9       0.00      0.00      0.00         0
          10       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         0
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          16       0.00      0.00     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
