In [1]:
# word Embeddings: 1. one-hot 2. Frequency-based 3. Prediction-based (using machine learning - semantic relationship)

# 2. Frequency based embeding:  
#   2.1.Count (it count the number of times which a single word occurred in a document) 
#   2.2.TF-IDF (how often a word occurs in the document and as well as the entire corpus) - it excludes the stop words!
#   2.3.Co-occurrence (similar words occur together and will have similar context) how often two words occurred in a context window
    

In [25]:
import sklearn
import pandas as pd
import numpy as np

from sklearn.datasets import fetch_20newsgroups

# all machine learning/deep learning models only can work with numeric data (not with text data) 
# so we need to convert text data to numeric data using some techniques .. so.. 
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split

from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score

In [3]:
newsgroups_data = fetch_20newsgroups()

In [4]:
newsgroups_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
print(newsgroups_data.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features       

In [6]:
newsgroups_data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [7]:
print(newsgroups_data.data[4])

From: jcm@head-cfa.harvard.edu (Jonathan McDowell)
Subject: Re: Shuttle Launch Question
Organization: Smithsonian Astrophysical Observatory, Cambridge, MA,  USA
Distribution: sci
Lines: 23

From article <C5owCB.n3p@world.std.com>, by tombaker@world.std.com (Tom A Baker):
>>In article <C5JLwx.4H9.1@cs.cmu.edu>, ETRAT@ttacs1.ttu.edu (Pack Rat) writes...
>>>errors. ...".  I am wondering what an "expected error" might
>>>be.  Sorry if this is a really dumb question, but
> 
> Parity errors in memory or previously known conditions that were waivered.
>    "Yes that is an error, but we already knew about it"
> I'd be curious as to what the real meaning of the quote is.
> 
> tom


My understanding is that the 'expected errors' are basically
that don't have the right values in yet because they aren't
set till after launch, and suchlike. Rather than fix the code
and possibly introduce new bugs, they just tell the crew

 - Jonathan





In [8]:
print(newsgroups_data.target[4]) #14 is  'sci.space',

14


In [9]:
np.unique(newsgroups_data.target)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19])

In [10]:
len(newsgroups_data.data)

11314

In [11]:
tfidf_vec = TfidfVectorizer(stop_words='english')

newsgroupdata_transfommed = tfidf_vec.fit_transform(newsgroups_data.data)

In [12]:
newsgroupdata_transfommed.shape

(11314, 129796)

In [13]:
# the 3rd column is the tfidf score for each word
print(newsgroupdata_transfommed[0])

  (0, 75215)	0.38538985156422345
  (0, 122887)	0.282869751755441
  (0, 118013)	0.23076236589534987
  (0, 50455)	0.05948476266845307
  (0, 114439)	0.06768238878777005
  (0, 111094)	0.020865105019220037
  (0, 37722)	0.41534653529092685
  (0, 87451)	0.03885306291479392
  (0, 94962)	0.03754552571724598
  (0, 63970)	0.03857974543636419
  (0, 98748)	0.17501596694257227
  (0, 90192)	0.021706106200820422
  (0, 118714)	0.04039328791909072
  (0, 79519)	0.11911704310036365
  (0, 40939)	0.08497090499024601
  (0, 91885)	0.10797335594250271
  (0, 75888)	0.020933445618156278
  (0, 4605)	0.06897342558445459
  (0, 124627)	0.0967471326603278
  (0, 51714)	0.1460907895102532
  (0, 104609)	0.09217540920934718
  (0, 45232)	0.07212208178051426
  (0, 48550)	0.10908149802523068
  (0, 109354)	0.1177321203161709
  (0, 76574)	0.09842306773884468
  :	:
  (0, 34943)	0.18203649549572576
  (0, 48552)	0.12638449885516734
  (0, 99619)	0.061719030928680974
  (0, 108033)	0.08197182211166718
  (0, 26070)	0.103851851395033

In [14]:
# train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = train_test_split(newsgroupdata_transfommed,
                                                    newsgroups_data.target,
                                                    shuffle=True,
                                                    test_size=0.2)

In [15]:
X_train.shape, y_train.shape

((9051, 129796), (9051,))

In [16]:
X_test.shape, y_test.shape

((2263, 129796), (2263,))

In [18]:
# optimizer = adam
mlp_clf = MLPClassifier(activation='relu', 
                        hidden_layer_sizes=(20,), 
                        solver='adam',
                        verbose=True,
                        max_iter=100)

In [19]:
mlp_clf.fit(X_train, y_train)

Iteration 1, loss = 2.96108882
Iteration 2, loss = 2.70007309
Iteration 3, loss = 2.32476911
Iteration 4, loss = 1.88131041
Iteration 5, loss = 1.44466976
Iteration 6, loss = 1.07621957
Iteration 7, loss = 0.79548710
Iteration 8, loss = 0.59358966
Iteration 9, loss = 0.45109992
Iteration 10, loss = 0.34967098
Iteration 11, loss = 0.27658837
Iteration 12, loss = 0.22248038
Iteration 13, loss = 0.18196666
Iteration 14, loss = 0.15085376
Iteration 15, loss = 0.12679056
Iteration 16, loss = 0.10784638
Iteration 17, loss = 0.09274448
Iteration 18, loss = 0.08052683
Iteration 19, loss = 0.07063890
Iteration 20, loss = 0.06249713
Iteration 21, loss = 0.05572064
Iteration 22, loss = 0.05007822
Iteration 23, loss = 0.04531165
Iteration 24, loss = 0.04119543
Iteration 25, loss = 0.03776484
Iteration 26, loss = 0.03473826
Iteration 27, loss = 0.03216525
Iteration 28, loss = 0.02984373
Iteration 29, loss = 0.02786675
Iteration 30, loss = 0.02609752
Iteration 31, loss = 0.02454092
Iteration 32, los

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=100, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=True, warm_start=False)

In [22]:
y_pred = mlp_clf.predict(X_test)

In [24]:
pred_results = pd.DataFrame({'y_test' : y_test,
                             'y_pred' : y_pred})

pred_results.sample(10)

Unnamed: 0,y_test,y_pred
2105,2,2
1520,19,19
1650,16,16
1534,17,17
645,13,13
1632,14,14
821,6,6
1637,11,11
2177,3,3
1879,8,8


In [27]:
accuracy_score(y_test, y_pred)

0.9231109147149801