In [1]:
import pandas as pd
import re
import spacy
from sklearn import datasets

In [2]:
# Load both predefined splits of the 20-newsgroups corpus (train first, then test).
traindata, testdata = (
    datasets.fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

In [3]:
# Pull out the three pieces used below: the post texts, their integer
# labels, and the list that maps a label index to its newsgroup name.
xtrain, ytrain, cnames = traindata.data, traindata.target, traindata.target_names

In [4]:
# Sanity check: documents and labels must have the same length; then list the class names.
print(len(xtrain), len(ytrain), cnames, sep="\n")

11314
11314
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [6]:
# Show the description text attached to the fetched dataset object.
print(traindata.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      




In [8]:
# Inspect one training post (index 50) to see what the text looks like.
print(xtrain[50])

From: johnc@crsa.bu.edu (John Collins)
Subject: Problem with MIT-SHM
Organization: Boston University
Lines: 27

I am trying to write an image display program that uses
the MIT shared memory extension.  The shared memory segment
gets allocated and attached to the process with no problem.
But the program crashes at the first call to XShmPutImage,
with the following message:

X Error of failed request:  BadShmSeg (invalid shared segment parameter)
  Major opcode of failed request:  133 (MIT-SHM)
  Minor opcode of failed request:  3 (X_ShmPutImage)
  Segment id in failed request 0x0
  Serial number of failed request:  741
  Current serial number in output stream:  742

Like I said, I did error checking on all the calls to shmget
and shmat that are necessary to create the shared memory
segment, as well as checking XShmAttach.  There are no
problems.

If anybody has had the same problem or has used MIT-SHM without
having the same problem, please let me know.

By the way, I am running OpenWin

In [9]:
# Integer class label stored for training post 50.
print(ytrain[50])

5


In [10]:
# Translate that integer label into its newsgroup name via the class-name list.
cnames[ytrain[50]]

'comp.windows.x'

## Cleaning

In [20]:
# Load the small English spaCy model once; the cleaning functions below reuse `nlp`.
nlp = spacy.load("en_core_web_sm")
# Lemmatise a sample sentence and show one lemma per line.
doc = nlp("I went to school and bought pencils with my friend and his friends with boxes happiness.")
print("\n".join(token.lemma_ for token in doc))


-PRON-
go
to
school
and
buy
pencil
with
-PRON-
friend
and
-PRON-
friend
with
box
happiness
.


In [24]:
def regex_cleaner(xdata):
    """Strip e-mail addresses, underscores and digit-led runs from each document.

    Parameters
    ----------
    xdata : sequence of str
        Documents to clean. The sequence itself is left untouched, so the
        caller's raw data stays available for re-runs.

    Returns
    -------
    list of str
        A new list holding one cleaned string per input document, in the
        same order.
    """
    cleaned = []
    for doc in xdata:
        # Remove e-mail ids.
        doc = re.sub(r"[a-zA-Z0-9._]+@[a-zA-Z0-9._]+", "", doc)
        # Remove every underscore.
        doc = re.sub("_", "", doc)
        # Remove any run that begins with a digit and continues with at least
        # one more alphanumeric character (this also matches inside a longer
        # word; a lone digit is kept). Raw string: "\d" in a plain literal is
        # an invalid escape sequence.
        doc = re.sub(r"\d[0-9A-Za-z]+", "", doc)
        cleaned.append(doc)
    return cleaned

def lemma_cleaner(xdata, pipeline=None):
    """Replace each document by the space-joined lemmas of its tokens.

    Lemmas equal to '-PRON-' and lemmas of three characters or fewer are
    dropped.

    Parameters
    ----------
    xdata : sequence of str
        Documents to lemmatise. The sequence itself is not modified.
    pipeline : optional
        Object with a ``pipe(texts)`` method that yields, for each text, an
        iterable of tokens exposing ``lemma_``. Defaults to the
        notebook-level ``nlp`` object.

    Returns
    -------
    list of str
        A new list with one lemmatised string per input document, in order.
    """
    if pipeline is None:
        pipeline = nlp
    # NOTE(review): '-PRON-' is the placeholder the loaded model printed for
    # pronouns in the demo cell; other spaCy versions may not emit it - confirm.
    return [
        " ".join(
            tok.lemma_
            for tok in parsed
            if tok.lemma_ != '-PRON-' and len(tok.lemma_) > 3
        )
        # pipe() processes the texts as a stream instead of one call per document.
        for parsed in pipeline.pipe(xdata)
    ]

def cleaner(xdata):
    """Run the full text clean-up: regex scrubbing first, then lemma filtering.

    Returns whatever ``lemma_cleaner`` returns for the regex-cleaned documents.
    """
    return lemma_cleaner(regex_cleaner(xdata))

In [25]:
# Always clean a fresh copy of the raw training posts: the cell can be
# re-run without re-cleaning already cleaned text, and `traindata.data`
# keeps the original documents.
xtrain = cleaner(list(traindata.data))

## Vectorization

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Lower-case the text, drop English stop words, and keep only terms that occur
# in at least 5 documents and in at most 95% of them.
vec = TfidfVectorizer(lowercase=True,stop_words='english',min_df=5,max_df=0.95)

vec.fit(xtrain)
# `vocabulary_` maps each kept term to its column index. It is used here
# instead of `get_feature_names()`, which newer scikit-learn releases removed.
print(len(vec.vocabulary_))
# Preview the first terms in column order rather than dumping the whole vocabulary.
print(sorted(vec.vocabulary_, key=vec.vocabulary_.get)[:50])

16842




In [27]:
# Turn the cleaned training documents into a TF-IDF matrix using the fitted vectorizer.
x2 = vec.transform(xtrain)
# Shape is (number of documents, number of vocabulary terms).
print(x2.shape)

(11314, 16842)


## Apply Machine Learning

In [28]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes classifier with its default settings.
model = MultinomialNB()
# Train the model on the TF-IDF training matrix and the integer class labels.
model.fit(x2,ytrain)

MultinomialNB()

## Performance Analysis

In [34]:
from sklearn import metrics
# Clean a copy of the raw test posts with the same pipeline used for training,
# so `testdata.data` keeps the original text and re-running this cell always
# starts from raw documents.
xtest = cleaner(list(testdata.data))
ytest = testdata.target

In [35]:
# Keep the vectorised test set under its own name: rebinding `xtest` to the
# sparse matrix would make a second run of this cell call `vec.transform`
# on a matrix instead of on text.
xtest_vec = vec.transform(xtest)
ypred = model.predict(xtest_vec)

In [37]:
# Fraction of test posts whose predicted label matches the true label.
print("acccuracy ",metrics.accuracy_score(ytest,ypred))

acccuracy  0.8096123207647371


In [39]:
# Confusion matrix labelled with the newsgroup names on both axes; with
# (ytest, ypred) passed in this order, rows index the true class and columns
# the predicted class (scikit-learn's confusion_matrix convention).
pd.DataFrame(metrics.confusion_matrix(ytest,ypred),columns=cnames,index=cnames)

Unnamed: 0,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
alt.atheism,190,0,1,0,0,1,1,0,0,1,1,5,1,7,5,78,8,16,1,3
comp.graphics,0,288,13,13,16,23,5,0,0,3,1,14,5,0,3,2,2,1,0,0
comp.os.ms-windows.misc,1,25,275,40,4,15,1,1,0,2,1,17,1,0,5,3,3,0,0,0
comp.sys.ibm.pc.hardware,0,6,28,297,25,2,7,1,0,0,1,2,20,0,3,0,0,0,0,0
comp.sys.mac.hardware,0,5,11,43,297,1,9,2,0,0,0,3,11,0,3,0,0,0,0,0
comp.windows.x,0,31,17,7,3,322,2,1,0,1,0,3,0,0,5,1,2,0,0,0
misc.forsale,0,2,2,23,18,0,315,13,3,1,2,0,7,1,1,1,1,0,0,0
rec.autos,0,1,1,1,0,1,10,354,9,1,5,1,7,1,2,0,2,0,0,0
rec.motorcycles,0,0,1,1,0,1,5,8,370,1,0,3,5,0,0,0,3,0,0,0
rec.sport.baseball,0,1,0,0,0,1,3,2,1,374,12,0,0,1,0,1,1,0,0,0
