# Email Classification using the 20 Newsgroups dataset

In [3]:
from sklearn import datasets

# Fetch the predefined train and test splits (train first, then test).
traindata, testdata = (
    datasets.fetch_20newsgroups(subset=split) for split in ('train', 'test')
)

In [49]:
# Pull the raw texts and their integer labels out of the training bunch.
xtrain, ytrain = traindata.data, traindata.target
# Print both lengths, one per line, to confirm there is one label per document.
print(len(xtrain), len(ytrain), sep="\n")

11314
11314


In [5]:
# Number of documents in the test split.
print(len(testdata.data))

7532


In [6]:
# Show the description text that ships with the loaded dataset.
print(traindata.DESCR)

.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`~sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1
    Features      

In [7]:
# Class names; an integer label is used as an index into this list further down.
target_names = traindata.target_names
print(target_names)

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [8]:
# Inspect the first training document.
print(xtrain[0])

From: lerxst@wam.umd.edu (where's my thing)
Subject: WHAT car is this!?
Nntp-Posting-Host: rac3.wam.umd.edu
Organization: University of Maryland, College Park
Lines: 15

 I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.

Thanks,
- IL
   ---- brought to you by your neighborhood Lerxst ----







In [9]:
# Integer label of the first training document.
print(ytrain[0])

7


In [10]:
# Map the integer label of the first document back to its newsgroup name.
target_names[ytrain[0]]

'rec.autos'

In [11]:
import re
import spacy

In [47]:
# Load the small English pipeline by its package name: the bare 'en' shortcut
# link is not accepted by spaCy v3 and later, while the package name works in
# both v2 and v3 once the model is installed.
nlp = spacy.load('en_core_web_sm')
# Sample sentence used below to eyeball the lemmatizer output.
doc = nlp("I went to bring tables for my children and his boxes with knives. Mr. john he worked very hard in from the of here.")

In [48]:
# Print one lemma per line, skipping tokens whose lemma is the "-PRON-" marker.
for token in doc:
    if token.lemma_ == "-PRON-":
        continue
    print(token.lemma_)

go
to
bring
table
for
child
and
box
with
knife
.
Mr.
john
work
very
hard
in
from
the
of
here
.


## Cleaning

In [51]:
def regex_cleaning(xdata):
    """Strip e-mail addresses, digit-led runs and underscores from each text.

    Parameters
    ----------
    xdata : sequence of str
        Documents to clean. The input sequence is left unmodified.

    Returns
    -------
    list of str
        A new list holding one cleaned string per input document, in order.
    """
    # Raw strings keep `\d` a regex escape rather than an invalid Python
    # string escape, which newer interpreters warn about.
    email_pattern = re.compile(r"[a-zA-Z0-9._]+@[a-zA-Z0-9._]+")
    # A digit followed by at least one more word character. Note that this
    # also removes digit-led runs inside a longer token ("abc123" -> "abc")
    # and leaves a lone digit in place.
    digit_led_pattern = re.compile(r"\d[0-9a-zA-Z_]+")

    cleaned = []
    for text in xdata:
        text = email_pattern.sub("", text)      # removing email ids
        text = digit_led_pattern.sub("", text)  # dropping digit-led runs
        text = text.replace("_", "")            # removing leftover underscores
        cleaned.append(text)
    return cleaned
        

def lemma_cleaning(xdata):
    """Replace each text by its space-joined lemmas, using the global ``nlp``.

    Tokens whose lemma equals ``'-PRON-'`` or whose lemma is three characters
    or shorter are dropped.

    Parameters
    ----------
    xdata : iterable of str
        Documents to lemmatize. The input is left unmodified.

    Returns
    -------
    list of str
        A new list holding one lemmatized string per input document, in order.
    """
    # nlp.pipe processes the texts in batches and yields the parsed documents
    # in input order, avoiding one full pipeline call per document.
    return [
        " ".join(
            token.lemma_
            for token in parsed
            if token.lemma_ != '-PRON-' and len(token.lemma_) > 3
        )
        for parsed in nlp.pipe(xdata)
    ]
        
def transform(xdata):
    """Run the text-cleaning pipeline: regex cleaning first, then lemma cleaning.

    Returns the output of `lemma_cleaning` applied to the regex-cleaned texts.
    """
    return lemma_cleaning(regex_cleaning(xdata))

In [52]:
# Clean a copy of the raw training texts: `traindata.data` keeps the original
# posts, and re-running this cell always starts from the raw data instead of
# cleaning already-cleaned text again.
xtrain = transform(list(traindata.data))

## Vectorization

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features: lowercase the text, drop English stop words, and bound the
# vocabulary by document frequency with min_df / max_df.
vec = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    min_df=5,
    max_df=0.95,
)
vec.fit(xtrain)

TfidfVectorizer(max_df=0.95, min_df=5, stop_words='english')

In [54]:
# `vocabulary_` maps each learned term to its column index and is available in
# every scikit-learn release, unlike `get_feature_names()`, which newer
# releases removed. Sorting the terms reproduces the feature order.
feature_names = sorted(vec.vocabulary_)
print(len(feature_names))
# Show a short sample instead of dumping the whole vocabulary into the output.
print(feature_names[:20])

17973




In [56]:
# get the vectorized data
# Keep the TF-IDF matrix sparse: a dense float array of this shape
# (11314 x 17973) needs well over a gigabyte, and the classifier is already
# fed a sparse matrix at prediction time.
x2 = vec.transform(xtrain)
print(x2.shape)

(11314, 17973)


## Apply ML - Naive Bayes

In [57]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier, constructed with no explicit arguments.
model = MultinomialNB()

In [59]:
# Train the model on the vectorized training texts and their labels.
model.fit(x2,ytrain)

MultinomialNB()

## Performance Analysis

In [60]:
# Clean a copy of the raw test texts with the same pipeline used for training;
# copying keeps `testdata.data` unmodified so the cell can be re-run safely.
xtest = transform(list(testdata.data))
ytest = testdata.target

In [63]:
# Vectorize the cleaned test texts with the vectorizer fitted on the training
# texts (transform only, no refit).
# NOTE(review): this rebinds `xtest` from the cleaned texts to the feature
# matrix, so re-running this cell alone feeds it the wrong input — consider a
# separate name for the vectorized data.
xtest = vec.transform(xtest)

In [64]:
from sklearn import metrics

# Predict a class for every test document, then report the accuracy score
# against the true test labels.
ypred = model.predict(xtest)
accuracy = metrics.accuracy_score(ytest, ypred)
print("Accuracy ", accuracy)

Accuracy  0.8177110993096123


In [66]:
import pandas as pd

# Confusion matrix of true vs. predicted test labels, with newsgroup names on
# both axes.
conf_matrix = metrics.confusion_matrix(ytest, ypred)
pd.DataFrame(conf_matrix, index=target_names, columns=target_names)

Unnamed: 0,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,rec.sport.baseball,rec.sport.hockey,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc
alt.atheism,187,0,1,1,0,1,0,0,1,1,1,4,1,7,5,81,8,15,2,3
comp.graphics,0,290,13,14,12,22,3,0,0,3,0,15,10,0,4,1,1,1,0,0
comp.os.ms-windows.misc,1,24,297,38,7,10,1,0,0,2,0,6,1,0,2,2,3,0,0,0
comp.sys.ibm.pc.hardware,0,7,27,300,23,3,10,2,0,1,1,3,13,0,2,0,0,0,0,0
comp.sys.mac.hardware,0,5,7,18,328,0,7,2,0,1,0,2,8,1,4,0,2,0,0,0
comp.windows.x,0,36,23,10,2,317,1,0,0,0,0,2,0,0,4,0,0,0,0,0
misc.forsale,0,2,4,31,16,1,307,13,4,0,1,0,6,1,1,2,1,0,0,0
rec.autos,0,2,1,2,0,1,5,362,6,0,2,1,7,0,3,0,4,0,0,0
rec.motorcycles,0,0,1,1,1,1,2,9,375,3,0,0,4,0,0,0,1,0,0,0
rec.sport.baseball,0,0,0,0,1,0,3,0,1,378,12,0,0,0,0,1,1,0,0,0
