In [1]:
import sklearn
from sklearn.datasets import fetch_20newsgroups                        # for data set
from sklearn.model_selection import train_test_split                   # for train test splitting
from sklearn.feature_extraction.text import TfidfVectorizer            # for Vectorization
import pandas as pd                                                    # for data analysis
import numpy as np                                                     # for numerical calculation
import string                                                          # to convert string
from nltk.corpus import stopwords                                      # for stopwords
from sklearn.metrics import classification_report,confusion_matrix     # for Classification report and confusion matrix
from sklearn.naive_bayes import MultinomialNB                          # model

import nltk
nltk.download('stopwords')

import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Task 1: Data Selection

Task 1.1: Load only two categories ('rec.motorcycles' and ' sci.electronics')

In [2]:
from sklearn.datasets import fetch_20newsgroups
task1 = fetch_20newsgroups(subset='all', categories=['rec.motorcycles','sci.electronics'])
task1

{'data': ['Subject: Re: Power, signal surges in home...\nFrom: emd@ham.almanac.bc.ca\nDistribution: world\nOrganization: Robert Smits\nLines: 48\n\nclewis@ferret.ocunix.on.ca (Chris Lewis) writes:\n\n> In article <1r1jmoINN8mb@rave.larc.nasa.gov> kludge@grissom.larc.nasa.gov (Sc\n> >In article <DRAND.93Apr20150701@spinner.osf.org> drand@spinner.osf.org (Doug\n> >>In article <randall.735251839@woof> randall@informix.com (Randall Rhea) wri\n> \n> >>   Hams can legally run up to 1500 watts.  It is very unlikely, however,\n> >>   that a ham would be running that kind of power from a car.  Ham rigs\n> \n> >>Not possible either.  You\'d need about a 300 amp alternator for\n> >>just the amplifier.  I can just see it.  You need to slow\n> >>down on a downgrade,  so you hit the push to talk button.\n> \n> >Now, that indeed is possible.  A good friend of mine is running about 1 KW\n> >PeP from his car.  Yes, he does have a second alternator.  Yes, he calls\n> >the rig an "electronic brake" since

In [3]:
x=pd.DataFrame({'X':task1.data})                  # independent variable
y=pd.DataFrame({'Y':task1.target})                # target variable

In [4]:
def text_process(text):
    nopunch=[char for char in text if char not in string.punctuation]
    nopunc=''. join(nopunch)
    return[word for word in nopunc.split() if word.lower() not in stopwords .words('english')]

In [5]:
def list_to_string(s):                          # converting list to string
    str1 = " "
    return (str1.join(s))

In [6]:
x=x['X'].apply(text_process)                   # applying text process

In [7]:
x=x.apply(list_to_string)                      # converting list to string

In [8]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=42) # spliting train and test data

In [9]:
x_train

1067    svobodartsgmotcom David Svoboda Subject Ok lit...
905     derryNeXTworkRoseHulmanEdu John Derry Subject ...
1446    jimbescbnewsjcbattcom jamesbessette Subject Go...
1621    coynething1ccutexasedu Subject Radar detector ...
1571    blevehoggle2uucp Bennett Lee Leve Subject Chok...
                              ...                        
1130    gaijinaleJapanSunCOM John Little Nihon Sun Rep...
1294    mcarraustinonuedu Michael Carr x1723 Subject D...
860     jeqlachmancom Jonathan E Quist Subject Dogs vs...
1459    behannasylnjneccom Chris BeHanna Subject GOT B...
1126    jimctaucetiiscbrcom Jim Cathey Subject video q...
Name: X, Length: 1386, dtype: object

In [10]:
x_test

1428    rgc3679bcsteccaboeingcom Robert G Carpenter Su...
56      timdfeniandellcom Tim Deagan Subject Homebuilt...
351     dpcec1wustledu David Prutchi Subject OEM weigh...
1681    lcgoannacsrmitozau leon Subject 2 silver solde...
1730    bandycatnipberkeleycaus Andrew Scott Beals KC6...
                              ...                        
1958    tasfermiclasVirginiaEDU Thomas Spraggins Subje...
1271    corwinigcapcorg Corwin Nichols Subject Fujitsu...
1827    mljaf3mlbsemiharriscom Marvin Jaster Subject S...
286     srlterminusericssonse Steve Langstaff Subject ...
1588    rmugeleoraclecom Robert Mugele Subject act fro...
Name: X, Length: 594, dtype: object

## Task 2:
Vectorization and Building the Classifier Model

2.1) Convert the text data into vector form using the TF-IDF vectorization method

In [11]:
def vectorization(xtrain,xtest,ytrain,ytest):                          # user def function
    # Convert the text data into vector form using the TF-IDF vectorization method
    global myvector                                                    # globe function to use later
    myvector= TfidfVectorizer()                                        # vectorization
    myobject = myvector.fit_transform(xtrain)                          # fitting train data
    myobject1 = myvector.transform(xtest)                              # transforming data
    print(myobject.toarray())                                          # vector in array

    # Use multinominalNB()
    global clf                                                         # globe function to use predict later
    clf = MultinomialNB()                                              # model
    clf=clf.fit(myobject,ytrain)                                       # fitting model
    y_pred = clf.predict(myobject1)                                    # preding test
    print(y_pred)                                                      # predicting predicted value

    # confusion matrix
    print("confusion matrix :\n",confusion_matrix(ytest[0:300],y_pred[0:300]))

    # classification report
    print("Classification report :\n",classification_report(ytest[0:300],y_pred[0:300]))

    # Return first 30 sample of predicted and actual output
    data = {"Predicted":ytest[:30].values.flatten(), "Actual":y_pred[:30].tolist()}
    df_out=pd.DataFrame(data)

    # returning data frame
    return df_out

In [12]:
vectorization(x_train,x_test,y_train,y_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[1 1 1 1 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0
 1 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 1 0 0 1 1 1 0 1
 0 0 0 0 0 1 1 1 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 1 1 1 0 0 0 0 1 1 0 0 1
 0 0 1 0 0 1 0 1 0 1 0 1 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 1
 1 0 1 1 1 0 1 1 1 0 0 1 1 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 1 1 1 1 1 1 0 1 1
 0 0 1 1 1 0 1 0 1 1 0 1 1 1 0 1 1 0 1 1 1 0 1 0 1 1 1 1 0 0 1 1 1 0 0 0 0
 0 1 0 1 1 0 1 0 1 1 0 1 1 1 1 0 1 1 1 0 1 1 1 1 0 1 0 1 0 1 0 0 0 0 1 0 0
 1 1 0 1 0 1 1 1 1 1 0 1 1 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 0 0 0 1 1 1 0 0 0
 0 0 0 1 0 1 1 1 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1
 1 0 0 1 1 0 0 0 1 1 0 0 1 1 1 1 0 0 0 0 1 0 1 0 1 1 0 1 1 1 0 0 1 1 0 0 1
 1 1 0 0 0 0 1 1 1 0 0 1 1 0 0 0 1 1 1 1 0 1 1 0 1 0 0 1 1 0 0 1 1 1 1 1 0
 1 0 0 0 1 1 0 0 1 

Unnamed: 0,Predicted,Actual
0,1,1
1,1,1
2,1,1
3,1,1
4,0,0
5,0,0
6,1,1
7,0,0
8,0,0
9,0,0


## Task 3:
Classification on Additional Categories

In [22]:
task2 = fetch_20newsgroups(subset='all',categories=['rec.sport.baseball','comp.graphics','rec.motorcycles','sci.electronics']) # importing different dataset
task2

{'data': ["From: jussi@tor.abo.fi (Jussi Laaksonen DC)\nSubject: Lasergraphics Language ?\nOrganization: ]bo Akademi University, Finland\nDistribution: comp.graphics\nLines: 25\n\nHi!\n\nWe have an old Montage FR-1 35mm film recorder. When connected to a PC with\nits processor card it can directly take HPGL, Targa and Lasergraphics Language\nfiles. 24 bit Targa is quite OK for raster images, but conversion from \nwhatever one happens to have can be quite slow. This Lasergraphics Language\nseems to be (got the source file for one test image) a vector-based language\nthat can handle one million colors. It does some polygons too, and perhaps\nsomething else ?\n\nThe question is, where can I find some information about this language ?\nA FTP site, a book, a company address,.... ?\n\n(OK, it would be nice to have a Windows driver for it, but I'm not THAT\noptimistic...)\n\nThanks in advance for any help!\n\n\tjussi\n\n\n--\n\tJussi Laaksonen\n        Computing Centre / ]bo Akademi Universit

In [23]:
x1=pd.DataFrame({'X':task2.data})                           # input data
y1=pd.DataFrame({'Y':task2.target})                         # target variable

In [24]:
x1

Unnamed: 0,X
0,From: jussi@tor.abo.fi (Jussi Laaksonen DC)\nS...
1,From: rws@cs.arizona.edu (Ronald W. Schmidt)\n...
2,From: sburton@dres.dnd.ca (Stan Burton)\nSubje...
3,From: cjackson@adobe.com (Curtis Jackson)\nSub...
4,From: warren@eggo.csee.usf.edu (Warren Gaiewsk...
...,...
3942,From: paula@koufax.cv.hp.com (Paul Andresen)\n...
3943,From: kennu@mits.mdata.fi (Kenneth Falck)\nSub...
3944,From: jbulf@balsa.Berkeley.EDU (Jeff Bulf)\nSu...
3945,From: mike@netnews.louisville.edu (Mike Harpe)...


In [25]:
y1

Unnamed: 0,Y
0,0
1,0
2,3
3,1
4,3
...,...
3942,2
3943,0
3944,0
3945,3


In [26]:
x1=x1['X'].apply(text_process)                      # applying preprocessing test

In [27]:
x1=x1.apply(list_to_string)                         # converting list to string

In [28]:
x1

0       jussitorabofi Jussi Laaksonen DC Subject Laser...
1       rwscsarizonaedu Ronald W Schmidt Subject outli...
2       sburtondresdndca Stan Burton Subject Long dist...
3       cjacksonadobecom Curtis Jackson Subject Counte...
4       warreneggocseeusfedu Warren Gaiewski Subject D...
                              ...                        
3942    paulakoufaxcvhpcom Paul Andresen Subject Brave...
3943    kennumitsmdatafi Kenneth Falck Subject CView a...
3944    jbulfbalsaBerkeleyEDU Jeff Bulf Subject Fracta...
3945    mikenetnewslouisvilleedu Mike Harpe Subject dr...
3946    barringcswashingtonedu David Barrington Subjec...
Name: X, Length: 3947, dtype: object

In [29]:
x1_train,x1_test,y1_train,y1_test = train_test_split(x1,y1,test_size=.20)            # splitting

In [30]:
vectorization(x1_train,x1_test,y1_train,y1_test)                                    # calling function

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0 1 3 0 1 1 3 0 0 3 0 1 0 3 3 0 0 2 2 0 2 2 1 1 0 0 0 2 3 2 1 2 2 3 0 1 0
 0 2 1 2 2 0 1 2 1 0 1 3 3 1 1 0 0 2 2 2 2 3 3 1 0 3 3 3 3 3 1 1 3 1 2 0 2
 0 1 2 3 1 0 1 3 2 0 2 2 2 0 1 1 1 0 2 2 3 1 3 2 1 0 1 1 3 2 3 1 0 3 0 1 1
 0 0 0 3 1 2 1 1 1 1 3 1 3 0 0 1 0 0 3 2 0 1 2 2 0 2 2 2 2 1 2 2 1 1 0 2 1
 3 1 0 0 2 2 2 3 3 3 3 2 0 1 1 2 1 3 0 3 3 3 3 0 3 2 1 1 2 0 2 1 2 0 2 0 2
 0 0 3 0 1 0 3 0 2 3 2 1 2 3 1 0 0 3 0 0 2 0 3 0 3 0 3 2 3 3 2 3 1 0 1 1 2
 3 3 1 1 0 0 1 2 3 1 0 0 3 1 0 2 0 0 2 2 1 2 0 0 1 1 2 3 3 0 3 3 2 1 0 2 3
 2 2 0 1 3 1 3 3 3 3 1 3 3 1 3 0 3 3 3 1 0 0 2 1 2 1 2 1 0 0 1 0 2 3 0 0 1
 0 0 2 0 0 0 3 2 0 1 1 1 3 1 2 0 1 1 2 1 2 2 3 1 0 0 0 1 3 2 1 0 2 1 3 1 3
 3 2 1 2 0 3 0 2 0 3 3 2 0 0 2 3 2 0 0 3 3 2 3 3 2 2 1 3 2 1 0 1 3 0 2 1 0
 3 2 1 3 0 3 0 2 1 2 2 2 0 3 0 2 1 3 2 3 1 3 3 3 2 1 1 3 2 2 3 1 2 0 2 1 3
 3 1 1 1 0 1 2 2 2 

Unnamed: 0,Predicted,Actual
0,0,0
1,1,1
2,3,3
3,0,0
4,1,1
5,1,1
6,3,3
7,0,0
8,0,0
9,3,3


## Task 4 :
Predction on unseen data

In [32]:
def new_data(txt):                              # predicting new sentences
    text = myvector.transform([txt])            # fit the sentences
    return clf.predict(text)                    # returning predicted value

In [33]:
new_data("Where are the different parts of a computer ?")

array([3])

In [34]:
new_data("Playing baseball is good for one's health")

array([2])

In [35]:
new_data("In Which games are you interested")

array([2])

In [36]:
new_data("It is the unknown around the corner that turns my wheels.")

array([1])

In [37]:
new_data("I am interested in increasing the picture resolution of my computer")

array([0])

In [38]:
new_data("The team might not win if there is rain")

array([2])

Conclusion: Here we can see the four categories of data i.e 0,1,2 and 3, based on that we have evaluated the new text data.
0: comp.graphics
1: rec.motorcycles
2: rec.sport.baseball
3: sci.electronics