In [16]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups


# Candidate subset of newsgroup labels.
# NOTE(review): this list is not passed to fetch_20newsgroups below, so it has
# no effect on what is loaded -- confirm whether `categories=categories` was
# intended or whether the list can be dropped.
categories = [
    'talk.politics.mideast',
    'rec.sport.hockey',
    'comp.graphics',
    'sci.med',
    'sci.space',
    'comp.sys.ibm.pc.hardware',
    'alt.atheism',
]

# Training split, shuffled with a fixed seed.
df = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    random_state=42,
)

# Report the number of label names, then the label names themselves.
print(len(df.target_names), df.target_names, sep='\n')

20
['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [2]:
# Number of documents in the training split held by `df`.
print(len(df.data))

11314


In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vocabulary on the training documents and encode them as a
# document-term count matrix in a single pass.
cv=CountVectorizer()
xtrain_count=cv.fit_transform(df.data)
# Shape of the count matrix.
print(xtrain_count.shape)

(11314, 130107)


In [5]:
from sklearn.feature_extraction.text import TfidfTransformer

# Fit the TF-IDF weighting on the raw term counts and apply it to them.
tfidf=TfidfTransformer()
xtrain_tf=tfidf.fit_transform(xtrain_count)
# Shape of the TF-IDF matrix.
print(xtrain_tf.shape)

(11314, 130107)


In [9]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial naive Bayes model on the TF-IDF training features.
clf = MultinomialNB()
clf.fit(xtrain_tf, df.target)

# Fetch the test split, using the same shuffle settings as the training fetch.
df_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
    random_state=42,
)

# print(df_test.data[0])

# naive bayes:

In [14]:
from sklearn.pipeline import Pipeline
# Two counts -> TF-IDF -> MultinomialNB pipelines that differ only in whether
# English stop words are dropped by the vectorizer.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf1 = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Pipeline.fit returns the fitted pipeline, so fitting and predicting chain.
pred = text_clf.fit(df.data, df.target).predict(df_test.data)
pred1 = text_clf1.fit(df.data, df.target).predict(df_test.data)

# Test accuracy: fraction of predictions equal to the true labels
# (all words first, stop words removed second).
print((pred == df_test.target).mean())
print((pred1 == df_test.target).mean())

0.7738980350504514
0.8169144981412639


# multilayer perceptron:

In [11]:
from sklearn.neural_network import MLPClassifier

# Counts (English stop words removed) -> TF-IDF -> MLP with a single hidden
# layer of 15 units, trained with the L-BFGS solver and a fixed seed.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', MLPClassifier(
        solver='lbfgs',
        alpha=1e-5,
        hidden_layer_sizes=(15,),
        random_state=1,
    )),
])

# Fit on the training split, then predict labels for the test split.
pred = text_clf.fit(df.data, df.target).predict(df_test.data)

# Test accuracy: fraction of predictions equal to the true labels.
print((pred == df_test.target).mean())

0.7351301115241635


# logistic regression:

In [12]:
from sklearn.linear_model import LogisticRegression

# Counts (English stop words removed) -> TF-IDF -> logistic regression with
# the library's default settings.
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression()),
])

# Fit on the training split, then predict labels for the test split.
pred = text_clf.fit(df.data, df.target).predict(df_test.data)

# Test accuracy: fraction of predictions equal to the true labels.
print((pred == df_test.target).mean())

0.8297928836962294


# svm model:

In [15]:
from sklearn.svm import LinearSVC

# Two counts -> TF-IDF -> LinearSVC pipelines that differ only in whether
# English stop words are dropped by the vectorizer.
#
# LinearSVC uses a pseudo-random number generator while fitting, so it is
# seeded here (like the data fetches and the MLP elsewhere in this notebook)
# to keep the reported accuracies reproducible across runs.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=42)),
])
text_clf1 = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(random_state=42)),
])

# Fit both variants on the training split.
text_clf.fit(df.data, df.target)
text_clf1.fit(df.data, df.target)

# Predict labels for the test split with each variant.
pred = text_clf.predict(df_test.data)
pred1 = text_clf1.predict(df_test.data)

# Test accuracy: fraction of predictions equal to the true labels
# (all words first, stop words removed second).
print(np.mean(pred == df_test.target))
print(np.mean(pred1 == df_test.target))

0.8531598513011153
0.851035581518853


# example to test classification:

In [20]:
# Sample inputs for a manual spot check: a list of raw post texts to classify.
te=[]
te.append("""From: stark@dwovax.enet.dec.com (Todd I. Stark)
Subject: Re: Krillean Photography
Organization: Digital Equipment Corporation
Lines: 52
Distribution: world
NNTP-Posting-Host: DWOVAX


In article <1rjr1uINNh8@gap.caltech.edu>, carl@SOL1.GPS.CALTECH.EDU (Carl J Lydick) writes...
>In article <1993Apr26.204319.11231@ultb.isc.rit.edu>, eas3714@ultb.isc.rit.edu (E.A. Story) writes:
>=In article <1rgrsvINNmpr@gap.caltech.edu> carl@SOL1.GPS.CALTECH.EDU writes:
>=>Greg:Flame definitely intended here.  Bill was making fun of the misspelling. 
>=>Go look up the word "krill."  Also, the correct spelling is Kirlian.  It
>=>involves taking photographs of corona discharges created by attaching the
>=>subject to a high-voltage source, not of some "aura."  It works equally well
>=>with inanimate objects.
>=
>=True.. but what about showing the missing part of a leaf?  Is this
>="corona discharge"?
> 
>Yup.  The demonstration to which you refer consists of placing a leaf between
>the plates, and taking a Kirlian photograph of it.  You then cut off part of
>the leaf, put the top plate back on, and take another Kirlian photograph.  You
>see pretty much the same image in both cases.  Turns out the effect isn't
>nearly so striking if you take the trouble to clean the plates between
>photographs.  Seems that the moisture from the leaf that you left on the place
>conducts electricity.  Surprise, surprise!

	This is true, but it's not quite the whole story.  There were 
	actually some people who were more careful in their methodology
	who also replicated the 'phantom leaf effect.'

    One of the most influential critics of Kirlian Electrophotography
    is a Theosophist (and threfore presumably willing to entertain the
    hypothesis of scientific evidence for a human aura, electromagnetic
    or otherwise), professor of electrical engineering at London's
    City University, and a past president of the Society for Psychic Research 
    named A. J. Ellison.

    After years of studying the method and the claims, Ellison
    came to the conclusion that the photographic images are what we
    calls 'Lichtenberg Figures,' an effect of intermittent ionization of
    the air around the object.  It's a bit more complicated than
    'not wiping off the plates,' but it comes down to the same thing
    in the end, Kirlian electrophotography has much more limited
    value (if any) than was previously widely thought.  Electrical and
    magnetic fields generated by the body are much too small to be
    of much use diagnostically without very elaborate equipment and
    usually also tracer chemicals.

					kind regards,

					todd
+-----------------------------------------------------------------------------+
| Todd I. Stark				  stark@dwovax.enet.dec.com           |
| Digital Equipment Corporation		             (215) 542-3573           |
| Philadelphia, Pa. USA                                                       |
|    "(A word is) the skin of a living thought"  Oliver Wendell Holmes, Jr.   |
+-----------------------------------------------------------------------------+
""")
# Print the predicted newsgroup name for every sample document in `te`.
# `text_clf` is whichever pipeline was assigned most recently; in notebook
# order that is the LinearSVC pipeline built without stop-word removal.
for label_id in text_clf.predict(te):
    group_name = df.target_names[label_id]
    print(group_name)

sci.med
