In [35]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score 

In [4]:
news = fetch_20newsgroups(subset="all", remove=("headers", "footers"))
news

{'data': ["\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n",
  'My brother is in the market for a high-performance video card that supports\nVESA local bus with 1-2MB RAM.  Does anyone have suggestions/ideas on:\n\n  - Diamond Stealth Pro Local Bus\n\n  - Orchid Farenheit 1280\n\n  - ATI Graphics Ultra Pro\n\n  - Any othe

In [5]:
news.filenames

array(['/home/aditya/scikit_learn_data/20news_home/20news-bydate-test/rec.sport.hockey/54367',
       '/home/aditya/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60215',
       '/home/aditya/scikit_learn_data/20news_home/20news-bydate-train/talk.politics.mideast/76120',
       ...,
       '/home/aditya/scikit_learn_data/20news_home/20news-bydate-train/comp.sys.ibm.pc.hardware/60695',
       '/home/aditya/scikit_learn_data/20news_home/20news-bydate-train/comp.graphics/38319',
       '/home/aditya/scikit_learn_data/20news_home/20news-bydate-test/rec.autos/103195'],
      dtype='<U93')

In [6]:
news.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [7]:
news.target

array([10,  3, 17, ...,  3,  1,  7])

In [8]:
news.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [9]:
len(news.target)

18846

In [10]:
len(news.target_names)

20

In [11]:
dataframe = pd.DataFrame(
    data = news.data,
    columns = ["News"])

In [12]:
dataframe["Label"] = news.target

In [13]:
dataframe

Unnamed: 0,News,Label
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,"|>The student of ""regional killings"" alias Dav...",17
3,In article <1993Apr19.034517.12820@julian.uwo....,3
4,1) I have an old Jasmine drive which I cann...,4
5,In article <7480237@hpfcso.FC.HP.COM> myers@hp...,12
6,>Anyone have a phone number for Applied Engine...,4
7,In article <0foVj7i00WB4MIUmht@andrew.cmu.edu>...,10
8,In article <C5sqz3.EG8@acsu.buffalo.edu> hamme...,10
9,In article <C61Kow.E4z@mailer.cc.fsu.edu> dlec...,19


In [14]:
print(dataframe.iloc[0, :].values[0])



I am sure some bashers of Pens fans are pretty confused about the lack
of any kind of posts about the recent Pens massacre of the Devils. Actually,
I am  bit puzzled too and a bit relieved. However, I am going to put an end
to non-PIttsburghers' relief with a bit of praise for the Pens. Man, they
are killing those Devils worse than I thought. Jagr just showed you why
he is much better than his regular season stats. He is also a lot
fo fun to watch in the playoffs. Bowman should let JAgr have a lot of
fun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final
regular season game.          PENS RULE!!!




In [15]:
print(dataframe.iloc[0, :].values[1])

10


In [16]:
print(news.target_names[10])

rec.sport.hockey


In [17]:
new_dataframe = pd.DataFrame(
    data = dataframe.iloc[0:5000, :],
    columns = ["News"])

In [18]:
new_dataframe["Label"] = dataframe.iloc[0:5000, 1]

In [19]:
new_dataframe

Unnamed: 0,News,Label
0,\n\nI am sure some bashers of Pens fans are pr...,10
1,My brother is in the market for a high-perform...,3
2,"|>The student of ""regional killings"" alias Dav...",17
3,In article <1993Apr19.034517.12820@julian.uwo....,3
4,1) I have an old Jasmine drive which I cann...,4
5,In article <7480237@hpfcso.FC.HP.COM> myers@hp...,12
6,>Anyone have a phone number for Applied Engine...,4
7,In article <0foVj7i00WB4MIUmht@andrew.cmu.edu>...,10
8,In article <C5sqz3.EG8@acsu.buffalo.edu> hamme...,10
9,In article <C61Kow.E4z@mailer.cc.fsu.edu> dlec...,19


In [20]:
tf_idf_vect = TfidfVectorizer()

In [21]:
tf_idf_fitted_transformed = tf_idf_vect.fit_transform(new_dataframe["News"])

In [22]:
tf_idf_fitted_transformed

<5000x71904 sparse matrix of type '<class 'numpy.float64'>'
	with 648701 stored elements in Compressed Sparse Row format>

In [23]:
data_tfidf = np.asarray(tf_idf_fitted_transformed.toarray())

In [24]:
new_dataframe_tf_idf = pd.DataFrame(
    data = data_tfidf,
)

In [25]:
new_dataframe_tf_idf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71894,71895,71896,71897,71898,71899,71900,71901,71902,71903
0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.186993,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
x = new_dataframe_tf_idf.iloc[:, :]
y = new_dataframe.loc[:, "Label"]
XTrain, XTest, YTrain, YTest = train_test_split(x, y, test_size=0.2, random_state=10)

In [27]:
from sklearn.ensemble import RandomForestClassifier

In [28]:
YTrain

2733     7
447      4
970     12
125      4
798     12
1337     5
4572     1
4002    14
56       0
567     11
4037     3
1451     2
885     19
929      2
4444     9
4147     4
1938    13
3168    11
3641     1
3290    19
2803    16
1766    12
4749    11
2094    16
278     11
4242    17
3543    12
1353    13
3082    14
2087     5
        ..
1366    17
3932    12
653      5
1406     8
409     14
4109    16
4173     4
3126     7
974      4
574      8
3435    18
3416    15
2102    12
2443     9
239      5
4452    16
2550    15
4136     5
1097    19
1032     0
2042     6
1949     7
1520    18
4829    13
2009    18
1180     2
3441    14
1344    16
4623     7
1289     8
Name: Label, Length: 4000, dtype: int64

In [29]:
RandomForest = RandomForestClassifier()

In [30]:
RandomForest.fit(XTrain, YTrain)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [31]:
predictions = RandomForest.predict(XTest)

In [34]:
accuracy_score(predictions, YTest)

0.474

# Pre Processing Data 
Stripping, Stemming

In [36]:
from gensim.parsing import preprocessing, remove_stopwords

In [40]:
FILTERS = [
    preprocessing.strip_numeric,
    preprocessing.strip_non_alphanum,
    preprocessing.strip_multiple_whitespaces,
    preprocessing.strip_punctuation,
    preprocessing.strip_tags,
    preprocessing.stem_text
]

def clean(string):
    word_list = preprocessing.preprocess_string(string, FILTERS)
    return ' '.join(word.lower() for word in word_list)

In [45]:
clean_news_list = [clean(news) for news in new_dataframe["News"]]
# for news in new_dataframe["News"]:
#     clean_news = clean(news)
#     clean_news_list.append(clean_news)

In [46]:
clean_news_list

['i am sure some basher of pen fan ar pretti confus about the lack of ani kind of post about the recent pen massacr of the devil actual i am bit puzzl too and a bit reliev howev i am go to put an end to non pittsburgh relief with a bit of prais for the pen man thei ar kill those devil wors than i thought jagr just show you why he is much better than hi regular season stat he is also a lot fo fun to watch in the playoff bowman should let jagr have a lot of fun in the next coupl of game sinc the pen ar go to beat the pulp out of jersei anywai i wa veri disappoint not to see the island lose the final regular season game pen rule',
 'my brother is in the market for a high perform video card that support vesa local bu with mb ram doe anyon have suggest idea on diamond stealth pro local bu orchid farenheit ati graphic ultra pro ani other high perform vlb card pleas post or email thank you matt',
 'the student of region kill alia davidian not the davidian religio sect write greater armenia wo

In [47]:
new_dataframe["Clean News"] = clean_news_list

In [48]:
new_dataframe

Unnamed: 0,News,Label,Clean News
0,\n\nI am sure some bashers of Pens fans are pr...,10,i am sure some basher of pen fan ar pretti con...
1,My brother is in the market for a high-perform...,3,my brother is in the market for a high perform...
2,"|>The student of ""regional killings"" alias Dav...",17,the student of region kill alia davidian not t...
3,In article <1993Apr19.034517.12820@julian.uwo....,3,in articl apr julian uwo ca wlsmith valv heart...
4,1) I have an old Jasmine drive which I cann...,4,i have an old jasmin drive which i cannot us w...
5,In article <7480237@hpfcso.FC.HP.COM> myers@hp...,12,in articl hpfcso fc hp com myer hpfcso fc hp c...
6,>Anyone have a phone number for Applied Engine...,4,anyon have a phone number for appli engin so i...
7,In article <0foVj7i00WB4MIUmht@andrew.cmu.edu>...,10,in articl fovjiwbmiumht andrew cmu edu mamatha...
8,In article <C5sqz3.EG8@acsu.buffalo.edu> hamme...,10,in articl csqz eg acsu buffalo edu hammerl acs...
9,In article <C61Kow.E4z@mailer.cc.fsu.edu> dlec...,19,in articl ckow ez mailer cc fsu edu dlecoint g...


In [53]:
tf_idf_vect_clean = TfidfVectorizer()

In [54]:
tf_idf_fitted_transformed_clean = tf_idf_vect.fit_transform(new_dataframe["Clean News"])

In [55]:
tf_idf_fitted_transformed_clean

<5000x43696 sparse matrix of type '<class 'numpy.float64'>'
	with 586006 stored elements in Compressed Sparse Row format>

In [56]:
data_tfidf_clean = np.asarray(tf_idf_fitted_transformed.toarray())

In [58]:
new_dataframe_tf_idf_clean = pd.DataFrame(
    data = data_tfidf_clean,
)

In [60]:
x = new_dataframe_tf_idf_clean.iloc[:, :]
y = new_dataframe.loc[:, "Label"]
XTrain, XTest, YTrain, YTest = train_test_split(x, y, test_size=0.2, random_state=10)

In [61]:
RandomForestClean = RandomForestClassifier()

In [63]:
RandomForestClean.fit(XTrain, YTrain)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [65]:
RandomForestClean.predict(XTest)

array([ 6,  6,  9,  2, 12, 11, 14,  4,  5, 15, 12,  5,  8, 10,  6,  5,  1,
        8,  1, 13, 15, 13, 12,  3,  5, 14,  0,  4,  9,  1,  6,  5,  0,  6,
       18,  3,  8, 15,  2,  2,  0,  8,  2, 10,  1,  8, 12,  5,  4,  5, 12,
       13, 15,  6, 17,  3, 16,  6, 11,  2,  2,  5,  8, 11,  3, 17,  0, 11,
        5,  0,  0,  8, 10, 14,  3,  1, 10,  1,  9,  2, 13, 15,  4,  6,  1,
        7,  0,  9,  2, 10, 15, 11,  3,  3, 13,  4,  2,  5, 17, 10, 15,  0,
       17,  4,  2,  9, 17, 14,  8, 18,  7,  9,  5,  1,  6, 18,  3, 17, 11,
       11, 15, 17,  6, 15, 13, 13,  7,  5,  1,  7,  9,  6, 10,  5,  1,  3,
       17, 15,  3,  8,  6,  7,  4,  9, 12, 14,  5,  2, 14,  8,  2, 11,  1,
        3, 15, 12, 15,  5,  7,  1, 17,  1, 10,  8,  1,  1,  0,  5,  1,  8,
       14,  7,  2,  0,  6, 10,  6,  8,  7,  4,  4,  8, 10,  8, 14,  0, 10,
       19,  0,  1,  5,  8,  2, 13,  3,  5, 15,  5, 11,  4, 16, 16,  1, 10,
        3, 15,  1,  1,  4,  2, 12, 11, 15, 11, 15,  3,  3,  1, 17, 11,  3,
       18,  5,  5,  0,  1

In [66]:
accuracy_score(RandomForestClean.predict(XTest), YTest)

0.494

In [67]:
RandomForestClean.predict(XTrain)

array([ 7,  4, 12, ..., 16,  7,  8])

In [68]:
accuracy_score(RandomForestClean.predict(XTrain), YTrain)

0.99625

In [69]:
RandomForestClean = RandomForestClassifier(n_jobs=-1, n_estimators=200, criterion="gini")

In [70]:
RandomForestClean.fit(XTrain, YTrain)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [71]:
predictions_clean = RandomForestClean.predict(XTest)

In [72]:
accuracy_score(predictions_clean, YTest)

0.72

In [75]:
accuracy_score(RandomForestClean.predict(XTrain), YTrain)

0.99875