# Select 10000 most frequent words in documents

In [248]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
#from nltk import word_tokenize          
#from nltk.stem import WordNetLemmatizer
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline

import pandas as pd

## Importing the 20newsgroup data

In [249]:
#Loading the data set.
from sklearn.datasets import fetch_20newsgroups
categories = ['rec.motorcycles', 'comp.sys.ibm.pc.hardware','soc.religion.christian', 'rec.sport.hockey', 'misc.forsale']
mydata = fetch_20newsgroups(categories=categories, shuffle=True, remove = ('headers', 'footers', 'quotes'))

## Data Preparation
We will perform preparation steps such as Convert everything to lowercase, Removing stopwords and converting text data to a vector representation

In [250]:
mydata_df = pd.DataFrame({'data': mydata.data, 'target': mydata.target})
mydata_df.head()

Unnamed: 0,data,target
0,"\nYes, yes, yes. Motorcycling is slightly dif...",2
1,Diamond engagement ring. 14k gold band. 33po...,1
2,\n\nThere are a lot of people running around s...,4
3,": \n: >I bought it, I tried it:\n: \n: >It is,...",2
4,Two questions:\n1: I'm trying to figure out ho...,0


In [251]:
# Text preprocessing steps - remove numbers, captial letters and punctuation
import re
import string

alphanumeric = lambda x: re.sub(r"""\w*\d\w*""", ' ', x)
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower())

mydata_df['data'] = mydata_df.data.map(alphanumeric).map(punc_lower)
print(mydata_df.head())
print(mydata_df.shape)
print(mydata_df.target.sort_values())

                                                data  target
0  \nyes  yes  yes   motorcycling is slightly dif...       2
1  diamond engagement ring     gold band      dia...       1
2  \n\nthere are a lot of people running around s...       4
3    \n   i bought it  i tried it \n  \n   it is ...       2
4  two questions \n   i m trying to figure out ho...       0
(2972, 2)
977     0
1352    0
2049    0
2045    0
2037    0
       ..
666     4
668     4
1590    4
1580    4
474     4
Name: target, Length: 2972, dtype: int64


## Count Vectorizer

In [252]:
# Extracting features from text files
from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(stop_words='english')

X_cv = count_vect.fit_transform(mydata_df.data)  # fit_transform learns the vocab and one-hot encodes

print(X_cv.shape)

(2972, 25248)


In [253]:
X_cv_df = pd.DataFrame(X_cv.todense())
X_cv_df.columns = sorted(count_vect.vocabulary_)

X_cv_df.head()

Unnamed: 0,aa,aaa,aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaauuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuugggggggggggggggg,aaahhhh,aacc,aamazing,aamrl,aanbieden,aangezien,aargh,...,zr,zubov,zuiko,zumdahl,zupancic,zurich,zwl,zx,zygot,zzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [254]:
# We will sum the number of ocurrences and the number of documents appeared.

sums = X_cv_df.sum()
print(sums)
print()

sums_documents = X_cv_df.astype(bool).sum(axis=0)
print(sums_documents)

aa                                                                                  5
aaa                                                                                 1
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaauuuuuuuuuuuuuuuuuuuuuuuuuuuuuuuugggggggggggggggg     1
aaahhhh                                                                             1
aacc                                                                                1
                                                                                   ..
zurich                                                                              1
zwl                                                                                 1
zx                                                                                 15
zygot                                                                               1
zzzzzz                                                                              1
Length: 25248, dtype: int64

aa                       

In [255]:
sums_sorted = np.argsort(np.argsort(sums)).to_list()
df_indexes = pd.DataFrame({'Initial index': range(25248), 'Sorted index': sums_sorted})
df_indexes = df_indexes.sort_values('Sorted index', ignore_index=True)
df_indexes

Unnamed: 0,Initial index,Sorted index
0,25247,0
1,9687,1
2,9688,2
3,9689,3
4,9691,4
...,...,...
25243,6583,25243
25244,12336,25244
25245,12031,25245
25246,12943,25246


## Dataset 1

In [256]:
# We will select the 10 000 words that are most frequent and occur in not too small number of documents.

i = 25247
attributes = 0
attributes_10000_position = []

while attributes < 10000 and i >= 0: 
    index = df_indexes.at[i, 'Initial index']
    if sums_documents[index] > 1: 
        attributes_10000_position.append(index)
        attributes = attributes + 1
        
    i = i - 1
                                         
len(attributes_10000_position)

10000

In [257]:
attributes_names = X_cv_df.columns
X_cv_df = X_cv_df[attributes_names[attributes_10000_position]]
X_cv_df

Unnamed: 0,god,like,just,know,don,new,time,good,think,people,...,craving,tenderness,cakewalk,lighted,feds,fostering,victories,pwr,lamps,signifies
0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,2,1,1,0,1,0,3,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2967,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2969,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2970,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [258]:
# Insert in the fir
target = mydata_df.target.to_list()
transformation = {0: 'rec.motorcycles', 1: 'comp.sys.ibm.pc.hardware', 2: 'soc.religion.christian', 3: 'rec.sport.hockey', 4: 'misc.forsale'}

new_target = [transformation[v] for v in target]

X_cv_df["Category"] = new_target
X_cv_df.head()

Unnamed: 0,god,like,just,know,don,new,time,good,think,people,...,tenderness,cakewalk,lighted,feds,fostering,victories,pwr,lamps,signifies,Category
0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,soc.religion.christian
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,comp.sys.ibm.pc.hardware
2,5,0,0,2,1,1,0,1,0,3,...,0,0,0,0,0,0,0,0,0,misc.forsale
3,0,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,soc.religion.christian
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,rec.motorcycles


In [259]:
X_cv_df.to_csv('dataset1.csv', index=None, sep=',')

## Dataset 2

In [234]:
# We will select the most frequent words that occur in more than 5 documents.

i = 25247
attributes = 0
attributes_position = []

while attributes < 10000 and i >= 0: 
    index = df_indexes.at[i, 'Initial index']
    if sums_documents[index] > 5: 
        attributes_position.append(index)
        attributes = attributes + 1
        
    i = i - 1
                                         
len(attributes_position)

4906

In [235]:
attributes_names = X_cv_df.columns
X_cv_df2 = X_cv_df[attributes_names[attributes_position]]
X_cv_df2

Unnamed: 0,god,like,just,know,don,new,time,good,think,people,...,violation,bell,viewing,bdi,girlfriend,porsche,bent,va,pound,beaten
0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,5,0,0,2,1,1,0,1,0,3,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2967,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2968,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2969,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2970,0,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [237]:
# Insert in the fir
target = mydata_df.target.to_list()
transformation = {0: 'rec.motorcycles', 1: 'comp.sys.ibm.pc.hardware', 2: 'rec.motorcycles''soc.religion.christian', 3: 'rec.sport.hockey', 4: 'misc.forsale'}

new_target = [transformation[v] for v in target]

X_cv_df2["Category"] = new_target
X_cv_df2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_cv_df2["Category"] = new_target


Unnamed: 0,god,like,just,know,don,new,time,good,think,people,...,bell,viewing,bdi,girlfriend,porsche,bent,va,pound,beaten,Category
0,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,soc.religion.christian
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,comp.sys.ibm.pc.hardware
2,5,0,0,2,1,1,0,1,0,3,...,0,0,0,0,0,0,0,0,0,misc.forsale
3,0,0,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,soc.religion.christian
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,rec.motorcycles


In [238]:
X_cv_df2.to_csv('dataset2.csv', index=None, sep=',')

In [265]:
mydata_df.head(20)

Unnamed: 0,data,target
0,\nyes yes yes motorcycling is slightly dif...,2
1,diamond engagement ring gold band dia...,1
2,\n\nthere are a lot of people running around s...,4
3,\n i bought it i tried it \n \n it is ...,2
4,two questions \n i m trying to figure out ho...,0
5,anyone who really believes that the caps can b...,3
6,\ni ll post a summary after i get enough infor...,2
7,\n it is certainly possible and quite easy...,2
8,\n\ntrashy move from a trashy organization af...,3
9,\ngee you d think winnipeg would be tops on t...,3
