In [None]:
from sklearn.datasets import fetch_20newsgroups

# Load the complete corpus (train and test subsets together) for a first look;
# random_state pins the seed passed to the loader.
news_data = fetch_20newsgroups(subset='all', random_state=1)

In [None]:
# List the fields exposed by the loaded dataset object.
dataset_fields = news_data.keys()
print(dataset_fields)

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])


As shown below, the target takes 20 distinct values, and the name of each class can be looked up in `target_names`.

In [None]:
import pandas as pd

# Count how many documents carry each integer label (ordered by label value),
# then show the newsgroup names stored alongside the labels.
label_counts = pd.Series(news_data.target).value_counts()
label_counts = label_counts.sort_index()
print('value of target and distribution \n', label_counts)
print('name of target \n', news_data.target_names)

value of target and distribution 
 0     799
1     973
2     985
3     982
4     963
5     988
6     975
7     990
8     996
9     994
10    999
11    991
12    984
13    990
14    987
15    997
16    910
17    940
18    775
19    628
dtype: int64
name of target 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


# preprocessing

To see what an individual sample looks like, print the first document.

As you can see, each document contains various kinds of extra information such as e-mail addresses, the subject line, author info, etc. We only want the body of the article, so we have to remove everything else.

In [None]:
# Show one raw post exactly as stored in the unfiltered dataset.
first_post = news_data.data[0]
print(first_post)

From: pharvey@quack.kfu.com (Paul Harvey)
Subject: Re: Clarification of personal position
Organization: The Duck Pond public unix: +1 408 249 9630, log in as 'guest'.
Lines: 26

In article <C5rBHt.Fw4@athena.cs.uga.edu> 
hudson@athena.cs.uga.edu (Paul Hudson Jr) writes:
>In article <C5MuIw.AqC@mailer.cc.fsu.edu> 
dlecoint@garnet.acns.fsu.edu (Darius_Lecointe) writes:
>>If it were a sin to violate Sunday no one could
>>ever be forgiven for that for Jesus never kept Sunday holy.  He only
>>recognized one day of the seven as holy.
>Jesus also recognized other holy days, like the Passover.  Acts 15 says 
>that no more should be layed on the Gentiles than that which is necessary.
>The sabbath is not in the list, nor do any of the epistles instruct people
>to keep the 7th day, while Christians were living among people who did not
>keep the 7th day.  It looks like that would have been a problem.
>Instead, we have Scriptures telling us that all days can be esteemed alike
>(Romans 14:5) and tha

In [None]:


# Reload the predefined train/test subsets with headers, footers and quoted
# replies stripped, so the models see the post text without that metadata.
stripped_parts = ('headers', 'footers', 'quotes')

# train data
train_news = fetch_20newsgroups(subset='train', remove=stripped_parts, random_state=1)
X_train, y_train = train_news.data, train_news.target

# test data
test_news = fetch_20newsgroups(subset='test', remove=stripped_parts, random_state=1)
X_test, y_test = test_news.data, test_news.target

# X_train / X_test are the same list objects as train_news.data / test_news.data.
split_sizes = (len(X_train), len(X_test))
print('train data : {0}, test data : {1}'.format(*split_sizes))

train data : 11314, test data : 7532


# feature vector

For this data, we will use the Bag of Words model. It ignores word order and uses the frequency of each word as the feature vector. There are two types of Bag of Words model.

* based on count

It simply assigns each word its raw frequency.

* based on TF-IDF(Term Frequency - Inverse Document Frequency)

It assigns each word its frequency within each document and penalizes words that appear frequently across all documents.

# count

First, we will build the count feature vector and apply it to logistic regression, linear SVM, and naive Bayes models. Let's compare the accuracy.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Feature vector based on raw word counts. The vocabulary is learned from the
# training documents only; fit_transform learns it and builds the training
# matrix in a single pass (equivalent to fit followed by transform).
cnt_vect = CountVectorizer()
X_train_cnt_vect = cnt_vect.fit_transform(X_train)

# Apply the already-fitted vocabulary to the test data (no refit on test).
X_test_cnt_vect = cnt_vect.transform(X_test)

# Read the sizes from the matrix itself rather than hard-coding them, so the
# explanatory message stays correct if the data or vectorizer settings change.
n_docs, n_terms = X_train_cnt_vect.shape
print('CountVectorizer of train data: ', X_train_cnt_vect.shape)
print('it means there are {0} of context and {1} of different word in here!'.format(n_docs, n_terms))

CountVectorizer of train data:  (11314, 101631)
it means there are 11314 of context and 101631 of different word in here!


In [None]:
# apply to logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# With the default iteration cap the solver stopped before converging on the
# raw count features (scikit-learn emitted "TOTAL NO. of ITERATIONS REACHED
# LIMIT"), so the score came from an unconverged model. Raise the cap as the
# warning itself recommends; increase it further if the warning still appears.
# NOTE: this makes the fit slower than with the default setting.
lr_clf = LogisticRegression(max_iter=1000)
lr_clf.fit(X_train_cnt_vect, y_train)
pred_1 = lr_clf.predict(X_test_cnt_vect)

# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so
# the argument order does not affect the value. The previous bare
# accuracy_score(...) call whose result was discarded has been removed.
print('accuracy of logistic : ', accuracy_score(y_test, pred_1))

accuracy of logistic :  0.6060807222517259


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# apply linear SVM
from sklearn.svm import LinearSVC

# Train a linear SVM on the count matrix (fit returns the estimator itself),
# predict the test matrix, and report the share of correct predictions.
ls_clf = LinearSVC().fit(X_train_cnt_vect, y_train)
pred_2 = ls_clf.predict(X_test_cnt_vect)
svm_cnt_accuracy = accuracy_score(pred_2, y_test)
print('accuracy of linear SVM : ', svm_cnt_accuracy)

accuracy of linear SVM :  0.571959638874137




In [None]:
# apply naive bayes
from sklearn.naive_bayes import BernoulliNB

# NOTE(review): per the scikit-learn docs BernoulliNB thresholds its input
# (default binarize=0.0), so only word presence/absence is used here, not
# the actual counts — confirm this is the intended naive Bayes variant.
nb_clf = BernoulliNB().fit(X_train_cnt_vect, y_train)
pred_3 = nb_clf.predict(X_test_cnt_vect)
nb_cnt_accuracy = accuracy_score(pred_3, y_test)
print('accuracy of naive bayes : ', nb_cnt_accuracy)

accuracy of naive bayes :  0.4579129049389272


# TF-IDF

Second, we will build the TF-IDF feature vector and apply it to logistic regression, linear SVM, and naive Bayes models. Let's compare the accuracy.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature vector based on TF-IDF weights (default settings). The vocabulary and
# IDF statistics are learned from the training documents only; fit_transform is
# equivalent to calling fit and then transform on the same data.
tfidf_vect = TfidfVectorizer()
X_train_tfidf_vect = tfidf_vect.fit_transform(X_train)

# Re-use the fitted vectorizer for the test data.
X_test_tfidf_vect = tfidf_vect.transform(X_test)

In [None]:
# apply to logistic regression

# A fresh estimator is bound to lr_clf (replacing any earlier one) and trained
# on the TF-IDF matrix.
lr_clf = LogisticRegression().fit(X_train_tfidf_vect, y_train)
pred_4 = lr_clf.predict(X_test_tfidf_vect)
lr_tfidf_accuracy = accuracy_score(pred_4, y_test)
print('accuracy of logistic : ', lr_tfidf_accuracy)

accuracy of logistic :  0.6736590546999469


In [None]:
# apply linear SVM

# A fresh LinearSVC is bound to ls_clf (replacing any earlier one) and trained
# on the TF-IDF matrix.
ls_clf = LinearSVC().fit(X_train_tfidf_vect, y_train)
pred_5 = ls_clf.predict(X_test_tfidf_vect)
svm_tfidf_accuracy = accuracy_score(pred_5, y_test)
print('accuracy of linear SVM : ', svm_tfidf_accuracy)

accuracy of linear SVM :  0.6919808815719597


In [None]:
# apply naive bayes

# NOTE(review): BernoulliNB thresholds its input (default binarize=0.0 per the
# scikit-learn docs), so the TF-IDF weights are reduced to presence/absence.
# That would explain why this score equals the count-based one — consider a
# naive Bayes variant that uses the weights if that is not intended.
nb_clf = BernoulliNB().fit(X_train_tfidf_vect, y_train)
pred_6 = nb_clf.predict(X_test_tfidf_vect)
nb_tfidf_accuracy = accuracy_score(pred_6, y_test)
print('accuracy of naive bayes : ', nb_tfidf_accuracy)

accuracy of naive bayes :  0.4579129049389272


# TF-IDF with hyperparameter

Finally, we will use the TF-IDF feature vector again, this time setting some hyperparameters on the vectorizer. Let's compare the accuracy.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Feature vector based on TF-IDF with parameters: English stop words removed,
# unigrams and bigrams as terms, and terms occurring in more than 300
# documents ignored (an integer max_df is an absolute document count).
tfidf_vect_p = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)
X_train_tfidf_vect = tfidf_vect_p.fit_transform(X_train)

# Apply the fitted vectorizer to the test data.
# NOTE(review): both assignments rebind X_train_tfidf_vect / X_test_tfidf_vect,
# overwriting the default-setting TF-IDF matrices; re-running earlier TF-IDF
# model cells after this one would silently use these features instead.
X_test_tfidf_vect = tfidf_vect_p.transform(X_test)

In [None]:
# apply to logistic regression

# Train a fresh logistic regression on whatever X_train_tfidf_vect currently
# holds and score it on the matching test matrix.
lr_clf = LogisticRegression().fit(X_train_tfidf_vect, y_train)
pred_7 = lr_clf.predict(X_test_tfidf_vect)
lr_tfidf_p_accuracy = accuracy_score(pred_7, y_test)
print('accuracy of logistic : ', lr_tfidf_p_accuracy)

accuracy of logistic :  0.6922464152947424


In [None]:
# apply linear SVM

# Train a fresh LinearSVC on whatever X_train_tfidf_vect currently holds and
# score it on the matching test matrix.
ls_clf = LinearSVC().fit(X_train_tfidf_vect, y_train)
pred_8 = ls_clf.predict(X_test_tfidf_vect)
svm_tfidf_p_accuracy = accuracy_score(pred_8, y_test)
print('accuracy of linear SVM : ', svm_tfidf_p_accuracy)

accuracy of linear SVM :  0.7060541688794477


In [None]:
# apply naive bayes

# NOTE(review): BernoulliNB thresholds its input (default binarize=0.0 per the
# scikit-learn docs), so the TF-IDF weights are reduced to presence/absence
# of each term — confirm this is the intended naive Bayes variant.
nb_clf = BernoulliNB().fit(X_train_tfidf_vect, y_train)
pred_9 = nb_clf.predict(X_test_tfidf_vect)
nb_tfidf_p_accuracy = accuracy_score(pred_9, y_test)
print('accuracy of naive bayes : ', nb_tfidf_p_accuracy)

accuracy of naive bayes :  0.20061072756240042


# conclusion

Among the 9 models, the linear SVM trained on the TF-IDF features with parameters is the most accurate!

In [None]:
# Build the comparison table from the stored predictions instead of typing the
# numbers by hand, so the table cannot drift from the actual results (the
# hand-typed version showed 0.6068 for count + logistic regression although
# that model's cell printed 0.6060..., and mixed rounding with truncation).
# Rows are feature-vector types; columns are logistic regression, linear SVM,
# and naive Bayes, in that order.
accuracy_table = [
    ('count \t\t', (pred_1, pred_2, pred_3)),
    ('TF-IDF \t\t', (pred_4, pred_5, pred_6)),
    # Distinct label: previously both TF-IDF rows were labelled 'TF-IDF'.
    ('TF-IDF+param\t', (pred_7, pred_8, pred_9)),
]

print('\t\t','logistic\t','SVM\t\t','naive bayes')
print('-'*80)
for row_label, row_preds in accuracy_table:
    # accuracy_score takes (y_true, y_pred); format to four decimals.
    lr_acc, svm_acc, nb_acc = ('{:.4f}'.format(accuracy_score(y_test, p)) for p in row_preds)
    print(row_label, lr_acc + '\t\t', svm_acc + '\t\t', nb_acc)


		 logistic	 SVM		 naive bayes
--------------------------------------------------------------------------------
count 		 0.6068		 0.5720		 0.4579
TF-IDF 		 0.6736		 0.6919		 0.4579
TF-IDF		 0.6922		 0.7060		 0.2006
