# 20 뉴스 그룹 분류

In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import fetch_20newsgroups
news_data = fetch_20newsgroups(subset='all',random_state=156)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


### 데이터 탐색

In [4]:
news_data.keys()

dict_keys(['data', 'filenames', 'target_names', 'target', 'DESCR'])

In [5]:
print('target 클래스의 값과 분포도')
print(news_data.DESCR)

target 클래스의 값과 분포도
.. _20newsgroups_dataset:

The 20 newsgroups text dataset
------------------------------

The 20 newsgroups dataset comprises around 18000 newsgroups posts on
20 topics split in two subsets: one for training (or development)
and the other one for testing (or for performance evaluation). The split
between the train and test set is based upon a messages posted before
and after a specific date.

This module contains two loaders. The first one,
:func:`sklearn.datasets.fetch_20newsgroups`,
returns a list of the raw texts that can be fed to text feature
extractors such as :class:`sklearn.feature_extraction.text.CountVectorizer`
with custom parameters so as to extract feature vectors.
The second one, :func:`sklearn.datasets.fetch_20newsgroups_vectorized`,
returns ready-to-use features, i.e., it is not necessary to use a feature
extractor.

**Data Set Characteristics:**

    Classes                     20
    Samples total            18846
    Dimensionality               1


In [6]:
print('target 클래스의 이름들 \n',news_data.target_names)

target 클래스의 이름들 
 ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']


In [7]:
train_news = fetch_20newsgroups(subset='train',random_state=156,
                                remove=('headers','footers','quotes'))
X_train = train_news.data
y_train = train_news.target


In [8]:
print(X_train[0],y_train[0])



What I did NOT get with my drive (CD300i) is the System Install CD you
listed as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120
from Direct Express in Chicago (no complaints at all -- good price & good
service).

BTW, I've heard that the System Install CD can be used to boot the mac;
however, my drive will NOT accept a CD caddy is the machine is off.  How can
you boot with it then?

--Dave
 4


In [9]:
test_news = fetch_20newsgroups(subset='test',random_state=156,
                                remove=('headers','footers','quotes'))
X_test = test_news.data
y_test = test_news.target

In [10]:
len(X_train),len(X_test)

(11314, 7532)

## 피쳐 벡터화 변환과 머신러닝 모델 학습/예측/평가
- Case 1, CountVectorizer

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit(X_train)
X_train_count = count_vect.transform(X_train)
X_test_count = count_vect.transform(X_test)

In [12]:
X_train_count.shape

(11314, 101631)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf = LogisticRegression()
lr_clf.fit(X_train_count,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
pred = lr_clf.predict(X_test_count)
accuracy_score(y_test,pred)

0.6076739245884227

In [15]:
print(y_test[:5])
print(pred[:5])

[ 4 11  1  7  8]
[ 4 11  6  7  8]


- TfidVectorizer

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer()
tfidf_vect.fit(X_train)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [17]:
X_train_tfidf.shape

(11314, 101631)

In [18]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf,y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_test,pred)

0.6736590546999469

In [19]:
print(y_test[:5])
print(pred[:5])

[ 4 11  1  7  8]
[5 1 1 7 8]


### 학습에 사용된 파라미터

In [20]:
print(tfidf_vect.get_params())

{'analyzer': 'word', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}


In [21]:
print(lr_clf.get_params())

{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


- CASE 3. stopword filtering,ngram (1,2) max_df 300

In [22]:
tfidf_vect = TfidfVectorizer(stop_words='english',ngram_range=(1,2),max_df=300)
tfidf_vect.fit(X_train)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [23]:
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf,y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_test,pred)

0.6922464152947424

- Case 4, Case 3에서 LogisticRegression C값을 10으로

In [24]:
lr_clf = LogisticRegression(C=10)
lr_clf.fit(X_train_tfidf,y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_test,pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7010090281465746

### Pipline 과 GridSearchCV 를 통한 파라미터 튜닝

In [27]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
                     ('tfidf_vect',TfidfVectorizer(stop_words='english')),
                     ('lr_clf', LogisticRegression())
])

In [28]:
params = {
    'tfidf_vect__ngram_range':[(1,1),(1,2)],
    'tfidf_vect__max_df':[300,700],
    'lr_clf__C':[1,10]
}

In [29]:
from sklearn.model_selection import GridSearchCV

grid_pipe = GridSearchCV(pipeline, param_grid=params,cv=3,
                         scoring='accuracy',verbose=1,n_jobs=-1)
grid_pipe.fit(X_train,y_train)
print(grid_pipe.best_params_,grid_pipe.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 39.9min finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


{'lr_clf__C': 10, 'tfidf_vect__max_df': 300, 'tfidf_vect__ngram_range': (1, 2)} 0.7536687914006531


In [30]:
pred = grid_pipe.predict(X_test)
accuracy_score(y_test,pred)

0.7010090281465746