# 20 뉴스 그룹 분류

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# scikit-learn naming convention: load_* reads a dataset bundled with the
# package, fetch_* downloads the dataset from the internet.
from sklearn.datasets import fetch_20newsgroups
# subset="all" requests the train and test splits together.
news_data = fetch_20newsgroups(subset="all", random_state=156)

데이터 탐색

In [None]:
# List the fields available on the returned dataset object.
news_data.keys()

In [None]:
# Count how many documents belong to each target class, ordered by class id.
print('target 클래스의 값과 분포도')
print(pd.Series(news_data.target).value_counts().sort_index())

In [None]:
# Show the human-readable names of the target classes.
print('target 클래스의 이름들 \n', news_data.target_names)

In [None]:
# Print the first document; news_data was fetched without the `remove`
# option, so nothing has been stripped from the text.
print(news_data.data[0])

훈련/테스트용 데이터 추출

In [None]:
# Fetch the training split, dropping headers, footers and quoted replies so
# that only the message body is left.
train_news = fetch_20newsgroups(
    subset="train",
    random_state=156,
    remove=('headers', 'footers', 'quotes'),
)
# Raw documents and their integer class labels.
X_train, y_train = train_news.data, train_news.target

In [None]:
# Print the first training document (fetched with headers, footers and quotes removed).
print(train_news.data[0])

In [None]:
# Print the class label of the first training document.
print(train_news.target[0])


In [None]:
# Fetch the test split with the same `remove` settings as the training split.
test_news = fetch_20newsgroups(
    subset="test",
    random_state=156,
    remove=('headers', 'footers', 'quotes'),
)
# Raw documents and their integer class labels.
X_test, y_test = test_news.data, test_news.target

In [None]:
# Number of documents in the training and test splits.
print(len(X_train), len(X_test))

학습 데이터를 숫자(피처 벡터)로 변환한 뒤 DecisionTree(Dt), SVC 등의 모델을 학습시키고 테스트한다. (이 노트북에서는 LogisticRegression을 사용한다.)
### 피처 벡터화 변환과 머신러닝 모델 학습/예측/평가
1. CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cvect = CountVectorizer()
# The training and test sets must be transformed by the SAME fitted vectorizer.
# Fit on the training data only, then reuse that fitted vectorizer for the test
# data; never call fit / fit_transform on the test set, because that would
# learn a different vocabulary and produce incompatible feature columns.
cvect.fit(X_train)
X_train_count = cvect.transform(X_train)
X_test_count = cvect.transform(X_test)

In [None]:
# (number of documents, vocabulary size); the author observed
# 11314 documents and 101631 distinct terms.
X_train_count.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Train a logistic regression classifier with default hyperparameters on the
# count-vectorized training data.
lr_clf = LogisticRegression()
lr_clf.fit(X_train_count, y_train)

In [None]:
# The model was trained on count vectors, so prediction must use the
# vectorized test matrix (X_test_count), not the raw text in X_test.
pred = lr_clf.predict(X_test_count)
accuracy_score(y_test, pred)

In [None]:
# Compare the first five true labels with the first five predictions.
print(y_test[:5])
print(pred[:5])

2. TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the training documents only (fit returns the
# vectorizer itself), then apply that same fitted vectorizer to both splits.
tfidf_vect = TfidfVectorizer().fit(X_train)
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [None]:
# (number of documents, number of TF-IDF features).
X_train_tfidf.shape

In [None]:
# Train a default logistic regression on the TF-IDF features and report
# accuracy on the test split.
lr_clf = LogisticRegression().fit(X_train_tfidf, y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_test, pred)

### 파라미터 지정하기. 최적 파라미터를 찾으려면 GridSearchCV를 사용해야 함(시간이 매우 오래 걸림)
3. stop_words filtering, ngram (1,2), max_df = 300

In [None]:
# TF-IDF with English stop-word filtering, unigrams + bigrams, and terms that
# appear in more than 300 documents dropped (integer max_df is a document count).
tfidf_vect = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=300,
).fit(X_train)
# Reuse the vectorizer fitted on the training data for both splits.
X_train_tfidf = tfidf_vect.transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [None]:
# Retrain the default logistic regression on the re-vectorized TF-IDF features
# and report test accuracy.
lr_clf = LogisticRegression().fit(X_train_tfidf, y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_test, pred)

4. 3번에서 LogisticRegression의 C 값을 10으로 변경

In [None]:
# Same features as before, but with C=10 (C is the inverse of the
# regularization strength, so a larger C means weaker regularization).
lr_clf = LogisticRegression(C=10).fit(X_train_tfidf, y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_test, pred)

### Pipeline과 GridSearchCV를 통한 하이퍼파라미터 튜닝

In [None]:
from sklearn.pipeline import Pipeline

# Chain vectorization and classification into one estimator; the step names
# ('tfidf_vect', 'lr_clf') are the prefixes used to address each step's
# hyperparameters.
pipeline = Pipeline(
    steps=[
        ('tfidf_vect', TfidfVectorizer(stop_words='english')),
        ('lr_clf', LogisticRegression()),
    ]
)

In [None]:
# Hyperparameter grid for the pipeline. Each key is "<step name>__<parameter>":
# the double underscore links a pipeline step to one of its own parameters.
params = dict(
    tfidf_vect__ngram_range=[(1, 1), (1, 2)],
    tfidf_vect__max_df=[300, 700],
    lr_clf__C=[1, 10],
)

In [None]:
from sklearn.model_selection import GridSearchCV

# Search every combination in `params` with 3-fold cross-validation, scored by
# accuracy, using all available CPU cores (n_jobs=-1).
grid_pipe = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=1,
)
# The pipeline vectorizes internally, so it is fitted on the raw training text.
grid_pipe.fit(X_train, y_train)

In [None]:
# Display the grid-search object.
grid_pipe

In [None]:
# Best hyperparameter combination found and its mean cross-validated score.
print(grid_pipe.best_params_, grid_pipe.best_score_)

In [None]:
# Persist the grid-search object so it can be reloaded later without
# repeating the (slow) search.
# NOTE: the file is pickle-based; only load it back from a trusted source.
import os
import joblib

# joblib.dump does not create missing parent directories, so make sure the
# target folder exists first (no error if it is already there).
os.makedirs('./model', exist_ok=True)
joblib.dump(grid_pipe, './model/news_vect.pkl')