# 20 뉴스 그룹 모델 생성

In [13]:
import numpy as np 
import pandas as pd
import joblib

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### 훈련/테스트용 데이터 추출

In [15]:
from sklearn.datasets import fetch_20newsgroups

In [16]:
# Load the 20 Newsgroups train and test splits. Headers, footers and quoted
# replies are removed from every post, and the same random_state is passed
# for both splits.
# NOTE(review): despite the df_ prefix these are accessed via .data/.target
# below, so they are presumably sklearn Bunch objects, not DataFrames.
df_train = fetch_20newsgroups(
    subset='train',
    random_state=156,
    remove=('headers', 'footers', 'quotes'),
)
df_test = fetch_20newsgroups(
    subset='test',
    random_state=156,
    remove=('headers', 'footers', 'quotes'),
)

In [None]:
# Separate the documents (.data) from their labels (.target) for each split.
X_train = df_train.data
y_train = df_train.target
X_test = df_test.data
y_test = df_test.target

In [5]:
# Show the number of documents in the (train, test) splits as a tuple.
tuple(len(docs) for docs in (X_train, X_test))

(11314, 7532)

### (1) CountVectorizer + LogisticRegression

In [6]:
# Model (1): raw term counts over unigrams and bigrams fed into a logistic
# regression classifier, fitted on the training split and scored on the
# test split.
# NOTE(review): LogisticRegression runs with default hyperparameters; if
# fitting emits a ConvergenceWarning, consider raising max_iter — TODO confirm.
pipeline = Pipeline(steps=[
    ('count_vect', CountVectorizer(ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression()),
])
# fit() returns the fitted pipeline itself, so predict() can be chained.
pred = pipeline.fit(X_train, y_train).predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'Count Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Count Vectorizer + Logistic Regression 정확도: 0.5936


In [7]:
# Persist the fitted count-vectorizer + logistic-regression pipeline.
# The target directory must already exist; the path is relative to the
# notebook's working directory.
joblib.dump(value=pipeline, filename='../static/model/news_count_lr.pkl')

['../static/model/news_count_lr.pkl']

### (2) TfidfVectorizer + LogisticRegression

In [8]:
# Model (2): TF-IDF weighted unigrams and bigrams fed into a logistic
# regression classifier, fitted on the training split and scored on the
# test split. Rebinds the `pipeline`, `pred` and `acc` names.
# NOTE(review): LogisticRegression runs with default hyperparameters; if
# fitting emits a ConvergenceWarning, consider raising max_iter — TODO confirm.
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression()),
])
# fit() returns the fitted pipeline itself, so predict() can be chained.
pred = pipeline.fit(X_train, y_train).predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'Tfidf Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Tfidf Vectorizer + Logistic Regression 정확도: 0.6583


In [9]:
# Persist the fitted TF-IDF + logistic-regression pipeline.
# The target directory must already exist; the path is relative to the
# notebook's working directory.
joblib.dump(value=pipeline, filename='../static/model/news_tfidf_lr.pkl')

['../static/model/news_tfidf_lr.pkl']

### (3) TfidfVectorizer + SVM

In [10]:
from sklearn.svm import SVC

In [11]:
# Model (3): TF-IDF weighted unigrams and bigrams fed into a support vector
# classifier, fitted on the training split and scored on the test split.
# Rebinds the `pipeline`, `pred` and `acc` names.
# NOTE(review): SVC runs with default hyperparameters (no kernel or C set).
pipeline = Pipeline(steps=[
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2))),
    ('sv_clf', SVC()),
])
# fit() returns the fitted pipeline itself, so predict() can be chained.
pred = pipeline.fit(X_train, y_train).predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'Tfidf Vectorizer + Support Vector Machine 정확도: {acc:.4f}')

Tfidf Vectorizer + Support Vector Machine 정확도: 0.6312


In [12]:
# Persist the fitted TF-IDF + SVM pipeline.
# The target directory must already exist; the path is relative to the
# notebook's working directory.
joblib.dump(value=pipeline, filename='../static/model/news_tfidf_sv.pkl')

['../static/model/news_tfidf_sv.pkl']