# 20 뉴스 그룹 모델 생성

In [1]:
import numpy as np 
import pandas as pd
import joblib

In [2]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### train 데이터

In [10]:
from sklearn.datasets import fetch_20newsgroups

In [11]:
train_news = fetch_20newsgroups(subset='train', random_state=156, 
                                remove=('headers', 'footers', 'quotes'))

In [12]:
df_train = pd.DataFrame(train_news.data, columns=['data'])
df_train['target'] = train_news.target
df_train.head(3)

Unnamed: 0,data,target
0,\n\nWhat I did NOT get with my drive (CD300i) ...,4
1,"\n\t""And in that day you will ask Me no questi...",15
2,\nI have to disagree with you on this one. It...,10


In [14]:
df_train.data[1]

'\n\t"And in that day you will ask Me no question.  Truly, truly, I say to \n\tyou, if you shall ask the Father for anything, He will give it to you \n\tin my name.  Until now you have asked for nothing in My name; ask, and \n\tyou will receive, that your joy may be made full."\n\t\t\t\t-John 16:23-24\n\nI don\'t believe that we necessarily have to say " . . . In Christ\'s name.  \nAmen," for our prayers to be heard, but it glorifies the Son, when we \nacknowledge that our prayer is made possible by Him.  I believe that just as \nthose who were saved in the OT, could only be saved because Jesus would one day\nreconcile God to man, He is the only reason their prayers would be heard by \nGod.\n\n\tFor all of us have become like one who is unclean,\n\tAnd all our righteous deeds are like a filthy garment;\n\tAnd all of us wither like a leaf,\n\tand our iniquities, like the wind, take us away.\n\t\t\t\t-Isaiah 64:6, NAS\n\nOur prayers like the rest of our deeds are too unholy to go directl

In [16]:
df_train['data'] = df_train['data'].str.replace('\n|\t', '')
df_train.head(3)

Unnamed: 0,data,target
0,What I did NOT get with my drive (CD300i) is t...,4
1,"""And in that day you will ask Me no question. ...",15
2,I have to disagree with you on this one. It i...,10


- null 값이 있는지 확인

In [17]:
df_train.isna().sum()

data      0
target    0
dtype: int64

- 공백인 값이 있는지 확인 후 제거

In [18]:
df_train[df_train.data == ''].count()

data      289
target    289
dtype: int64

In [19]:
df_train = df_train.drop(df_train[df_train.data == ''].index)
df_train[df_train.data == ''].count()

data      0
target    0
dtype: int64

In [20]:
df_train.shape

(11025, 2)

In [21]:
print(df_train.data[0])
print(df_train.target[0])

What I did NOT get with my drive (CD300i) is the System Install CD youlisted as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120from Direct Express in Chicago (no complaints at all -- good price & goodservice).BTW, I've heard that the System Install CD can be used to boot the mac;however, my drive will NOT accept a CD caddy is the machine is off.  How canyou boot with it then?--Dave
4


In [22]:
X_train = df_train.data
y_train = df_train.target

In [23]:
df_train.to_csv('../static/data/news/train.csv')

### test 데이터

In [24]:
test_news = fetch_20newsgroups(subset='test', random_state=156,
                                remove=('headers', 'footers', 'quotes'))

In [25]:
df_test = pd.DataFrame(test_news.data, columns=['data'])
df_test['target'] = test_news.target
df_test.head(3)

Unnamed: 0,data,target
0,The tech support line for GCC is 1-800-231-1570.,4
1,I recently saw a message here (posted by Bob S...,11
2,I cant get through to the author of rtrace. Hi...,1


In [26]:
df_test['data'] = df_test['data'].str.replace('\n|\t', '')
df_test.head(3)

Unnamed: 0,data,target
0,The tech support line for GCC is 1-800-231-1570.,4
1,I recently saw a message here (posted by Bob S...,11
2,I cant get through to the author of rtrace. Hi...,1


- null 값 또는 공백 값이 있는지 확인

In [27]:
df_test.isna().sum()

data      0
target    0
dtype: int64

In [28]:
df_test[df_test.data == ''].count()

data      212
target    212
dtype: int64

In [29]:
df_test = df_test.drop(df_test[df_test.data == ''].index)
df_test[df_test.data == ''].count()

data      0
target    0
dtype: int64

In [30]:
df_test.shape

(7320, 2)

In [31]:
X_test = df_test.data
y_test = df_test.target

In [32]:
df_test.to_csv('../static/data/news/test.csv')

### (1) CountVectorizer + LogisticRegression

In [24]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer(ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression())
])
%time pipeline.fit(X_train, y_train)
%time pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'Count Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Wall time: 7min 5s
Wall time: 3.88 s
Count Vectorizer + Logistic Regression 정확도: 0.6053


In [25]:
joblib.dump(pipeline, '../static/model/news_count_lr.pkl')

['../static/model/news_count_lr.pkl']

In [26]:
del pipeline

### (2) TfidfVectorizer + LogisticRegression

In [27]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2))),
    ('lr_clf', LogisticRegression())
])
%time pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'Tfidf Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Wall time: 5min 36s
Tfidf Vectorizer + Logistic Regression 정확도: 0.6714


In [28]:
joblib.dump(pipeline, '../static/model/news_tfidf_lr.pkl')

['../static/model/news_tfidf_lr.pkl']

In [29]:
del pipeline

### (3) TfidfVectorizer + SVM

In [30]:
from sklearn.svm import SVC

In [31]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer(ngram_range=(1, 2))),
    ('sv_clf', SVC())
])
%time pipeline.fit(X_train, y_train)
pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, pred)
print(f'Tfidf Vectorizer + Support Vector Machine 정확도: {acc:.4f}')

Wall time: 6min 21s
Tfidf Vectorizer + Support Vector Machine 정확도: 0.6442


In [32]:
joblib.dump(pipeline, '../static/model/news_tfidf_sv.pkl')

['../static/model/news_tfidf_sv.pkl']

In [33]:
del pipeline