In [2]:
import numpy as np
import pandas as pd

In [3]:
from sklearn.datasets import fetch_20newsgroups
# Load the complete 20 Newsgroups corpus (train + test). No `remove` argument is
# passed, so the posts are loaded unstripped. In the visible cells this object is
# only used to look at the category names.
news_data = fetch_20newsgroups(subset='all', random_state=156)

In [27]:
# Show the first five category names of the full corpus
news_data.target_names[:5]

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware']

트레인 셋

In [28]:
# Fetch the training split with headers, footers and quoted replies stripped,
# then peek at the first five category names.
train_news = fetch_20newsgroups(
    subset='train',
    random_state=156,
    remove=('headers', 'footers', 'quotes'),
)
train_news.target_names[:5]

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware']

In [6]:
# Assemble the training frame in one step: raw post text plus its integer class label
df_train = pd.DataFrame({'data': train_news.data, 'target': train_news.target})
df_train

Unnamed: 0,data,target
0,\n\nWhat I did NOT get with my drive (CD300i) ...,4
1,"\n\t""And in that day you will ask Me no questi...",15
2,\nI have to disagree with you on this one. It...,10
3,.\n.\n\nTell him he probably needs to upgrade ...,2
4,->\tFirst I want to start right out and say th...,0
...,...,...
11309,"G'day All,\n\nI was looking to build a Paralle...",3
11310,"ites:\n Yeah, and the cop couldn't catch me.....",7
11311,While rummaging through a box of old PC (5150)...,3
11312,"\nIt's always possible, but if this is the cas...",9


In [9]:
# There are no NaN values, but some posts are empty strings; count those rows per column
df_train.loc[df_train['data'].eq('')].count()

data      218
target    218
dtype: int64

In [10]:
# Keep only the rows whose text is non-empty, then confirm that no empty rows remain
df_train = df_train.loc[df_train['data'].ne('')]
df_train.loc[df_train['data'].eq('')].count()

data      0
target    0
dtype: int64

In [15]:
# Split the frame into features (text) and labels; both are 1-D Series
X_train, y_train = df_train['data'], df_train['target']
print(X_train.shape, y_train.shape)
X_train

(11096,) (11096,)


0        \n\nWhat I did NOT get with my drive (CD300i) ...
1        \n\t"And in that day you will ask Me no questi...
2        \nI have to disagree with you on this one.  It...
3        .\n.\n\nTell him he probably needs to upgrade ...
4        ->\tFirst I want to start right out and say th...
                               ...                        
11309    G'day All,\n\nI was looking to build a Paralle...
11310     ites:\n Yeah, and the cop couldn't catch me.....
11311    While rummaging through a box of old PC (5150)...
11312    \nIt's always possible, but if this is the cas...
11313    Hi all,\n\nI've been locked in a small closet ...
Name: data, Length: 11096, dtype: object

In [13]:
# Show what a single training document looks like.
# Use positional .iloc rather than the label lookup [0]: the index keeps its original
# labels after the empty rows were dropped, so [0] raises KeyError whenever the row
# labelled 0 happened to be one of the removed rows.
print(df_train.data.iloc[0])



What I did NOT get with my drive (CD300i) is the System Install CD you
listed as #1.  Any ideas about how I can get one?  I bought my IIvx 8/120
from Direct Express in Chicago (no complaints at all -- good price & good
service).

BTW, I've heard that the System Install CD can be used to boot the mac;
however, my drive will NOT accept a CD caddy is the machine is off.  How can
you boot with it then?

--Dave



In [17]:
# Show the class label of that same first training document.
# Positional .iloc is used because the index has gaps after the empty rows were
# dropped, which makes a label lookup with [0] fragile (KeyError if label 0 is gone).
print(df_train.target.iloc[0])

4


In [18]:
# Persist the cleaned training set. With to_csv's default index=True the row index is
# written as the first (unnamed) CSV column; its labels are non-contiguous because the
# empty rows were dropped without resetting the index.
df_train.to_csv('../static/data/advanced/new20_train.csv')

테스트 셋

In [19]:
# Fetch the test split, stripped of headers, footers and quotes like the training split
test_news = fetch_20newsgroups(subset='test', random_state=156, remove=('headers', 'footers', 'quotes'))

In [25]:
# The test split should expose the same category names as the training split;
# display the names from test_news itself so the claim is actually checked.
test_news.target_names[:5]

['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']


In [21]:
# Build the test frame the same way as the training frame and discard empty documents
df_test = pd.DataFrame({'data': test_news.data, 'target': test_news.target})
df_test = df_test.loc[df_test['data'].ne('')]

In [22]:
# Test features (text) and test labels
X_test, y_test = df_test['data'], df_test['target']
print(X_test.shape, y_test.shape)

(7370,) (7370,)


In [23]:
# Persist the cleaned test set. The row index is written as the first CSV column
# (to_csv default); its labels have gaps where empty rows were dropped.
df_test.to_csv('../static/data/advanced/new20_test.csv')

### 피처 벡터화 변환(문자 ==> 숫자)과 머신러닝 모델(분류기) 학습/예측/평가
1. CountVectorizer

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training text only and encode it in the same step,
# then encode the test text with that same vocabulary.
count_vect = CountVectorizer()
X_train_count = count_vect.fit_transform(X_train)
X_test_count = count_vect.transform(X_test)

In [30]:
# Shape of the training count matrix: (number of documents, vocabulary size)
X_train_count.shape

(11096, 101631)

In [31]:
# CountVectorizer features -> logistic regression (default hyper-parameters)
from sklearn.linear_model import LogisticRegression

lr_clf = LogisticRegression().fit(X_train_count, y_train)
lr_clf

LogisticRegression()

In [32]:
from sklearn.metrics import accuracy_score

# Test-set accuracy of the count-based logistic regression
pred = lr_clf.predict(X_test_count)
accuracy_score(y_true=y_test, y_pred=pred)

0.6232021709633649

2. TfidfVectorizer

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit the TF-IDF vocabulary and weights on the training text and encode it in one call,
# then encode the test text with the fitted vectorizer.
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [34]:
# Shape of the training TF-IDF matrix: (number of documents, vocabulary size)
X_train_tfidf.shape

(11096, 101631)

In [35]:
# TF-IDF features -> logistic regression (default hyper-parameters)
lr_clf = LogisticRegression().fit(X_train_tfidf, y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_true=y_test, y_pred=pred)

0.6872455902306649

In [38]:
# TF-IDF features -> support vector classifier (default hyper-parameters)
from sklearn.svm import SVC
svc = SVC().fit(X_train_tfidf, y_train)
pred = svc.predict(X_test_tfidf)
accuracy_score(y_true=y_test, y_pred=pred)

0.6704206241519675

Stopwords 추가

In [39]:
# CountVectorizer with English stop words removed, unigrams + bigrams, and terms that
# occur in more than 300 documents discarded (an integer max_df is an absolute count).
count_vect = CountVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)
X_train_count = count_vect.fit_transform(X_train)
X_test_count = count_vect.transform(X_test)

In [40]:
# Logistic regression with C=10 on the stop-word-filtered count features
lr_clf = LogisticRegression(C=10).fit(X_train_count, y_train)
pred = lr_clf.predict(X_test_count)
accuracy_score(y_true=y_test, y_pred=pred)

0.6443690637720488

In [41]:
# TfidfVectorizer with English stop words removed, unigrams + bigrams, and terms that
# occur in more than 300 documents discarded.
tfidf_vect = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=300)
X_train_tfidf = tfidf_vect.fit_transform(X_train)
X_test_tfidf = tfidf_vect.transform(X_test)

In [45]:
# Logistic regression with C=10 on the stop-word-filtered TF-IDF features
lr_clf = LogisticRegression(C=10).fit(X_train_tfidf, y_train)
pred = lr_clf.predict(X_test_tfidf)
accuracy_score(y_true=y_test, y_pred=pred)

0.7153324287652646

In [46]:
# Stop-word-filtered TF-IDF features -> SVC (default hyper-parameters).
# Instantiate a fresh classifier here, as the neighbouring logistic-regression cells do,
# so this cell does not depend on an `svc` object left in the kernel by an earlier cell.
svc = SVC()
svc.fit(X_train_tfidf, y_train)
pred = svc.predict(X_test_tfidf)
accuracy_score(y_test, pred)

0.6902306648575305

### Pipeline과 GridSearchCV로 하이퍼 파라미터 추가하여 튜닝 <br>
### 이후 모델 만들기
1. Count Vectorizer - 로지스틱 회귀

In [47]:
from sklearn.pipeline import Pipeline

# Chain vectorisation and classification into one estimator that takes raw text;
# the step names ('count_vect', 'lr_clf') are the prefixes used in the parameter grid.
pipeline = Pipeline([
    ('count_vect', CountVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())
])

In [48]:
# Search space for the pipeline; keys use the '<step name>__<parameter>' form
params = {
    'count_vect__max_df': [300, 700],
    'lr_clf__C': [1, 10]
}

In [49]:
from sklearn.model_selection import GridSearchCV

# 3-fold cross-validated grid search over `params`, scored by accuracy, fitted on the
# raw training text (vectorisation happens inside the pipeline).
grid_pipe = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_pipe.fit(X_train, y_train)
# Best parameter combination and its mean cross-validated accuracy
print(grid_pipe.best_params_, grid_pipe.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 51.9min finished
{'count_vect__max_df': 300, 'lr_clf__C': 1} 0.7059305880934986


In [50]:
# Test-set predictions of the tuned CountVectorizer + logistic-regression search object
pred1 = grid_pipe.predict(X_test)

In [51]:
# Test-set accuracy of those predictions
accuracy_score(y_test, pred1)

0.6622795115332428

In [52]:
import joblib
# Serialise the whole fitted GridSearchCV object (not just its best estimator) to disk
joblib.dump(grid_pipe, '../static/model/news20_countlr.pkl')

['../static/model/news20_countlr.pkl']

2. Tfidf Vectorizer - 로지스틱 회귀

In [53]:
# TF-IDF vectorisation chained with logistic regression; the step names
# ('tfidf_vect', 'lr_clf') are the prefixes used in the parameter grid.
pipeline2 = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english', ngram_range=(1,2))),
    ('lr_clf', LogisticRegression())
])

In [54]:
# Search space for pipeline2; keys use the '<step name>__<parameter>' form
params2 = {
    'tfidf_vect__max_df': [300, 700],
    'lr_clf__C': [1, 10]
}

In [55]:
# 3-fold cross-validated grid search over `params2`, scored by accuracy, on raw training text
grid_pipe2 = GridSearchCV(pipeline2, param_grid=params2, cv=3, scoring='accuracy', verbose=1)
grid_pipe2.fit(X_train, y_train)
# Best parameter combination and its mean cross-validated accuracy
print(grid_pipe2.best_params_, grid_pipe2.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  12 out of  12 | elapsed: 40.1min finished
{'lr_clf__C': 10, 'tfidf_vect__max_df': 700} 0.766132057480442


In [56]:
# Test-set predictions and accuracy of the tuned TF-IDF + logistic-regression search object
pred2 = grid_pipe2.predict(X_test)
accuracy_score(y_test, pred2)

0.7150610583446404

In [57]:
# Serialise the whole fitted GridSearchCV object for the TF-IDF + LR pipeline
joblib.dump(grid_pipe2, '../static/model/news20_tfidflr.pkl')

['../static/model/news20_tfidflr.pkl']

3. Tfidf Vectorizer - SVC

In [58]:
# TF-IDF vectorisation chained with an SVC; ngram_range is left unset here because
# it is one of the parameters varied by the grid defined for this pipeline.
pipeline3 = Pipeline([
    ('tfidf_vect', TfidfVectorizer(stop_words='english')),
    ('svc', SVC())
])

In [59]:
# Search space for pipeline3: 2 n-gram ranges x 2 max_df values x 2 C values = 8 candidates
params3 = {
    'tfidf_vect__ngram_range': [(1,1), (1,2)],
    'tfidf_vect__max_df': [300, 700],
    'svc__C': [1, 10]
}

In [60]:
# 3-fold cross-validated grid search over `params3`, scored by accuracy, on raw training text
grid_pipe3 = GridSearchCV(pipeline3, param_grid=params3, cv=3, scoring='accuracy', verbose=1)
grid_pipe3.fit(X_train, y_train)
# Best parameter combination and its mean cross-validated accuracy
print(grid_pipe3.best_params_, grid_pipe3.best_score_)

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 30.8min finished
{'svc__C': 10, 'tfidf_vect__max_df': 300, 'tfidf_vect__ngram_range': (1, 1)} 0.7586518518323572


In [61]:
# Test-set predictions and accuracy of the tuned TF-IDF + SVC search object
pred3 = grid_pipe3.predict(X_test)
accuracy_score(y_test, pred3)

0.6957937584803257

In [62]:
# Serialise the whole fitted GridSearchCV object for the TF-IDF + SVC pipeline
joblib.dump(grid_pipe3, '../static/model/news20_tfidfsvc.pkl')

['../static/model/news20_tfidfsvc.pkl']

Client가 Server로 보낸 index에 해당하는 테스트 데이터와 각 모델의 예측값(ŷ)

In [63]:
# Row position in the test set to look up; hard-coded here
# (presumably stands in for a value sent by the client — confirm against the server code)
index = 100

In [64]:
# Take row `index` by position and keep every column except the last one.
# NOTE(review): with columns ['data', 'target'] this yields the document text, not the
# label, despite the variable name — confirm the intent before renaming.
test_target = df_test.iloc[index, :-1].values

In [65]:
# Show the selected test row next to each tuned model's prediction for that same row.
# pred1/pred2/pred3 were computed over X_test in row order, so position `index` lines up
# with the df_test.iloc[index] lookup. Use pred1 (the tuned CountVectorizer + LR model)
# rather than `pred`, which holds the output of the last untuned model that was run.
print(test_target, pred1[index], pred2[index], pred3[index])

tive" position is that we should not\nsell these computers to the Soviets, because they could  use\nthem  in weapons systems.  The "liberal" position is that we\nshould sell them, in  the  interests  of  mutual  trade  and\ncooperation--and  anyway,  if  we don\'t make the sale, there\nwill certainly be some other nation willing to.\n\n     For my part, I\'m ready to suggest that the  Libertarian\nposition should be to give them to the Soviets for free, and\nif  necessary, make them take them . . . and if that doesn\'t\nwork load up an SR-71  Blackbird  and  air  drop  them  over\nMoscow in the middle of the night.  Paid for by private sub-\nscription, of course, not taxation . . . I confess that this\nis not a position that has gained much support among members\nof  the conventional left-right political spectrum, but, af-\nter all, in the words of one of Illuminatus\'s characters, we\nare political non-Euclideans:   The shortest distance  to  a\nparticular  goal may not look anything 