In [2]:
from sklearn.datasets import fetch_20newsgroups

In [37]:
# 20개의 토픽 중 선택하고자 하는 토픽을 리스트로 생성
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

newsgroups_test = fetch_20newsgroups(subset='test',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords 
cachedStopWords = stopwords.words("english")

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

X_train = newsgroups_train.data 
y_train = newsgroups_train.target 

X_test = newsgroups_test.data 
y_test = newsgroups_test.target 

RegTok = RegexpTokenizer("[\w']{3,}")
english_stops = set(stopwords.words('english'))

def tokenizer(text):
    tokens = RegTok.tokenize(text.lower())
    words = [word for word in tokens if (word not in english_stops) and len(word) > 2 ]
    features = (list(map(lambda token: PorterStemmer().stem(token), words)))
    return features

tfidf = TfidfVectorizer(tokenizer = tokenizer)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [38]:
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()
lr_clf.fit(X_train_tfidf, y_train)

print('#Train set score: {:.3f}'.format(lr_clf.score(X_train_tfidf, y_train)))
print('#Test set score: {:.3f}'.format(lr_clf.score(X_test_tfidf, y_test)))
# x_train의 경우 fit_transform
# x_test의 경우 transform

#Train set score: 0.962
#Test set score: 0.761


### PCA

In [None]:
type(X_train_tfidf)


In [43]:
from sklearn.decomposition import PCA 

# 차원을 2000차원으로 축소
pca = PCA(n_components=2000, random_state = 8)

# tf-idf
x_train_pca = pca.fit_transform(X_train_tfidf.toarray())
x_test_pca = pca.transform(X_test_tfidf.toarray())

print('Original tfidf matrix shape: {}'.format(X_train_tfidf.shape))
print('PCA Converted matrix shape: {}'.format(x_train_pca.shape))

Original tfidf matrix shape: (2034, 20085)
PCA Converted matrix shape: (2034, 2000)


In [45]:
# 차원이 축소되었음에도 불구하고 100% 설명이 가능하다. 
pca.explained_variance_ratio_.sum()

1.0000000000000002

In [49]:
import numpy as np 
lasso_clf = LogisticRegression(penalty='l1', solver='liblinear', C=1)
lasso_clf.fit(x_train_pca, y_train)

print(lasso_clf.score(x_train_pca, y_train))
print(lasso_clf.score(x_test_pca, y_test))

print('#Used features count: {}'.format(np.sum(lasso_clf.coef_ != 0)), 'out of', X_train_tfidf.shape[1])

0.8294001966568338
0.7420546932742055
#Used features count: 186 out of 20085


In [55]:
pca = PCA(n_components=np.sum(lasso_clf.coef_!=0), random_state=7)

x_train_pca = pca.fit_transform(X_train_tfidf.toarray())
x_test_pca = pca.transform(X_test_tfidf.toarray())

print('PCA Converted X shape: ', x_train_pca.shape)
print('Sum of explained variance ratio: {:.3f}'.format(pca.explained_variance_ratio_.sum()))

lr_clf.fit(x_train_pca, y_train)
print('#Train set score: {:.3f}'.format(lr_clf.score(x_train_pca, y_train)))
print('#Test set score: {:.3f}'.format(lr_clf.score(x_test_pca, y_test)))

PCA Converted X shape:  (2034, 186)
Sum of explained variance ratio: 0.316
#Train set score: 0.850
#Test set score: 0.743


In [56]:
pca = PCA(n_components=100, random_state=7)

x_train_pca = pca.fit_transform(X_train_tfidf.toarray())
x_test_pca = pca.transform(X_test_tfidf.toarray())

print('PCA Converted X shape: ', x_train_pca.shape)
print('Sum of explained variance ratio: {:.3f}'.format(pca.explained_variance_ratio_.sum()))

lr_clf.fit(x_train_pca, y_train)
print('#Train set score: {:.3f}'.format(lr_clf.score(x_train_pca, y_train)))
print('#Test set score: {:.3f}'.format(lr_clf.score(x_test_pca, y_test)))

PCA Converted X shape:  (2034, 100)
Sum of explained variance ratio: 0.211
#Train set score: 0.807
#Test set score: 0.738
