### Latent Semantic Analysis

In [7]:
# Pick the subset of the 20 newsgroup topics to work with.
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

# Fetch the train and test splits with identical settings: both ask the loader
# to remove headers, footers and quotes and are limited to `categories`.
newsgroups_train, newsgroups_test = (
    fetch_20newsgroups(subset=split,
                       remove=('headers', 'footers', 'quotes'),
                       categories=categories)
    for split in ('train', 'test')
)

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

# English stop-word list, loaded once and reused below.
cachedStopWords = stopwords.words("english")

# Raw documents and their integer category labels for each split.
X_train = newsgroups_train.data
y_train = newsgroups_train.target

X_test = newsgroups_test.data
y_test = newsgroups_test.target

# Raw string: "\w" in a plain string literal is an invalid escape sequence
# (DeprecationWarning, SyntaxWarning on Python 3.12+). The pattern itself is
# unchanged: runs of 3+ word characters or apostrophes.
RegTok = RegexpTokenizer(r"[\w']{3,}")
# Set for O(1) membership tests; built from the list already loaded above
# instead of reading the stop-word corpus a second time.
english_stops = set(cachedStopWords)

def tokenizer(text):
    """Tokenize ``text`` into a list of stemmed, stop-word-free tokens.

    The text is lower-cased, split with the module-level ``RegTok`` tokenizer,
    filtered against ``english_stops`` (tokens shorter than three characters
    are dropped as well), and each remaining token is run through a Porter
    stemmer.

    Parameters
    ----------
    text : str
        Document to tokenize.

    Returns
    -------
    list
        Stemmed tokens, in their original order.
    """
    tokens = RegTok.tokenize(text.lower())
    # Build the stemmer once per call rather than once per token.
    stem = PorterStemmer().stem
    return [stem(word) for word in tokens
            if word not in english_stops and len(word) > 2]

# A custom tokenizer replaces the vectorizer's regex-based tokenization, so the
# default token_pattern is never used. Passing token_pattern=None makes that
# explicit and silences scikit-learn's "token_pattern will not be used"
# UserWarning.
tfidf = TfidfVectorizer(tokenizer=tokenizer, token_pattern=None)
# Fit the vocabulary/IDF weights on the training documents only, then apply
# the same fitted transform to the test documents.
x_train_tfidf = tfidf.fit_transform(X_train)
x_test_tfidf = tfidf.transform(X_test)

In [8]:
# LSA is performed here with TruncatedSVD.
# Truncated SVD accepts the tf-idf matrix as-is; it does not need to be
# converted with toarray() first.
from sklearn.decomposition import TruncatedSVD 
svd = TruncatedSVD(n_components=2000, random_state=7) # n_components is the latent dimension
# Fit the decomposition on the training matrix, then project the test matrix
# with the same fitted components.
x_train_lsa = svd.fit_transform(x_train_tfidf)
x_test_lsa = svd.transform(x_test_tfidf)

In [None]:
from sklearn.linear_model import LogisticRegression

# Report the shape of the LSA-projected training matrix and the summed
# explained-variance ratio of the fitted SVD.
print('LSA Converted X shape:', x_train_lsa.shape)
print(f'Sum of explained variance ratio: {svd.explained_variance_ratio_.sum():.3f}')

# Train a default logistic regression on the LSA features (fit returns the
# estimator itself), then score it on both splits.
lr_clf = LogisticRegression().fit(x_train_lsa, y_train)
print(f'#Train set score: {lr_clf.score(x_train_lsa, y_train):.3f}')
print(f'#Test set score: {lr_clf.score(x_test_lsa, y_test):.3f}')

LSA Converted X shape: (2034, 100)
Sum of explained variance ratio: 0.2086101619797387
#Train set score: 0.810
#Test set score: 0.735


In [None]:
# Repeat the LSA projection with only 100 components. This rebinds `svd`,
# `x_train_lsa` and `x_test_lsa`, so later cells see the 100-dimensional
# features.
svd = TruncatedSVD(n_components=100, random_state=7)

x_train_lsa = svd.fit_transform(x_train_tfidf)
x_test_lsa = svd.transform(x_test_tfidf)

# Refit the existing `lr_clf` estimator on the new features, then report the
# summed explained-variance ratio and the train/test scores.
lr_clf.fit(x_train_lsa, y_train)
print(f'Sum of explained variance ratio: {svd.explained_variance_ratio_.sum():.3f}')
print(f'#Train set score: {lr_clf.score(x_train_lsa, y_train):.3f}')
print(f'#Test set score: {lr_clf.score(x_test_lsa, y_test):.3f}')

NameError: name 'TruncatedSVD' is not defined

### LSA를 이용한 의미 기반의 문서 간 유사도 계산

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

print('#사용된 전체 카테고리:', newsgroups_train.target_names)
print('#첫 문서의 카테고리:', y_train[0])

# Cosine similarity between the first document's LSA vector and every training
# document's LSA vector. The second argument is required: when only X is
# given, cosine_similarity compares X with itself, which here would yield just
# the 1x1 self-similarity of the first document.
sim_result = cosine_similarity([x_train_lsa[0]], x_train_lsa)

In [None]:
# Display the LSA vector of the first training document.
x_train_lsa[0]

NameError: name 'x_train_lsa' is not defined