### Latent Semantic Analysis

In [2]:
# 20개의 토픽 중 선택하고자 하는 토픽을 리스트로 생성
from sklearn.datasets import fetch_20newsgroups
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']

newsgroups_train = fetch_20newsgroups(subset='train',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

newsgroups_test = fetch_20newsgroups(subset='test',
                                      remove=('headers', 'footers', 'quotes'),
                                      categories=categories)

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.corpus import stopwords 
cachedStopWords = stopwords.words("english")

from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer

X_train = newsgroups_train.data 
y_train = newsgroups_train.target 

X_test = newsgroups_test.data 
y_test = newsgroups_test.target 

RegTok = RegexpTokenizer("[\w']{3,}")
english_stops = set(stopwords.words('english'))

def tokenizer(text):
    tokens = RegTok.tokenize(text.lower())
    words = [word for word in tokens if (word not in english_stops) and len(word) > 2 ]
    features = (list(map(lambda token: PorterStemmer().stem(token), words)))
    return features

tfidf = TfidfVectorizer(tokenizer = tokenizer)
x_train_tfidf = tfidf.fit_transform(X_train)
x_test_tfidf = tfidf.transform(X_test)

In [3]:
# TruncatedSVD -> LSA
# 절단된 SVD에서는 tfidf를 toarray()로 변환해주지 않더라도 인식한다. 
from sklearn.decomposition import TruncatedSVD 
svd = TruncatedSVD(n_components=2000, random_state=7) # latent_dim 
x_train_lsa = svd.fit_transform(x_train_tfidf)
x_test_lsa = svd.transform(x_test_tfidf)

In [4]:
print('LSA Converted X shape:', x_train_lsa.shape)
print(f'Sum of explained variance ratio: {svd.explained_variance_ratio_.sum():.3f}')

from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression()

lr_clf.fit(x_train_lsa, y_train)
print(f'#Train set score: {lr_clf.score(x_train_lsa, y_train):.3f}')
print(f'#Test set score: {lr_clf.score(x_test_lsa, y_test):.3f}')

LSA Converted X shape: (2034, 2000)
Sum of explained variance ratio: 1.000
#Train set score: 0.962
#Test set score: 0.761


In [5]:
svd = TruncatedSVD(n_components=100, random_state=7)

x_train_lsa = svd.fit_transform(x_train_tfidf)
x_test_lsa = svd.transform(x_test_tfidf)

lr_clf.fit(x_train_lsa, y_train)
print(f'Sum of explained variance ratio: {svd.explained_variance_ratio_.sum():.3f}')
print(f'#Train set score: {lr_clf.score(x_train_lsa, y_train):.3f}')
print(f'#Test set score: {lr_clf.score(x_test_lsa, y_test):.3f}')

Sum of explained variance ratio: 0.209
#Train set score: 0.810
#Test set score: 0.735


### LSA를 이용한 의미 기반의 문서 간 유사도 계산

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

print('#사용된 전체 카테고리:', newsgroups_train.target_names)
print('#첫 문서의 카테고리:', y_train[0])

# 변환된 count vector와 기존 값들과의 similarity 계산 
sim_result = cosine_similarity([x_train_lsa[0]], x_train_lsa) # 0번째 문서와 각 문서를 비교함.

print('#Top 20 유사도(lsa):\n', sorted(sim_result[0].round(2), reverse=True)[:20])
sim_index = (-sim_result[0]).argsort()[:20]
print('#Top 20 유사 뉴스의 인덱스(lsa):\n', sim_index)

sim_labels = [y_train[i] for i in sim_index]
print("#Top 20 유사 뉴스의 카테고리(lsa):\n", sim_labels)

#사용된 전체 카테고리: ['alt.atheism', 'comp.graphics', 'sci.space', 'talk.religion.misc']
#첫 문서의 카테고리: 1
#Top 20 유사도(lsa):
 [1.0, 0.75, 0.74, 0.72, 0.71, 0.7, 0.69, 0.69, 0.67, 0.67, 0.66, 0.66, 0.65, 0.65, 0.65, 0.6, 0.59, 0.58, 0.58, 0.57]
#Top 20 유사 뉴스의 인덱스(lsa):
 [   0 1957  501 1674 1995  790 1209  998 1892 1490 1728 1902 1038 1575
  892 1290 1826 1029 1089  651]
#Top 20 유사 뉴스의 카테고리(lsa):
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [7]:
sim_result = cosine_similarity(x_train_tfidf[0], x_train_tfidf)

print('#Top 20 유사도(lsa):\n', sorted(sim_result[0].round(2), reverse=True)[:20])
sim_index = (-sim_result[0]).argsort()[:20]
print('#Top 20 유사 뉴스의 인덱스(lsa):\n', sim_index)

sim_labels = [y_train[i] for i in sim_index]
print("#Top 20 유사 뉴스의 카테고리(lsa):\n", sim_labels)

#Top 20 유사도(lsa):
 [1.0, 0.3, 0.22, 0.21, 0.19, 0.19, 0.19, 0.17, 0.16, 0.16, 0.16, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.15, 0.14]
#Top 20 유사 뉴스의 인덱스(lsa):
 [   0 1575 1892 1490  501 1290 1013  998 1636 1705 1995 1957 1664  651
 1038  429 1089 1209 1728 1803]
#Top 20 유사 뉴스의 카테고리(lsa):
 [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [20]:
svd = TruncatedSVD(n_components=10, random_state=1)
x_train_lsa = svd.fit_transform(x_train_tfidf)
x_test_lsa = svd.transform(x_test_tfidf)

print('LSA Converted X shape:', x_train_lsa.shape)
print(f'Sum of explained variance ratio: {svd.explained_variance_ratio_.sum():.3f}')

# 각 index에 위치하는 이름을 반환해준다. 
# e.g. 1:would, 2:one ... dictionary
terms = tfidf.get_feature_names() # 기존의 책에서는 get_feature_names_out 이지만 version차이가 있는듯.. 
def get_topics(model, feature_names, n=10):
    for idx, topic in enumerate(model.components_): # components: 요소
        print("Topic %d:" % (idx+1), [feature_names[i] for i in topic.argsort()[:-n-1:-1]])
get_topics(svd, terms)

LSA Converted X shape: (2034, 10)
Sum of explained variance ratio: 0.045
Topic 1: ['would', 'one', 'god', 'think', 'use', 'peopl', 'know', 'like', 'say', 'space']
Topic 2: ['file', 'imag', 'thank', 'program', 'graphic', 'space', 'format', 'use', 'color', 'ftp']
Topic 3: ['space', 'orbit', 'nasa', 'launch', 'shuttl', 'satellit', 'year', 'moon', 'lunar', 'cost']
Topic 4: ['moral', 'object', 'system', 'valu', 'goal', 'think', 'anim', 'absolut', 'natur', 'defin']
Topic 5: ['ico', 'bobb', 'tek', 'beauchain', 'bronx', 'manhattan', 'sank', 'queen', 'vice', 'blew']
Topic 6: ['god', 'file', 'imag', 'object', 'moral', 'exist', 'space', 'format', 'system', 'color']
Topic 7: ['file', 'islam', 'imag', 'cview', 'use', 'format', 'color', 'muslim', 'religion', 'peopl']
Topic 8: ['post', 'file', 'space', 'islam', 'read', 'cview', 'format', 'articl', 'group', 'moral']
Topic 9: ['christian', 'graphic', 'imag', 'jesu', 'book', 'data', 'group', 'softwar', 'law', 'code']
Topic 10: ['exist', 'atheism', 'athe

In [64]:
import numpy as np
tfidf = TfidfVectorizer(max_features=1000, min_df=5, max_df=0.5)
x_train_tfidf = tfidf.fit_transform(X_train)

svd = TruncatedSVD(n_components=100, random_state=1)
x_train_lsa = svd.fit_transform(x_train_tfidf)

print('#components_의 shape:', svd.components_.shape)
print('#singular_values_의 shape:', svd.singular_values_.shape)

# word의 singular_values를 보고 싶으면?
t_words = np.diag(svd.singular_values_).dot(svd.components_).T
print('#변환된 단어-잠재의미 행렬의 shape:', t_words.shape)

# t_words에서 space에 해당하는 벡터를 가져옴
source = t_words[tfidf.get_feature_names().index('space')]

# 변환된 count vector와 기존 값들과의 similarity 계산
sim_result = cosine_similarity([source], t_words)

print('#Top 20 유사도(tfidf):', sorted(sim_result[0].round(2), reverse=True)[:20])
sim_index = (-sim_result[0]).argsort()[:20]
print('#Top 20 유사 뉴스의 인덱스(tfidf):', sim_index)
sim_labels = [tfidf.get_feature_names()[i] for i in sim_index]
print('#Top 20 유사 뉴스의 카테고리(tfidf):', sim_labels)


#components_의 shape: (100, 1000)
#singular_values_의 shape: (100,)
#변환된 단어-잠재의미 행렬의 shape: (1000, 100)
#Top 20 유사도(tfidf): [1.0, 0.73, 0.72, 0.69, 0.66, 0.58, 0.56, 0.56, 0.54, 0.54, 0.52, 0.52, 0.52, 0.51, 0.51, 0.5, 0.5, 0.49, 0.48, 0.47]
#Top 20 유사 뉴스의 인덱스(tfidf): [812 314 754 829 594 679 720 650 785 565 101 435 606 545 854 746 669 856
 611 564]
#Top 20 유사 뉴스의 카테고리(tfidf): ['space', 'exploration', 'sci', 'station', 'office', 'propulsion', 'reports', 'planetary', 'shuttle', 'national', 'astro', 'international', 'operations', 'missions', 'technical', 'satellites', 'probes', 'telescope', 'orbiter', 'nasa']


In [63]:
np.where(tfidf.get_feature_names().index('space'))


(array([0], dtype=int64),)