<a href="https://colab.research.google.com/github/abs-git/NLP/blob/main/English_Doc_Embedding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/gdrive (requires the Colab-provided google.colab package).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
# glue data load
# Reads the GLUE train/test text files from the mounted Drive folder into
# lists of strings (the files are assumed to hold one sentence per line).

path = '/content/gdrive/MyDrive/Colab Notebooks/NLP'


def load_sentences(file_path):
  """Read a text file and return its lines without the trailing newline.

  Args:
    file_path: Path of the text file to read.

  Returns:
    A list of str, one entry per line of the file, in file order.

  Raises:
    OSError: If the file cannot be opened.
  """
  # Explicit encoding so decoding does not depend on the platform's default locale.
  with open(file_path, 'r', encoding='utf-8') as f:
    return [line.rstrip("\n") for line in f]


train_sentences = load_sentences(path + '/glue_train.txt')
test_sentences = load_sentences(path + '/glue_test.txt')

print("train : {}".format(len(train_sentences)))
print("test : {}".format(len(test_sentences)))

# Peek at the first two sentences of each split.
print(train_sentences[:2])
print(test_sentences[:2])



train : 8551
test : 1063
["Our friends won't buy this analysis, let alone the next one we propose.", "One more pseudo generalization and I'm giving up."]
['Bill whistled past the house.', 'The car honked its way down the road.']


# Frequency-based embedding

### LSA (Latent Semantic Analysis)

In [9]:
# LSA (Latent Semantic Analysis): TF-IDF matrix followed by truncated SVD.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD


# Lower-cased copy of the first 100 training sentences.
sens = [sen.lower() for sen in train_sentences[:100]]

vectorizer = TfidfVectorizer(stop_words='english', max_features = 100, max_df = 0.5, smooth_idf = True)

X = vectorizer.fit_transform(sens)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old method on older releases.
# tolist() converts the returned array to plain Python str so printed lists stay clean.
if hasattr(vectorizer, 'get_feature_names_out'):
  words = vectorizer.get_feature_names_out().tolist()
else:
  words = vectorizer.get_feature_names()

print(X.shape)  # (Number of sentences, Number of words)

svd = TruncatedSVD(n_components=20, algorithm= 'randomized', n_iter = 10, random_state = 100)

svd.fit(X)

print(svd.components_.shape)  # shape of V^T: (number of components, number of words)

# Print the k highest-weighted words of every SVD component ("topic").
k = 10
components = svd.components_
for topic in components:
  # argsort() is ascending, so the reversed slice gives the indices of the
  # k largest weights, largest first.
  top_k_topics = [words[i] for i in topic.argsort()[: -k -1 : -1]]

  print(top_k_topics)



(100, 100)
(20, 100)
['coughed', 'harry', 'awake', 'fit', 'pushed', 'yelled', 'nose', 'hours', 'john', 'hoarse']
['building', 'taller', 'got', 'wider', 'tall', 'free', 'sam', 'replacement', 'cried', 'fred']
['road', 'forest', 'rumbled', 'wagon', 'witch', 'vanished', 'went', 'vanishing', 'tunnel', 'trolley']
['yelled', 'hoarse', 'harry', 'hours', 'pushed', 'trail', 'floated', 'cave', 'river', 'day']
['sleep', 'cried', 'sue', 'sang', 'fred', 'verbs', 'study', 'tracked', 'source', 'bled']
['loose', 'wriggled', 'tooth', 'carpet', 'worm', 'cried', 'bled', 'day', 'tiger', 'angry']
['floated', 'hours', 'pushed', 'harry', 'cave', 'river', 'trail', 'fit', 'sang', 'want']
['pulley', 'rope', 'stretched', 'weights', 'cried', 'day', 'sd', 'sf', 'flowers', 'gardener']
['flat', 'metal', 'watered', 'gardener', 'flowers', 'fred', 'source', 'tracked', 'cried', 'day']
['drank', 'pub', 'cried', 'bled', 'tiger', 'day', 'fred', 'way', 'angry', 'sf']
['floated', 'cave', 'river', 'awake', 'yelled', 'john', 'n



### LDA (Latent Dirichlet Allocation)

In [9]:
# LDA (Latent Dirichlet Allocation): topic modelling over the first 1000 training sentences.

import nltk
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of 'punkt';
# without it nltk.word_tokenize raises LookupError there.
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# Stopword removal and tokenization
stop_words = stopwords.words('english')
# Set copy for O(1) membership tests; stop_words itself stays a list.
stop_word_set = set(stop_words)

sen_to_tokens = [
  [token for token in nltk.word_tokenize(sen.lower()) if token not in stop_word_set]
  for sen in train_sentences[:1000]
]

print(sen_to_tokens[:5])


# Detokenize (join tokens back into a string), then TF-IDF
detokenized = [' '.join(tokens) for tokens in sen_to_tokens]

vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 1000)
tf_idf_mat = vectorizer.fit_transform(detokenized)

print(tf_idf_mat.shape)


# LDA
# NOTE(review): LDA is usually fitted on raw term counts (CountVectorizer) rather
# than TF-IDF weights, and max_iter = 1 is a single pass over the data -- confirm
# both are intentional.

nTopics = 5

lda = LatentDirichletAllocation(n_components = nTopics, learning_method='online', random_state = 100, max_iter = 1)
lda_topics = lda.fit_transform(tf_idf_mat)

print(lda.components_.shape)

# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out()
# when available and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
  terms_names = vectorizer.get_feature_names_out().tolist()
else:
  terms_names = vectorizer.get_feature_names()

# Print the top_n highest-weighted (term, weight) pairs of every topic.
top_n = 5
for topic in lda.components_:
  # argsort() is ascending, so the reversed slice gives the indices of the
  # top_n largest weights, largest first. float() keeps the printed tuples
  # as plain numbers regardless of the NumPy scalar repr.
  top_list = [
    (terms_names[i], round(float(topic[i]), 2))
    for i in topic.argsort()[:-top_n -1 : -1]
  ]

  print(top_list)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[['friends', 'wo', "n't", 'buy', 'analysis', ',', 'let', 'alone', 'next', 'one', 'propose', '.'], ['one', 'pseudo', 'generalization', "'m", 'giving', '.'], ['one', 'pseudo', 'generalization', "'m", 'giving', '.'], ['study', 'verbs', ',', 'crazier', 'get', '.'], ['day', 'day', 'facts', 'getting', 'murkier', '.']]
(1000, 996)
(5, 996)
[('mary', 22.95), ('john', 8.7), ('eat', 7.82), ('think', 5.97), ('likes', 5.56)]
[('sally', 8.38), ('love', 6.86), ('problem', 5.61), ('ll', 4.43), ('joe', 4.06)]
[('john', 5.69), ('water', 5.19), ('leave', 4.14), ('baby', 4.01), ('cup', 3.97)]
[('john', 24.12), ('mary', 12.84), ('book', 10.86), ('ball', 7.94), ('left', 7.24)]
[('consider', 5.19), ('man', 5.05), ('talks', 4.34), ('garden', 4.21), ('lobbyists', 4.18)]




# Prediction-based embedding

In [None]:
# Sent2Vec (Sentence to vector)

In [None]:
# Doc2Vec (Document to vector)