In [None]:
%load_ext autoreload
%autoreload 2

In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import pandas as pd
from tqdm import tqdm
import spacy
from gensim import corpora, models
import os
os.chdir("Documents/GitHub/Topic-Mapping")
print(os.getcwd())
from src.utils import preprocess, get_windows

import requests
requests.packages.urllib3.disable_warnings()
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Legacy Python that doesn't verify HTTPS certificates by default
    pass
else:
    # Handle target environment that doesn't support HTTPS verification
    ssl._create_default_https_context = _create_unverified_https_context

/Users/carterward/Documents/GitHub/Topic-Mapping


In [2]:
MIN_COUNTS = 20
MAX_COUNTS = 1800
# words with count < MIN_COUNTS
# and count > MAX_COUNTS
# will be removed

MIN_LENGTH = 15
# minimum document length 
# (number of words)
# after preprocessing

# half the size of the context around a word
HALF_WINDOW_SIZE = 5
# it must be that 2*HALF_WINDOW_SIZE < MIN_LENGTH

# Load NLP model

In [3]:
nlp = spacy.load('en')

# Load dataset

In [4]:
# dataset = fetch_20newsgroups(subset='all', remove=('headers', 'footers', 'quotes'))
# docs = dataset['data']
dataset = pd.read_csv("data/song_df.csv")
docs = dataset["tokenized_lyrics"].values
docs = [doc for doc in docs if not pd.isna(doc)]

In [5]:
type(docs[0])

str

In [6]:
# number of documents
len(docs)

9568

In [7]:
# store an index with a document
docs = [(i, doc) for i, doc in enumerate(docs)]

# Preprocess dataset and create windows

In [8]:
docs

ad backache head feel new form night stick ice pack sit maniac pound muscle horn remember smilin kick mud bright light recall cussin lose control send high flight money fame cause throw ground bloody reason walk away stumble pick joint pop different place know highway track mind stick replay start real mad lose track buckle big check remember time land pile b.s money fame cause throw ground bloody reason walk away money fame cause throw ground bloody reason walk away walk away walk away walk away'),
 (966,
  'laughin talkin window shopping new guy hear hear find right goodbye come real close lose temper bit tongue cool remember care matter love anymore know slip mind tell time guess forget second care 10:00 know car park drive pull knock door piece mind drivin place wonderin far way care matter love anymore know slip mind tell time guess forget second care home light blinkin old machine brother town leave miss ring care matter love mind hopin know late drivin care care'),
 (967,
  'hom

In [10]:
encoded_docs, decoder, word_counts = preprocess(
    docs, nlp, MIN_LENGTH, MIN_COUNTS, MAX_COUNTS
)

100%|██████████| 9568/9568 [00:00<00:00, 57236.19it/s]
number of removed short documents: 94
total number of tokens: 1102117
number of tokens to be removed: 481116
number of additionally removed short documents: 354
total number of tokens: 617309

minimum word count number: 17
this number can be less than MIN_COUNTS because of document removal


In [11]:
# new ids will be created for the documents.
# create a way of restoring initial ids:
doc_decoder = {i: doc_id for i, (doc_id, doc) in enumerate(encoded_docs)}

In [12]:
data = []
# new ids are created here
for index, (_, doc) in tqdm(enumerate(encoded_docs)):
    windows = get_windows(doc, HALF_WINDOW_SIZE)
    # index represents id of a document, 
    # windows is a list of (word, window around this word),
    # where word is in the document
    data += [[index, w[0]] + w[1] for w in windows]

data = np.array(data, dtype='int64')

9120it [00:01, 7474.45it/s]


In [13]:
# a row in 'data' contains:
# id of a document, id of a word in this document, a window around this word
# 1 + 1 + 10
data.shape[1]

12

In [14]:
# number of windows (equals to the total number of tokens)
data.shape[0]

617309

# Get unigram distribution

In [15]:
word_counts = np.array(word_counts)
unigram_distribution = word_counts/sum(word_counts)

# Prepare word vectors

In [16]:
%%time
vocab_size = len(decoder)
embedding_dim = 50

# train a skip-gram word2vec model
texts = [[str(j) for j in doc] for i, doc in encoded_docs]
model = models.Word2Vec(texts, size=embedding_dim, window=5, workers=4, sg=1, negative=15, iter=70)
model.init_sims(replace=True)

word_vectors = np.zeros((vocab_size, embedding_dim)).astype('float32')
for i in decoder:
    word_vectors[i] = model.wv[str(i)]

CPU times: user 11min 9s, sys: 4.69 s, total: 11min 13s
Wall time: 2min 59s


In [17]:
# number of unique words
vocab_size

4484

# Prepare initialization for document weights

In [18]:
texts = [[decoder[j] for j in doc] for i, doc in encoded_docs]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

In [19]:
%%time
n_topics = 25
lda = models.LdaModel(corpus, alpha=0.9, id2word=dictionary, num_topics=n_topics)
corpus_lda = lda[corpus]

CPU times: user 6.07 s, sys: 74.8 ms, total: 6.15 s
Wall time: 6.24 s


In [20]:
for i, topics in lda.show_topics(n_topics, formatted=False):
    print('topic', i, ':', ' '.join([t for t, _ in topics]))

topic 0 : bear help easy shame send wild street soul door sun
topic 1 : sick ask sunshine problem lookin news wake summer babe black
topic 2 : hell kiss half kind skin cold country booty bom slip
topic 3 : jump jesus roll straight young step single somebody peace problem
topic 4 : rock bye crazy number double bump fast hustle aye roll
topic 5 : dog shake lonely star clear movin roll remember solo body
topic 6 : miss kill fun poor band bang d horse bill true
topic 7 : shine fight catch sign window river blue word heat dancin
topic 8 : christmas gimme air happy oo glad true birthday fallin road
topic 9 : burn pum king belong pa game rum hair perfect city
topic 10 : party city lover sky ass enemy nah mornin going boom
topic 11 : sing lord forever joy count power follow bring lift pain
topic 12 : fly hoo de set lonely eat babe sweet fire hole
topic 13 : rain town wish sweet hurt alright truth arm touch beautiful
topic 14 : hot smoke black shoot water rise breathe dogg pop cop
topic 15 : de

In [21]:
doc_weights_init = np.zeros((len(corpus_lda), n_topics))
for i in tqdm(range(len(corpus_lda))):
    topics = corpus_lda[i]
    for j, prob in topics:
        doc_weights_init[i, j] = prob

100%|██████████| 9120/9120 [00:05<00:00, 1783.95it/s]


# Save data

In [25]:
np.save('model/lyrics/data.npy', data)
np.save('model/lyrics/word_vectors.npy', word_vectors)
np.save('model/lyrics/unigram_distribution.npy', unigram_distribution)
np.save('model/lyrics/decoder.npy', decoder)
np.save('model/lyrics/doc_decoder.npy', doc_decoder)
np.save('model/lyrics/doc_weights_init.npy', doc_weights_init)