In [17]:
import os
import pandas as pd
from gensim import corpora, models, similarities, matutils
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [31]:
# Path to the Kickstarter JSON file, relative to the notebook's working
# directory.
fname = os.path.join(
    'Data',
    'kickstarter_data.json'
)
# `orient` must be passed by keyword: pandas 2.0 made every read_json argument
# after the path keyword-only, so the positional form raises TypeError there.
df = pd.read_json(fname, orient='records')
# Values of the `project_description` column as an array; this is the text
# corpus handed to the vectorizer.
kickstarter_corpus = df.project_description.values

In [38]:
# Vectorizer settings: count unigrams and bigrams, drop scikit-learn's built-in
# English stop-word list, and only accept tokens that are runs of two or more
# letters a-z (the regex is one letter followed by at least one more).
kwargs = dict(
    ngram_range=(1, 2),
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
)
cv = CountVectorizer(**kwargs)

# fit_transform learns the vocabulary and builds the count matrix in a single
# pass; calling fit() and then transform() tokenizes the whole corpus twice for
# the same result.
# The transpose puts vocabulary terms on the first axis, because Sparse2Corpus
# treats each column as one document by default.
counts = cv.fit_transform(kickstarter_corpus).transpose()
print(counts.shape)
corpus = matutils.Sparse2Corpus(counts)

# vocabulary_ maps term -> column index; invert it so the id2word mapping
# passed to the topic model goes from index to term.
id2word = {index: term for term, index in cv.vocabulary_.items()}

(15721, 2387)


In [45]:
# LDA training is stochastic. Pin the seed so that re-running the notebook
# from a fresh kernel reproduces the same topics; without it the topic table
# changes on every run.
lda = models.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
    passes=10,
    random_state=42,
)

In [46]:
# Show the 7 highest-weighted terms per topic. The value is left as the cell's
# last expression so the notebook renders it.
topic_terms = lda.print_topics(num_words=7)
topic_terms

[(0,
  '0.004*"deck" + 0.004*"card" + 0.003*"new" + 0.003*"pc" + 0.003*"game" + 0.003*"playing card" + 0.002*"audio"'),
 (1,
  '0.005*"change" + 0.005*"collection" + 0.005*"help" + 0.004*"new" + 0.004*"world" + 0.003*"make" + 0.003*"foot"'),
 (2,
  '0.016*"enamel" + 0.010*"pin" + 0.008*"enamel pin" + 0.007*"pins" + 0.006*"enamel pins" + 0.005*"series" + 0.005*"designed"'),
 (3,
  '0.004*"game" + 0.004*"book" + 0.003*"life" + 0.003*"best" + 0.003*"use" + 0.003*"looking" + 0.003*"products"'),
 (4,
  '0.012*"pins" + 0.009*"enamel pins" + 0.009*"enamel" + 0.004*"hard" + 0.004*"hard enamel" + 0.004*"set" + 0.004*"inspired"'),
 (5,
  '0.004*"cats" + 0.004*"series" + 0.003*"album" + 0.003*"featuring" + 0.003*"unique" + 0.003*"comic" + 0.003*"sci fi"'),
 (6,
  '0.004*"rpg" + 0.004*"camera" + 0.004*"zine" + 0.003*"tabletop" + 0.003*"set" + 0.003*"dice" + 0.003*"old"'),
 (7,
  '0.004*"film" + 0.004*"custom" + 0.003*"help" + 0.003*"friends" + 0.003*"lifetime" + 0.003*"female" + 0.002*"world"'),
 