In [1]:
import os
import pandas as pd
from gensim import corpora, models, similarities, matutils
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [2]:
# Path to the Kickstarter data file, relative to the notebook's working directory.
fname = os.path.join('Data', 'kickstarter_data.json')
# `orient` must be passed by keyword: pandas >= 2.0 makes every read_json
# argument after the path keyword-only, so the positional form raises TypeError.
df = pd.read_json(fname, orient='records')

In [3]:
# Peek at the URLs of the first few pitches whose `story` field is missing.
df.loc[df['story'].isna(), 'url'].head()

570                https://www.kickstarter.com/projects/1198151177/brooklyns-journey
571              https://www.kickstarter.com/projects/foodasfunctionalart/state-dish
572    https://www.kickstarter.com/projects/oido/oido-optical-illusion-desk-ornament
573                   https://www.kickstarter.com/projects/jwasher/the-other-side-13
574       https://www.kickstarter.com/projects/floorsfilm/if-these-floors-could-talk
Name: url, dtype: object

In [4]:
# Report what share of the pitches carry a story, alongside the raw counts.
total_pitches = len(df)
pitches_with_stories = len(df[df['story'].notna()])
percent_with_story = pitches_with_stories / total_pitches
args = (percent_with_story, pitches_with_stories, total_pitches)
print(
    '{:.1%} of pitches ({:0,.0f} out of {:0,.0f} total) have stories'.format(*args)
)

95.4% of pitches (2,274 out of 2,384 total) have stories


In [4]:
"""
We''ll fill the remaining pitches with their project description
"""
df.story = df.story.fillna(df.project_description)

In [5]:
# Build a bag-of-words representation of the stories and wrap it for gensim.
kickstarter_corpus = df['story'].values

# Unigrams and bigrams, English stop words removed; the token pattern only
# accepts runs of two or more lowercase ASCII letters.
kwargs = {
    'ngram_range': (1, 2),
    'stop_words': 'english',
    'token_pattern': "\\b[a-z][a-z]+\\b",
}
cv = CountVectorizer(**kwargs)
cv.fit(kickstarter_corpus)

# Transpose the vectorizer output before handing it to Sparse2Corpus
# (the printed shape is the transposed matrix's shape).
counts = cv.transform(kickstarter_corpus).T
print(counts.shape)
corpus = matutils.Sparse2Corpus(counts)

# Invert the vectorizer's term -> index vocabulary into index -> term.
id2word = {index: term for term, index in cv.vocabulary_.items()}

(240356, 2385)


In [6]:
# Fit a 10-topic LDA model, making 10 passes over the corpus.
# LDA training is stochastic: without a fixed random_state the topics differ
# on every kernel restart, so the seed is pinned for reproducible reruns.
lda = models.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
    passes=10,
    random_state=42,
)

In [7]:
# Display the fitted topics, limited to 7 words per topic.
lda.print_topics(num_words=7)

[(0,
  '0.004*"la" + 0.004*"el" + 0.004*"en" + 0.004*"que" + 0.002*"una" + 0.002*"es" + 0.001*"para"'),
 (1,
  '0.003*"time" + 0.003*"game" + 0.003*"new" + 0.003*"make" + 0.003*"like" + 0.003*"kickstarter" + 0.003*"book"'),
 (2,
  '0.001*"ditko" + 0.000*"comic" + 0.000*"hokey" + 0.000*"hokey pokey" + 0.000*"pokey" + 0.000*"comic book" + 0.000*"book"'),
 (3,
  '0.001*"chat" + 0.001*"chat daddy" + 0.001*"daddy" + 0.001*"nya" + 0.001*"dog" + 0.001*"coat" + 0.001*"dog coat"'),
 (4,
  '0.001*"tray" + 0.001*"pin" + 0.001*"enamel" + 0.001*"organizer" + 0.001*"minis" + 0.001*"enamel pin" + 0.000*"card"'),
 (5,
  '0.002*"moss" + 0.001*"desk" + 0.001*"tamberlane" + 0.001*"letter" + 0.000*"letter desk" + 0.000*"seat" + 0.000*"storage"'),
 (6,
  '0.004*"pins" + 0.002*"pin" + 0.002*"enamel" + 0.002*"kickstarter" + 0.001*"pledge" + 0.001*"unlocked" + 0.001*"hard enamel"'),
 (7,
  '0.001*"pins" + 0.001*"enamel" + 0.001*"featuring" + 0.001*"series" + 0.000*"wan" + 0.000*"enamel pins" + 0.000*"denim"')