In [1]:
import os
import pandas as pd
from gensim import corpora, models, similarities, matutils
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [2]:
# Load the Kickstarter pitches from the JSON file under Data/ into a DataFrame.
fname = os.path.join('Data', 'kickstarter_data.json')
# `orient` must be passed by keyword: pandas 1.x warns on the positional form
# and pandas 2.x rejects it (everything after the path is keyword-only).
df = pd.read_json(fname, orient='records')

In [3]:
# Peek at the URLs of the first few pitches whose `story` field is missing.
missing_story = df['story'].isna()
df.loc[missing_story, 'url'].head()

570                https://www.kickstarter.com/projects/1198151177/brooklyns-journey
571              https://www.kickstarter.com/projects/foodasfunctionalart/state-dish
572    https://www.kickstarter.com/projects/oido/oido-optical-illusion-desk-ornament
573                   https://www.kickstarter.com/projects/jwasher/the-other-side-13
574       https://www.kickstarter.com/projects/floorsfilm/if-these-floors-could-talk
Name: url, dtype: object

In [4]:
# Report what share of the pitches come with a story.
total_pitches = len(df)
# Count the non-null stories directly instead of slicing the frame first.
pitches_with_stories = int(df['story'].notna().sum())
percent_with_story = pitches_with_stories / total_pitches
args = (percent_with_story, pitches_with_stories, total_pitches)
template = '{:.1%} of pitches ({:0,.0f} out of {:0,.0f} total) have stories'
print(template.format(*args))

95.4% of pitches (2,274 out of 2,384 total) have stories


In [5]:
"""
We'll fill the remaining pitches with their project description
"""
# fillna aligns on the index, so each pitch without a story receives the
# project description from its own row; existing stories are left untouched.
df['story'] = df['story'].fillna(df['project_description'])

In [6]:
# Build a bag-of-words representation (unigrams + bigrams) of every pitch
# story and convert it into the corpus / id2word pair consumed by gensim.
kickstarter_corpus = df.story.values
kwargs = dict(
    ngram_range=(1, 2),                  # single words and two-word phrases
    stop_words='english',                # drop scikit-learn's English stop words
    token_pattern="\\b[a-z][a-z]+\\b",   # tokens of 2+ letters, starting with a-z
)
cv = CountVectorizer(**kwargs)
# fit_transform learns the vocabulary and produces the counts in one pass;
# calling fit() and then transform() tokenized the whole corpus twice.
# The transpose makes the matrix terms x documents, which is the layout
# handed to Sparse2Corpus below (one column per pitch).
counts = cv.fit_transform(kickstarter_corpus).T
print(counts.shape)
corpus = matutils.Sparse2Corpus(counts)
# gensim needs the inverse of scikit-learn's term -> column-index mapping.
id2word = {idx: term for term, idx in cv.vocabulary_.items()}

(631174, 2384)


In [7]:
# Fit a 10-topic LDA model on the bag-of-words corpus (10 passes over the data).
# LDA training is stochastic: without a fixed random_state the topics (and
# their numbering) change on every Restart & Run All, so the seed is pinned.
lda = models.LdaModel(
    corpus=corpus,
    num_topics=10,
    id2word=id2word,
    passes=10,
    random_state=42,
)

In [8]:
# Show the seven highest-weighted terms for every topic in the fitted model.
lda.print_topics(num_topics=lda.num_topics, num_words=7)

[(0,
  '0.005*"que" + 0.005*"la" + 0.004*"und" + 0.004*"die" + 0.004*"en" + 0.004*"el" + 0.003*"es"'),
 (1,
  '0.004*"iron" + 0.003*"pans" + 0.002*"cookware" + 0.002*"wrought" + 0.002*"solidteknics" + 0.001*"cast iron" + 0.001*"cast"'),
 (2,
  '0.001*"space pen" + 0.001*"pensv" + 0.000*"fisher space" + 0.000*"tinkle" + 0.000*"tinkle topper" + 0.000*"topper" + 0.000*"cigarettes"'),
 (3,
  '0.007*"book" + 0.004*"game" + 0.004*"cards" + 0.003*"art" + 0.003*"new" + 0.003*"world" + 0.002*"kickstarter"'),
 (4,
  '0.001*"compatible" + 0.001*"tested" + 0.001*"verified" + 0.001*"tested verified" + 0.001*"primes" + 0.001*"lens" + 0.001*"expected"'),
 (5,
  '0.000*"sauce" + 0.000*"chat daddy" + 0.000*"daddy" + 0.000*"chat" + 0.000*"sternum strap" + 0.000*"sternum" + 0.000*"honbinos"'),
 (6,
  '0.000*"featherknight" + 0.000*"inch hard" + 0.000*"plating black" + 0.000*"comes backing" + 0.000*"black plating" + 0.000*"pin black" + 0.000*"pin comes"'),
 (7,
  '0.000*"mw" + 0.000*"generation" + 0.000*"