In [1]:
import os
import pandas as pd
from gensim import corpora, models, similarities, matutils
from sklearn import datasets
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans

In [2]:
fname = os.path.join(
    'Data',
    'kickstarter_data.json'
)
df = pd.read_json(fname, 'records')

In [3]:
total_pitches = df.shape[0]
pitches_with_stories = df.loc[~df.story.isna(), ].shape[0]
percent_with_story = pitches_with_stories / total_pitches
args = (percent_with_story, pitches_with_stories, total_pitches)
print('{:.1%} of pitches ({:0,.0f} out of {:0,.0f} total) have stories'.format(*args))

95.4% of pitches (2,274 out of 2,384 total) have stories


In [4]:
"""
We''ll fill the remaining pitches with their project description
"""
df.story = df.story.fillna(df.project_description)

In [5]:
kickstarter_corpus = df.story.values
kwargs = dict(
    ngram_range=(1, 2),  
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
)
cv = CountVectorizer(**kwargs)
cv.fit(kickstarter_corpus)
counts = cv.transform(kickstarter_corpus).transpose()
print(counts.shape)
corpus = matutils.Sparse2Corpus(counts)
id2word = dict((v, k) for k, v in cv.vocabulary_.items())

(631174, 2384)


In [6]:
lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=10)
lda.print_topics(num_words=20)

[(0,
  '0.002*"brown" + 0.002*"wallet" + 0.002*"cards" + 0.002*"black" + 0.002*"available" + 0.001*"cash" + 0.001*"blue" + 0.001*"credit" + 0.001*"black brown" + 0.001*"available black" + 0.001*"red" + 0.001*"leather" + 0.001*"card" + 0.001*"redskins" + 0.001*"cowboys" + 0.001*"brown blue" + 0.001*"slots" + 0.001*"grey" + 0.001*"credit cards" + 0.001*"available brown"'),
 (1,
  '0.001*"stove" + 0.000*"youth culture" + 0.000*"lash" + 0.000*"sternum strap" + 0.000*"sternum" + 0.000*"fuel" + 0.000*"core" + 0.000*"puk" + 0.000*"core puk" + 0.000*"cis" + 0.000*"lash binder" + 0.000*"museum youth" + 0.000*"binder" + 0.000*"pot" + 0.000*"shart" + 0.000*"taped" + 0.000*"aurora" + 0.000*"simmer" + 0.000*"core aurora" + 0.000*"youth"'),
 (2,
  '0.003*"use" + 0.002*"design" + 0.002*"product" + 0.002*"skin" + 0.002*"water" + 0.002*"products" + 0.002*"watch" + 0.002*"oil" + 0.002*"bag" + 0.002*"table" + 0.002*"strap" + 0.001*"easy" + 0.001*"used" + 0.001*"technology" + 0.001*"time" + 0.001*"using" 

In [7]:
print('Topic 5 is French and Spanish let\'s get those out of here!')

"\nTopic 5 is French and Spanish let's get those out of here!\n"

In [8]:
other_language_mask = df.story.str.contains(' le | la | je | das ') # picked a subset of words from other languages to filter with
print("There are {} projects in different languages".format(df.loc[other_language_mask, ].shape[0]))
df = df.loc[~other_language_mask, ]

There are 125 projects in different languages


In [9]:
kickstarter_corpus = df.story.values
kwargs = dict(
    ngram_range=(1, 2),  
    stop_words='english',
    token_pattern="\\b[a-z][a-z]+\\b",
)
cv = CountVectorizer(**kwargs)
cv.fit(kickstarter_corpus)
counts = cv.transform(kickstarter_corpus).transpose()
print(counts.shape)
corpus = matutils.Sparse2Corpus(counts)
id2word = dict((v, k) for k, v in cv.vocabulary_.items())
lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=10)
lda.print_topics(num_words=20)

(574379, 2259)


[(0,
  '0.004*"game" + 0.003*"new" + 0.003*"time" + 0.003*"make" + 0.003*"like" + 0.003*"book" + 0.003*"kickstarter" + 0.003*"project" + 0.003*"help" + 0.003*"want" + 0.003*"world" + 0.002*"just" + 0.002*"film" + 0.002*"work" + 0.002*"design" + 0.002*"people" + 0.002*"life" + 0.002*"play" + 0.002*"cards" + 0.002*"campaign"'),
 (1,
  '0.001*"bamboo" + 0.001*"fact bamboo" + 0.000*"fun fact" + 0.000*"gaxmoor" + 0.000*"undies" + 0.000*"tea" + 0.000*"delorean" + 0.000*"lost city" + 0.000*"elysium" + 0.000*"noise" + 0.000*"world arena" + 0.000*"bum" + 0.000*"fart" + 0.000*"uball" + 0.000*"gong fu" + 0.000*"gong" + 0.000*"arena issue" + 0.000*"fu" + 0.000*"gygax" + 0.000*"city gaxmoor"'),
 (2,
  '0.001*"fact check" + 0.000*"extra spicy" + 0.000*"nz" + 0.000*"duck" + 0.000*"zorro" + 0.000*"brown" + 0.000*"guayusa" + 0.000*"check book" + 0.000*"mount starling" + 0.000*"starling" + 0.000*"tremolo" + 0.000*"nexus drawing" + 0.000*"drawing pad" + 0.000*"nexus" + 0.000*"mordux" + 0.000*"kittens" + 

In [10]:
# Load the LDA model from sk-learn
from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print("\nTopic #%d:" % topic_idx)
        print(" ".join([words[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Tweak the two parameters below
number_topics = 5
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(counts)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, cv, number_words)

Topics found via LDA:

Topic #0:
abortion reporting able come absolutely fine acadia black access intended ability magnetize access able download ability activate accelerates opposite

Topic #1:
absolute best able ship accessibility sustainable abilities ranked access studio able listen access clientele able share accelerator consensys access anime

Topic #2:
absurd comedy academy headmaster able attend abandoned disenfranchised abrupt change abs post able remove abomination accept people abilities aren

Topic #3:
access enhances able endure abbotsford smoking abiding accelerations spin abilities protect abilities willencrad accents really access discord absolute comics

Topic #4:
abbotsford local abstract theme ability match able reference access camera ability design abuse doesn able colors abandoned homes ability replicating


In [11]:
%%time
from pyLDAvis import sklearn as sklearn_lda
import pickle 
import pyLDAvis
LDAvis_data_filepath = os.path.join('ldavis_prepared_'+str(number_topics))

LDAvis_prepared = sklearn_lda.prepare(lda, counts, cv)
with open(LDAvis_data_filepath, 'w') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath) as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(number_topics) +'.html')

AssertionError: Term frequencies and vocabulary are of different sizes, 2259 != 574379.