In [14]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB

In [15]:
fname = os.path.join(
    'Data',
    'kickstarter_data.json'
)
df = pd.read_json(fname, 'records')

In [16]:
total_pitches = df.shape[0]
pitches_with_backer_data = df.loc[df.goal_and_pledged_backers != '', ].shape[0]
percent_with_backer_data = pitches_with_backer_data / total_pitches
args = (percent_with_backer_data, pitches_with_backer_data, total_pitches)
print('{:.1%} of pitches ({:0,.0f} out of {:0,.0f} total) have backer data'.format(*args))

51.4% of pitches (1,227 out of 2,385 total) have backer data


In [18]:
df.loc[df.goal_and_pledged_backers == '', 'url'].head()

1                         https://www.kickstarter.com/projects/pincessemmifer/wendy-darling-fantasy-pin
3                                 https://www.kickstarter.com/projects/totteoki/manuscript-planner-2020
5                       https://www.kickstarter.com/projects/jrmagnetics/rf-broadband-noise-generator-2
9     https://www.kickstarter.com/projects/whiskeydickgame/whiskey-dick-the-x-rated-adult-drinking-game
16                          https://www.kickstarter.com/projects/garymarx/oil-painting-and-wine-sipping
Name: url, dtype: object

In [9]:
"""
Remove pitches without any goal, pledged and backer data
"""
mask = df.goal_and_pledged_backers != ''
df = (df.loc[mask, :]
    .reset_index()
)

In [10]:
pat = r'(?P<pledged>.+)\npledged of (?P<goal>.+) goal\n[,\d]+\nbackers?'
df[['pledged', 'goal']] = df.goal_and_pledged_backers.str.extract(pat)

In [11]:
"""
Convert pledged and goals into respective currencies and amount
Output to csv to check in excel
"""
df = df.assign(
    pledged_currency=lambda x: x.pledged.str.extract('([^\d]+)'),
    pledged_amount=lambda x: x.pledged.astype(str).map(lambda x: ''.join([i for i in x if i.isdigit()])),
    goal_currency=lambda x: x.goal.str.extract('([^\d]+)'),
    goal_amount=lambda x: x.goal.astype(str).map(lambda x: ''.join([i for i in x if i.isdigit()])).astype(int),
)
columns = [
    'goal_and_pledged_backers',
    'pledged',
    'pledged_currency',
    'pledged_amount',
    'goal',
    'goal_currency',
    'goal_amount',
]
test_fname = os.path.join(
    'Data',
    'ks_goal_pledged_test.csv'
)
(df.loc[:, columns]
    .to_csv(test_fname)
)

In [12]:
df.loc[:, 'success'] = np.where(df.pledged_amount >= df.goal_amount, 1, 0)

TypeError: '>=' not supported between instances of 'str' and 'int'

In [79]:
df.groupby('success').size()

success
0      99
1    1229
dtype: int64

In [72]:
documents = df.project_description.values
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
doc_vectors = vectorizer.fit_transform(documents)

In [73]:
model = MultinomialNB().fit(doc_vectors, df.loc[:, 'success'])

In [76]:
model.predict(doc_vectors).sum()

1328

In [81]:
"""
Currently the model is only predicted success'
Need to get more failures
"""

"\nCurrently the model is only predicted success'\nNeed to get more failures\n"