In [20]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [2]:
fname = os.path.join(
    'Data',
    'kickstarter_data.json'
)
df = pd.read_json(fname, 'records')

In [3]:
total_pitches = df.shape[0]
pitches_with_backer_data = df.loc[df.goal_and_pledged_backers != '', ].shape[0]
percent_with_backer_data = pitches_with_backer_data / total_pitches
args = (percent_with_backer_data, pitches_with_backer_data, total_pitches)
print('{:.1%} of pitches ({:0,.0f} out of {:0,.0f} total) have backer data'.format(*args))

100.0% of pitches (2,384 out of 2,384 total) have backer data


In [4]:
df.loc[df.goal_and_pledged_backers == '', 'url'].head()

Series([], Name: url, dtype: object)

In [5]:
"""
Remove pitches without any goal, pledged and backer data
"""
mask = df.goal_and_pledged_backers != ''
df = (df.loc[mask, :]
    .reset_index()
)

In [6]:
pat = r'(?P<pledged>.+)\npledged of (?P<goal>.+) goal\n[,\d]+\nbackers?'
df[['pledged', 'goal']] = df.goal_and_pledged_backers.str.extract(pat)

In [10]:
"""
Convert pledged and goals into respective currencies and amount
Output to csv to check in excel
"""
df = df.assign(
    pledged_currency=lambda x: x.pledged.str.extract('([^\d]+)'),
    pledged_amount=lambda x: x.pledged.astype(str).map(lambda x: ''.join([i for i in x if i.isdigit()])).astype(int),
    goal_currency=lambda x: x.goal.str.extract('([^\d]+)'),
    goal_amount=lambda x: x.goal.astype(str).map(lambda x: ''.join([i for i in x if i.isdigit()])).astype(int),
)
columns = [
    'goal_and_pledged_backers',
    'pledged',
    'pledged_currency',
    'pledged_amount',
    'goal',
    'goal_currency',
    'goal_amount',
]
test_fname = os.path.join(
    'Data',
    'ks_goal_pledged_test.csv'
)
(df.loc[:, columns]
    .to_csv(test_fname)
)

In [11]:
df.loc[:, 'success'] = np.where(df.pledged_amount >= df.goal_amount, 1, 0)

In [13]:
"""
Balanced Classes!
"""
df.groupby('success').size()

success
0    1149
1    1235
dtype: int64

In [22]:
documents = df.project_description.values
vectorizer = TfidfVectorizer(stop_words="english", ngram_range=(1,2))
doc_vectors = vectorizer.fit_transform(documents).toarray()
y_true = df.loc[:, 'success']

In [23]:
model = GaussianNB().fit(doc_vectors, y_true)

In [32]:
y_pred = model.predict(doc_vectors)
"Accuracy Score:{:-10.2%}".format(accuracy_score(y_true, y_pred))

'Accuracy Score:    99.62%'