In [7]:
import os
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Location of the Kickstarter JSON file, relative to the working directory.
fname = os.path.join(
    'Data',
    'kickstarter_data.json'
)
# `orient` is passed by keyword: pandas 2.0+ no longer accepts positional
# arguments after the path in `read_json`.
df = pd.read_json(fname, orient='records')

In [35]:
"""
A lot of these are blank
"""
# Expected layout: "<pledged>\npledged of <goal> goal\n<count>\nbacker(s)".
pat = r'.+\npledged of .+ goal\n[,\d]+\nbackers?'
# Select the rows that do NOT fit that layout once, then report how many
# there are and which distinct values they hold.
unmatched = df.loc[~df.goal_and_pledged_backers.str.match(pat), 'goal_and_pledged_backers']
print(unmatched.shape)
unmatched.unique()

(1059,)


array([''], dtype=object)

In [42]:
"""
Remove pitches without any goal, pledged and backer data
"""
# Keep only the rows whose goal/pledged/backers text is non-empty.
mask = df.goal_and_pledged_backers != ''
# drop=True discards the old row labels instead of inserting them as a new
# 'index' column, so the frame gains no stray column and re-running this
# cell gives the same result every time.
df = df.loc[mask, :].reset_index(drop=True)

In [50]:
# The named groups capture the pledged and goal text; the extracted frame
# carries those group names as its column names.
pat = r'(?P<pledged>.+)\npledged of (?P<goal>.+) goal\n[,\d]+\nbackers?'
pledged_and_goal = df.goal_and_pledged_backers.str.extract(pat)
df.loc[:, ['pledged', 'goal']] = pledged_and_goal

In [66]:
"""
Convert pledged and goals into respective currencies and amount
Output to csv to check in excel
"""
def _digits_only(text):
    """Return the digit characters of `text`, in order, as one string."""
    # NOTE(review): a decimal point is dropped as well, so '12.50' would
    # become '1250' -- confirm the amounts are always whole numbers.
    return ''.join(ch for ch in text if ch.isdigit())


# Split each money string into its leading non-digit run (the currency) and
# its digits (the amount).
# - The pattern is a raw string: '\d' in a plain literal is an invalid
#   escape sequence.
# - expand=False makes `str.extract` return a Series for the single capture
#   group rather than a one-column DataFrame.
df = df.assign(
    pledged_currency=lambda frame: frame.pledged.str.extract(r'([^\d]+)', expand=False),
    pledged_amount=lambda frame: frame.pledged.map(_digits_only).astype(int),
    goal_currency=lambda frame: frame.goal.str.extract(r'([^\d]+)', expand=False),
    goal_amount=lambda frame: frame.goal.map(_digits_only).astype(int),
)
# Raw text next to the parsed pieces, so the parsing can be checked by eye.
columns = [
    'goal_and_pledged_backers',
    'pledged',
    'pledged_currency',
    'pledged_amount',
    'goal',
    'goal_currency',
    'goal_amount',
]
test_fname = os.path.join(
    'Data',
    'ks_goal_pledged_test.csv'
)
df.loc[:, columns].to_csv(test_fname)

In [69]:
# Label a pitch 1 when its pledged amount reaches the goal amount, else 0.
reached_goal = df.pledged_amount >= df.goal_amount
df.loc[:, 'success'] = np.where(reached_goal, 1, 0)

In [79]:
# Class balance: number of pitches per success label.
success_counts = df.groupby('success').size()
success_counts

success
0      99
1    1229
dtype: int64

In [72]:
# TF-IDF features over unigrams and bigrams of each project description,
# with English stop words removed.
documents = df.project_description.values
vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1, 2),
)
doc_vectors = vectorizer.fit_transform(documents)

In [73]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF vectors, using the
# 0/1 success column as the target.
labels = df.loc[:, 'success']
classifier = MultinomialNB()
model = classifier.fit(doc_vectors, labels)

In [76]:
# Number of pitches predicted successful, scored on the same vectors the
# model was fitted on.
predicted = model.predict(doc_vectors)
predicted.sum()

1328

In [81]:
"""
Currently the model only predicts success
Need to get more failures
"""

"\nCurrently the model only predicts success\nNeed to get more failures\n"