In [117]:
# Imports

import os
import pickle
import datetime

import numpy as np
import pandas as pd
import spacy

from datetime import datetime
from spacy.tokenizer import Tokenizer
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from scipy.stats import randint
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from category_encoders import OneHotEncoder

In [118]:
# Loading .csv

# Read the Kickstarter projects export from the working directory.
df = pd.read_csv(filepath_or_buffer="ks-projects-201801.csv")

In [119]:
# Dropping features, filtering down to only useful observations

# Remove the pledge/backer columns and the converted-goal column.
df = df.drop(
    columns=["pledged", "backers", "usd pledged", "usd_pledged_real", "usd_goal_real"]
)

# Keep rows priced in USD, then rows whose country is US.
mask = df['currency'] == 'USD'
df = df.loc[mask]
mask = df['country'] == 'US'
df = df.loc[mask]

# Keep only campaigns whose state is 'successful' or 'failed'.
df = df[df['state'].isin(['successful', 'failed'])]

In [120]:
# Campaign length feature engineered

df['deadline'] = pd.to_datetime(df['deadline'])
df['launched'] = pd.to_datetime(df['launched'])

# Whole days between launch and deadline, read from the timedelta itself.
# Slicing the first two characters of its string form instead would turn
# "100 days ..." into "10" and leave a trailing space on single-digit spans.
df['campaign length'] = (df['deadline'] - df['launched']).dt.days

In [121]:
# Dropping other features

# The date columns, the filter columns, the ID and main_category are
# removed here; they are not used by the cells that follow.
df = df.drop(
    columns=[
        'deadline',
        'launched',
        'country',
        'ID',
        'currency',
        'main_category',
    ]
)

In [122]:
# Using small sample of dataframe to reduce computational time

# Fixed seed so the same rows are drawn on every run. The size is capped at
# the number of rows available because sample() raises when n exceeds it
# (sampling without replacement).
df = df.sample(n=min(1000, len(df)), random_state=1)

In [124]:
# NLP for vectorization, pipeline using OHE and GBC predictor

# spaCy model used to turn text into document vectors.
nlp = spacy.load("en_core_web_md")

# The loss is left at its default: the 'deviance' spelling is deprecated and
# rejected by newer scikit-learn releases, while the default is the same
# objective in both old and new versions.
gbc = GradientBoostingClassifier()

# No explicit column list is given to the encoder.
ohe = OneHotEncoder()

pipe = Pipeline([
    ('encoder', ohe),
    ('clf', gbc)
    ])

In [125]:
# Vectorization of text feature

# Each name is passed through str() before being handed to spaCy; the
# resulting document vectors become the rows of df2.
name_docs = (nlp(str(title)) for title in df['name'])
df2 = pd.DataFrame([doc.vector for doc in name_docs])

In [134]:
# Adding features onto vector dataframe

# df takes over df2's index so the column assignments below line up row by row.
df.index = df2.index

for passthrough_column in ('category', 'goal'):
    df2[passthrough_column] = df[passthrough_column]

campaign_days = df['campaign length'].astype(int)
df2['campaign length'] = campaign_days

In [135]:
# Feature matrix and target

# X is the vector dataframe with the appended columns; y is the 'state' column.
X, y = df2, df['state']

In [136]:
# Train-test split

# 80/20 shuffled split with a fixed seed, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
    shuffle=True,
    stratify=y,
)

In [137]:
# Fitting model

# Fit on the training split only, so the rows held out by train_test_split
# stay unseen by the model and remain usable for evaluation.
pipe.fit(X_train, y_train)

  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('encoder',
                 OneHotEncoder(cols=['category', 'campaign length'])),
                ('clf', GradientBoostingClassifier())])

In [139]:
# Pickling model

# The context manager closes (and therefore flushes) the file as soon as the
# dump finishes, so the pickle is complete on disk before anything reads it.
with open('gbnlp_pickle', 'wb') as gbnlp:
    pickle.dump(pipe, gbnlp)

In [140]:
# Loading pickled model

# NOTE: unpickling can execute arbitrary code, so only load pickle files
# from a trusted source.
# The context manager releases the file handle once loading is done.
with open('gbnlp_pickle', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [141]:
# Check the type of a one-row slice of the feature frame
type(df2.iloc[:1])

pandas.core.frame.DataFrame

In [142]:
# Predict on the first row of the feature matrix with the unpickled model
loaded_model.predict(X.iloc[:1])

array(['failed'], dtype=object)

In [181]:
# Hand-written example campaign: numeric inputs first, then text inputs
goal, length = 500, 30
description, category = 'Do it', 'Game'

In [182]:
# Run the description through spaCy and wrap its document vector in a DataFrame
description_doc = nlp(description)
desc_vectorized = pd.DataFrame(description_doc.vector)

In [183]:
# Transpose the description vector and append the remaining inputs as
# columns, in the same order they were added to the vector dataframe.
X_new = desc_vectorized.T.assign(
    category=category,
    goal=goal,
    **{'campaign length': length},
)

In [184]:
# Display the assembled feature frame for the example campaign
X_new

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,293,294,295,296,297,298,299,category,goal,campaign length
0,-0.006584,0.142384,-0.325879,-0.163085,0.007788,0.021567,-0.104288,-0.079318,-0.15804,2.58415,...,-0.113279,0.101291,-0.08756,-0.18719,-0.1184,0.357899,0.1733,Game,500,30


In [185]:
# Predict the outcome for the example campaign with the unpickled pipeline
loaded_model.predict(X_new)

array(['successful'], dtype=object)