In [1]:
import pandas as pd
import numpy as np
import datetime

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder

from sklearn.linear_model import RidgeClassifier

In [2]:
# Load the base export plus the numbered shards Kickstarter001.csv ... Kickstarter057.csv.
# The base file is required; a numbered shard that does not exist is skipped.
frames = [pd.read_csv('Kickstarter_dir/Kickstarter.csv')]

for i in range(1, 58):
    # {i:03d} zero-pads to three digits (1 -> 001, 57 -> 057).
    path = f'Kickstarter_dir/Kickstarter{i:03d}.csv'

    try:
        frames.append(pd.read_csv(path))
    except FileNotFoundError:
        # Only a missing shard is tolerated. Parse errors, permission errors, etc.
        # now surface instead of silently dropping that shard's rows.
        continue

# Concatenate once at the end rather than re-copying the growing frame on every
# iteration. Row labels are kept as read (no ignore_index), as before.
df_kickstarter = pd.concat(frames)

FileNotFoundError: [Errno 2] No such file or directory: 'Kickstarter_dir/Kickstarter.csv'

In [None]:
# (rows, columns) of the raw frame before any cleaning.
df_kickstarter.shape

(210835, 38)

In [None]:
# Columns kept for the MVP model. Leaky, high-cardinality and redundant columns are
# left out; is_starrable and disable_communication only applied to live Kickstarters.
_KEPT_COLUMNS = ['blurb', 'category', 'country', 'created_at', 'deadline', 'goal',
                 'launched_at', 'location', 'name', 'state', 'usd_type']

# Stand-in for a missing location. Its layout is deliberate: with the str.find based
# slicing below (a missing marker gives -1) it yields 'Not Listed' for the state and
# 'Not Listed' preceded by padding spaces for the location name.
# NOTE FOR FRONT END: this can be optional if converted.
_MISSING_LOCATION = '":"     Not Listed '


def _text_between(text, start_marker, end_marker):
    """Return ``text`` from just after the first ``start_marker`` up to the first ``end_marker``.

    Uses plain ``str.find`` semantics: an absent marker is treated as position -1,
    exactly like the inline slicing this helper replaces.
    """
    return text[text.find(start_marker) + len(start_marker):text.find(end_marker)]


def clean_df(df):
    """Reduce the raw Kickstarter frame to model columns and derive features.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw export. Must contain the columns listed in ``_KEPT_COLUMNS``;
        ``created_at``, ``launched_at`` and ``deadline`` are epoch seconds, and
        ``category`` / ``location`` are JSON-like strings. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``created_at`` (datetime), with columns, in this order:
        blurb, category, country, goal, location, name, state, usd_type, target,
        days_allotted, days_before_launch.

        * ``target`` is 1 when the campaign state was ``'successful'``, else 0.
        * ``state`` is overwritten with the state/region parsed from ``location``.
        * ``days_allotted`` is the launch-to-deadline span, partial days rounded up.
        * ``days_before_launch`` is the creation-to-launch span, rounded up, minus
          one, floored at 0.

    Notes
    -----
    Epoch values are read as UTC rather than the machine's local time zone, so the
    index and the day counts do not depend on where the notebook runs or on
    daylight-saving changes between the two timestamps.
    """
    # Drop live campaigns and unused columns. .copy() makes the assignments below
    # write to an independent frame instead of a slice of the caller's data.
    df = df.loc[df['state'] != 'live', _KEPT_COLUMNS].copy()

    # Binary label, taken before `state` is reused for the location's state below.
    df['target'] = (df['state'] == 'successful').astype(int)

    # Epoch seconds -> datetimes (UTC, timezone-naive).
    launched = pd.to_datetime(df['launched_at'], unit='s')
    created = pd.to_datetime(df['created_at'], unit='s')
    deadline = pd.to_datetime(df['deadline'], unit='s')

    # `.dt.days` is the floored day component, so negating an "earlier minus later"
    # delta rounds a partial day up. This matches the previous string-slicing
    # results while also handling a zero-length delta, which used to raise.
    df['days_allotted'] = -(launched - deadline).dt.days
    df['days_before_launch'] = (-(created - launched).dt.days - 1).clip(lower=0)

    # Creation time becomes the index; the raw date columns are no longer needed.
    df.index = pd.DatetimeIndex(created, name='created_at')
    df = df.drop(columns=['launched_at', 'created_at', 'deadline'])

    # Slim category and location down to their names and pull the state/region out
    # of the location string. Missing locations go through the same slicing via the
    # placeholder.
    location_raw = df['location'].fillna(_MISSING_LOCATION)
    df['category'] = [_text_between(cat, '":"', '","') for cat in df['category']]
    df['state'] = [_text_between(loc, '"state":"', '","type') for loc in location_raw]
    df['location'] = [_text_between(loc, '":"', '","') for loc in location_raw]

    return df

# Clean the raw export, then discard rows that are exact duplicates after cleaning.
df = clean_df(df_kickstarter).drop_duplicates()

# (rows, columns) of the cleaned, de-duplicated frame.
df.shape

(191018, 11)

In [None]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0_level_0,blurb,category,country,goal,location,name,state,usd_type,target,days_allotted,days_before_launch
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-07-26 17:49:28,"Adorable enamel pins, stickers, prints, key ch...",Digital Art,US,700.0,Dover-Foxcroft,Adorable Alien Enamel Pins (Canceled),ME,domestic,0,30,24
2015-08-18 21:01:19,With your help we will create this device that...,3D Printing,ES,15000.0,Mexico,Save water 100% liquid downloads in toilets at...,Yucatan,domestic,0,30,295
2015-08-11 18:04:21,We at Ormiston Primary are looking at starting...,Farms,NZ,5000.0,Flat Bush,Ormiston Primary Community Garden,Auckland Region,domestic,0,30,0
2015-04-28 21:14:03,Self-taught aspiring metalsmith Looking for he...,Mixed Media,US,10000.0,Jackson,"Aspiring metalsmith in need of better tools, a...",MS,domestic,0,30,0
2014-07-07 01:30:52,So many women believe they are past their prim...,People,US,2000.0,Austin,Beauty At Any Age,TX,domestic,0,30,80


In [None]:
# Remove every row that still contains a missing value.
df = df.dropna()

In [None]:
# (rows, columns) remaining after rows with missing values were dropped.
df.shape

(190665, 11)

In [None]:
# df_clean = df.sample(100000)

In [None]:
# Separate predictors from the label. `name` is left out of the feature set.
non_feature_columns = ['target', 'name']
X = df.drop(columns=non_feature_columns)
y = df['target']
X.head()

Unnamed: 0_level_0,blurb,category,country,goal,location,state,usd_type,days_allotted,days_before_launch
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020-07-26 17:49:28,"Adorable enamel pins, stickers, prints, key ch...",Digital Art,US,700.0,Dover-Foxcroft,ME,domestic,30,24
2015-08-18 21:01:19,With your help we will create this device that...,3D Printing,ES,15000.0,Mexico,Yucatan,domestic,30,295
2015-08-11 18:04:21,We at Ormiston Primary are looking at starting...,Farms,NZ,5000.0,Flat Bush,Auckland Region,domestic,30,0
2015-04-28 21:14:03,Self-taught aspiring metalsmith Looking for he...,Mixed Media,US,10000.0,Jackson,MS,domestic,30,0
2014-07-07 01:30:52,So many women believe they are past their prim...,People,US,2000.0,Austin,TX,domestic,30,80


In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the train/test feature frames and label vectors.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((152532, 9), (38133, 9), (152532,), (38133,))

In [None]:
# Column selectors, written as plain functions so FunctionTransformer can wrap them
# for use inside a FeatureUnion.
def get_numeric_data(df):
    """Return a copy of ``df`` with the ``'blurb'`` column removed."""
    return df.drop('blurb', axis=1)

def get_text_data(df):
    """Return only the ``'blurb'`` column of ``df``."""
    return df.loc[:, 'blurb']

# Wrap the selector functions so each can be the first step of a pipeline branch.
transfomer_numeric = FunctionTransformer(get_numeric_data)
transformer_text = FunctionTransformer(get_text_data)

# Stand-alone TF-IDF vectorizer; the text branch below constructs its own instance.
tfidf = TfidfVectorizer()

# Branch 1: whatever the non-text selector returns is one-hot encoded; categories
# not seen during fit are ignored at predict time.
# NOTE(review): the selector hands every non-blurb column to the encoder, so any
# numeric columns in X are one-hot encoded too. Confirm this is intended.
non_text_branch = Pipeline([
    ('selector', transfomer_numeric),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
])

# Branch 2: blurb text -> TF-IDF vector.
text_branch = Pipeline([
    ('selector', transformer_text),
    ('vec', TfidfVectorizer()),
])

# Concatenate both branches side by side, then classify with a ridge classifier.
combined_features = FeatureUnion([
    ('numeric_features', non_text_branch),
    ('text_features', text_branch),
])

pipe = Pipeline([
    ('features', combined_features),
    ('clf', RidgeClassifier()),
])

ModuleNotFoundError: No module named '__main__.my_functions'; '__main__' is not a package

In [None]:
# Regularisation strengths to try for the 'clf' step of the pipeline.
params = dict(clf__alpha=[0.1, 0.3, 1, 3, 10])

# Grid search over `params` with 10-fold cross-validation, run in parallel (n_jobs=-2).
gs = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=10,
    verbose=1,
    n_jobs=-2,
)

In [None]:
# Run the grid search on the training split to pick the best parameters.
# The trailing ';' suppresses the notebook's display of the returned object.
gs.fit(X_train, y_train);

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 11 concurrent workers.
[Parallel(n_jobs=-2)]: Done  28 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-2)]: Done  50 out of  50 | elapsed:  2.3min finished


In [None]:
# Cross-validated score of the best parameter setting found by the search.
gs.best_score_

0.8204835828292609

In [None]:
# Parameter setting that produced the best cross-validated score.
gs.best_params_

{'clf__alpha': 10}

In [None]:
# Keep a handle on the pipeline selected by the grid search.
best_model = gs.best_estimator_

In [None]:
# Accuracy on the data the model was fitted on.
train_accuracy = best_model.score(X_train, y_train)
print('Training Accuracy:', train_accuracy)

# Accuracy on the held-out split (X_test / y_test), reported under the "Validation" label.
holdout_accuracy = best_model.score(X_test, y_test)
print('Validation Accuracy:', holdout_accuracy)

Training Accuracy: 0.8580035664647419
Validation Accuracy: 0.8181889701833058


In [None]:
import pickle

# save the model to disk
filename = 'ks_model.sav'
# The context manager closes (and flushes) the file even if pickling fails;
# the previous bare open() left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(gs.best_estimator_, model_file)

In [None]:
import pickle
# load the model from disk
# NOTE: unpickling can execute arbitrary code, so only load files this notebook
# (or another trusted source) produced.
# The context manager closes the file once loading finishes or fails.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
sample_text = ['Are you tired of things breaking things, then having to throw them away. Well we invented glue!']

# Fits the stand-alone vectorizer on the sample. The prediction below does not use
# this result: it passes the raw blurb to the loaded pipeline.
sample_vect = tfidf.fit_transform(sample_text)

# One hand-built campaign, using the same column names the model was trained on.
sample_campaign = {
    'blurb': sample_text,
    'category': 'Graphic Design',
    'country': 'US',
    'goal': 100.0,
    'location': 'San Diego',
    'state': 'CA',
    'usd_type': 'domestic',
    'days_allotted': 0,
    'days_before_launch': 0,
}

y_hat = loaded_model.predict(pd.DataFrame(sample_campaign))

In [None]:
# Predicted class for the sample campaign.
y_hat

array([0])

In [None]:
from joblib import dump, load
# Persist the selected pipeline with joblib; the call's return value is displayed.
dump(best_model, 'ks_model.joblib')

['ks_model.joblib']

In [None]:
from joblib import dump, load
# Reload the pipeline from the joblib file; this rebinds `loaded_model`.
# NOTE(review): joblib files are pickle-based, so load only trusted files.
loaded_model = load('ks_model.joblib')

In [None]:
# Same sample blurb as before, this time under the 'Product Design' category.
product_design_campaign = {
    'blurb': sample_text,
    'category': 'Product Design',
    'country': 'US',
    'goal': 100.0,
    'location': 'San Diego',
    'state': 'CA',
    'usd_type': 'domestic',
    'days_allotted': 0,
    'days_before_launch': 0,
}

y_hat = loaded_model.predict(pd.DataFrame(product_design_campaign))

In [None]:
# Predicted class for the 'Product Design' variant of the sample campaign.
y_hat

array([1])

In [None]:
import dill

# Serialise both FunctionTransformer selectors with dill, one file per transformer.
# The files carry a .joblib extension but are written by dill.
for target_path, transformer in (
    ('numeric_f.joblib', transfomer_numeric),
    ('text_f.joblib', transformer_text),
):
    with open(target_path, 'wb') as handle:
        dill.dump(transformer, handle)