In [None]:
!pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import json
import re
import matplotlib.pyplot as plt
import spacy

from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline

from tensorflow.keras import Model, Sequential
from tensorflow.keras.optimizers import Adam, Nadam, SGD
from tensorflow.keras.layers import Dense, Embedding, Input, Bidirectional, LSTM, concatenate
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from xgboost import XGBClassifier

In [None]:
# Load first file
df = pd.read_csv('Kickstarter.csv')

In [None]:
# Read the 8 numbered files (Kickstarter001.csv ... Kickstarter008.csv) and
# append them to the first frame with a single concat. Concatenating inside
# the loop re-copies the accumulated frame on every iteration.
extra_frames = [pd.read_csv(f'Kickstarter00{i}.csv') for i in range(1, 9)]
df = pd.concat([df] + extra_frames)

In [None]:
def clean_data(df):
  """
  Reduce the raw Kickstarter export to the columns used for modelling.

  The input frame is left untouched: every step below produces a new frame,
  so the cell that calls this function can be re-run safely.

  Parameters
  ----------
  df: pandas.DataFrame
      Raw campaign data. Must contain an 'id' column plus every raw column
      that is dropped or read below (a missing column raises KeyError).

  Returns
  -------
  pandas.DataFrame
      New frame indexed by 'id' with the columns
      ['blurb', 'cat', 'word_count', 'campaign_length', 'goal', 'staff_pick', 'state'],
      restricted to rows whose state is 'successful' or 'failed'.
  """

  # Removing duplicate entries then set 'id' as index (non-mutating)
  df = df.drop_duplicates(subset='id').set_index('id')

  # Drop columns with 99% null values
  df = df.drop(columns=['friends', 'is_backing', 'is_starred', 'permissions'])

  # Drop rows where state is not 'successful' or 'failed'.  We are looking at binary outcomes.
  # .copy() makes the filtered rows an independent frame so the column
  # assignments below do not write into a view of the caller's data.
  df = df[df['state'].isin(['successful', 'failed'])].copy()

  # Dropping high cardinality, redundant, and uninteresting columns
  df = df.drop(columns=['country_displayable_name', 'creator', 'currency_symbol', 'name', 'photo', 'profile', 'source_url', 'urls', 'usd_type'])

  # Dropping columns with only 1 unique value
  df = df.drop(columns=['disable_communication', 'is_starrable'])

  # Dropping leaky columns and currency exchange columns
  df = df.drop(columns=['converted_pledged_amount', 'currency', 'currency_trailing_code', 'current_currency', 'fx_rate', 'pledged', 'static_usd_rate', 'usd_exchange_rate', 'usd_pledged'])

  # Creating 'campaign_length' feature (deadline minus launch time)
  df['campaign_length'] = df['deadline'] - df['launched_at']

  # Dropping columns which can't be tinkered by user
  df = df.drop(columns=['country', 'created_at', 'deadline', 'launched_at', 'state_changed_at', 'spotlight', 'location', 'slug', 'backers_count'])

  # 'category' holds a JSON object per row; keep only its 'name' field
  df['cat'] = [json.loads(entry)['name'] for entry in df['category']]
  df = df.drop(columns='category')

  # A missing blurb is treated as empty text so that it gets a word count of 0
  # instead of raising on a non-string value
  df['blurb'] = df['blurb'].fillna('')

  # Create 'word_count' feature (whitespace-separated words in the blurb)
  df['word_count'] = [len(description.split()) for description in df['blurb']]

  # Make 'staff_pick' column integers
  df['staff_pick'] = df['staff_pick'].astype('int64')

  # Re-order columns
  return df[['blurb', 'cat', 'word_count', 'campaign_length', 'goal', 'staff_pick', 'state']]

In [None]:
df = clean_data(df)

In [None]:
df.head(2)

Unnamed: 0_level_0,blurb,cat,word_count,campaign_length,goal,staff_pick,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1837982762,Create 200 frames as animated asymmetric tesse...,Conceptual Art,17,2592000,2000.0,0,failed
1820905478,SixNip will be recording its debut album start...,Rock,15,3024000,420.0,0,successful


In [None]:
# Give every distinct category label an integer code, numbered in order of
# first appearance in df['cat'].
cat_dict = {label: code for code, label in enumerate(df['cat'].unique())}

In [None]:
df['cat'] = df['cat'].map(cat_dict)

In [None]:
df.head(2)

Unnamed: 0_level_0,blurb,cat,word_count,campaign_length,goal,staff_pick,state
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1837982762,Create 200 frames as animated asymmetric tesse...,0,17,2592000,2000.0,0,failed
1820905478,SixNip will be recording its debut album start...,1,15,3024000,420.0,0,successful


In [None]:
# Pull out target variable
y = df['state']

In [None]:
# Convert target variable to numeric labels
y = y.map({'successful': 1, 'failed': 0})

In [None]:
# Creating Feature Matrix by dropping target variable
X = df.drop(columns='state')

In [None]:
def clean_text(text):
    """
    Accepts a single text document and performs several regex substitutions in order to clean the document.

    Steps, in order: every non-letter character becomes a space, single-letter
    words that have whitespace on both sides are removed, runs of spaces are
    collapsed, and the result is lower-cased and stripped.

    Parameters
    ----------
    text: string or object
        The document to clean. None and float NaN (how pandas represents a
        missing cell) are treated as an empty document.

    Returns
    -------
    text: string
        The cleaned, lower-cased text ('' for a missing document).
    """

    # Missing values are not strings and would make re.sub raise a TypeError
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # order of operations - apply the expression from top to bottom
    non_alpha = r'[^a-zA-Z]'
    # Lookarounds leave the surrounding whitespace unconsumed, so a run of
    # consecutive single-letter words ("a b c") is removed in one pass; a
    # pattern that consumes both neighbouring spaces skips every other word.
    single_letter_words = r'(?<=\s)[a-zA-Z](?=\s)'
    multi_white_spaces = r'[ ]{2,}'

    text = re.sub(non_alpha, ' ', text)
    text = re.sub(single_letter_words, '', text)
    text = re.sub(multi_white_spaces, ' ', text)

    # apply case normalization
    return text.lower().strip()

def tokenize(document):
    """
    Run `document` through the module-level `nlp` pipeline and return its
    lemmas joined by single spaces.

    Stop words, punctuation tokens and tokens whose length is 2 or less are
    left out; each remaining lemma is stripped of surrounding whitespace.
    """
    lemmas = []
    for token in nlp(document):
        # Skip anything that should not contribute a lemma
        if token.is_stop or token.is_punct or len(token) <= 2:
            continue
        lemmas.append(token.lemma_.strip())

    return ' '.join(lemmas)

In [None]:
X_clean = [clean_text(text) for text in X['blurb']]

In [None]:
nlp = spacy.load("en_core_web_md")

In [None]:
X_token = [tokenize(text) for text in X_clean]

In [None]:
def get_word_vectors(docs):
    """
    Return one word vector per whitespace-separated word of a single document.

    Parameters
    ----------
    docs: str
        Despite the plural name, this is ONE document given as a string; it is
        split on whitespace and each word is vectorized on its own.

    Returns
    -------
    list
        One vector per word, in document order (empty list for an empty string).
    """
    # nlp.make_doc only tokenizes; it skips the remaining pipeline components,
    # which this function would otherwise run once for every single word.
    # Assumes the loaded pipeline ships static word vectors (the notebook loads
    # en_core_web_md), in which case Doc.vector is the average of the token
    # vectors and does not depend on those components - TODO confirm if the
    # model is ever swapped for one without vectors.
    return [nlp.make_doc(word).vector for word in docs.split()]

In [None]:
X_vect = []
for i, text in enumerate(X_token):
  X_vect.append(get_word_vectors(text))
  if i % 100 == 0:
    print(i)

In [None]:
# X_vect holds one list of word vectors per blurb. Presumably the blurbs
# differ in word count, so the nesting is ragged: np.array() on such input
# warns on older NumPy and raises on newer releases. Fill a 1-D object array
# explicitly instead; each element stays the per-blurb list.
_ragged_docs = np.empty(len(X_vect), dtype=object)
for _idx, _word_vectors in enumerate(X_vect):
    _ragged_docs[_idx] = _word_vectors
X_vect = _ragged_docs

  """Entry point for launching an IPython kernel.


In [None]:
X_vect

In [None]:
X_vect_np = []
for arr in X_vect:
    X_vect_np.append(np.array(arr))

In [None]:
# X_vect_np is a list of per-blurb arrays, presumably with differing numbers
# of rows (one row per word), i.e. ragged. np.array() on ragged input warns on
# older NumPy and raises on newer releases, so build the 1-D object array
# explicitly; each element stays the per-blurb array.
_ragged_arrays = np.empty(len(X_vect_np), dtype=object)
for _idx, _doc_array in enumerate(X_vect_np):
    _ragged_arrays[_idx] = _doc_array
X_vect_np = _ragged_arrays

  """Entry point for launching an IPython kernel.


In [None]:
X_vect_np[0].shape

(12, 300)

In [None]:
# Reduce the text features with PCA.
# NOTE(review): this import sits mid-notebook; the other imports are grouped
# in the first code cell.
from sklearn.decomposition import PCA

# NOTE(review): n_components is 5 here, but the shape recorded for X_vect_5D
# below has 3 columns and the network's text input is declared with 3 steps -
# presumably the saved outputs come from a run with n_components=3; confirm.
pca = PCA(n_components=5)
# FIXME: X_vect_test is not defined in any cell of this notebook as shown, and
# the recorded output for this cell is a ValueError. PCA needs one fixed-length
# row per document; how the per-word vectors were meant to be collapsed into
# such a row cannot be determined from here.
X_vect_5D = pca.fit_transform(X_vect_test)

ValueError: ignored

In [None]:
X_vect_5D.shape

(28921, 3)

In [None]:
X_vect_5D_test = []
for arr in X_vect_5D:
    X_vect_5D_test.append(np.expand_dims(arr, axis=1))

In [None]:
X_vect_5D_test = np.array(X_vect_5D_test)

In [None]:
X_vect_5D_test.shape

(28921, 3, 1)

In [None]:
X_meta = df.drop(columns=['blurb', 'state'])

In [None]:
X_meta.shape

(28921, 5)

In [None]:
X_meta = np.array(X_meta)

In [None]:
X_meta_test = []
for arr in X_meta:
    X_meta_test.append(np.expand_dims(arr, axis=1))

In [None]:
X_meta_test = np.array(X_meta_test)

In [None]:
X_meta_test.shape

(28921, 5, 1)

In [None]:
# Stop training once validation accuracy has gone 8 epochs without improving,
# and restore the weights of the best epoch.
early_stop = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0,
    patience=8,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)


# Two-input network: the reduced text features and the tabular metadata, each
# fed as a sequence with a trailing feature axis of size 1.
nlp_input = Input(shape=(3, 1))
meta_input = Input(shape=(5, 1))

# Text branch: bidirectional LSTM that returns the full sequence.
forward_layer = LSTM(128, return_sequences=True)
backward_layer = LSTM(128, activation='relu', return_sequences=True, go_backwards=True)

nlp_out = Bidirectional(forward_layer, backward_layer=backward_layer)(nlp_input)
# NOTE(review): meta_input is (5, 1), so this Dense acts on the last axis of
# size 1 (each metadata value is projected separately); input_dim=5 presumably
# has no effect when the layer is called on a tensor - confirm the intent.
meta_out = Dense(256, input_dim=5, activation='relu')(meta_input)

# Stack both branches along the step axis, then shrink through a Dense tower.
concat = concatenate([nlp_out, meta_out], axis=1)
classifier = Dense(128, activation='relu')(concat)
classifier = Dense(64, activation='relu')(classifier)
classifier = Dense(32, activation='relu')(classifier)
classifier = Dense(16, activation='relu')(classifier)
classifier = Dense(8, activation='relu')(classifier)

# NOTE(review): nothing above removes the step axis, so this presumably emits
# one probability per step rather than one per sample - verify against y.
output = Dense(1, activation='sigmoid')(classifier)

model = Model(inputs=[nlp_input, meta_input], outputs=output)

model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.fit(
    # FIXME: X_test_scaled is only created in a later cell of this notebook, so
    # this cell fails on a fresh kernel with Restart & Run All.
    x=[X_vect_5D_test, X_test_scaled],
    y=y,
    validation_split=0.2,
    shuffle=True,
    batch_size=8,
    epochs=50,
    # NOTE(review): hand-picked class weights - presumably meant to offset
    # class imbalance; confirm they match the label distribution.
    class_weight={0: 0.34, 1: 0.66},
    # `callbacks` is documented as a list of callbacks. `workers` was dropped:
    # it only applies to generator / Sequence input, not to in-memory arrays.
    callbacks=[early_stop],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50


<keras.callbacks.History at 0x7ff384aaead0>

In [None]:
# Stop training once validation accuracy has gone 5 epochs without improving,
# and restore the weights of the best epoch.
early_stop = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0,
    patience=5,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)

# Metadata-only baseline: a Dense tower over the 5 tabular features.
model2 = Sequential()

model2.add(Dense(128, input_dim=5, activation='relu'))
model2.add(Dense(64, activation='relu'))
model2.add(Dense(32, activation='relu'))
model2.add(Dense(16, activation='relu'))
model2.add(Dense(8, activation='relu'))
model2.add(Dense(1, activation='sigmoid'))

model2.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

model2.fit(
    # FIXME: X_test_2 is not defined in any cell of this notebook as shown;
    # presumably a 2-D (n_samples, 5) metadata array was intended - confirm
    # which one before running.
    x=X_test_2,
    y=y,
    validation_split=0.2,
    shuffle=True,
    batch_size=8,
    epochs=50,
    class_weight={0: 0.34, 1: 0.66},
    # `callbacks` is documented as a list of callbacks. `workers` was dropped:
    # it only applies to generator / Sequence input, not to in-memory arrays.
    callbacks=[early_stop],
)

In [None]:
X_test_ = X.drop(columns='blurb')

In [None]:
X_test_.head(2)

Unnamed: 0_level_0,cat,word_count,campaign_length,goal,staff_pick
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1837982762,0,17,2592000,2000.0,0
1820905478,1,15,3024000,420.0,0


In [None]:
# Min-max scale 'campaign_length' to the [0, 1] range
X_test_['campaign_length'] = X_test_['campaign_length'].pipe(
    lambda col: (col - col.min()) / (col.max() - col.min())
)

In [None]:
# Min-max scale the integer category codes to the [0, 1] range
X_test_['cat'] = X_test_['cat'].pipe(
    lambda col: (col - col.min()) / (col.max() - col.min())
)

In [None]:
# Min-max scale 'word_count' to the [0, 1] range
X_test_['word_count'] = X_test_['word_count'].pipe(
    lambda col: (col - col.min()) / (col.max() - col.min())
)

In [None]:
# Min-max scale 'goal' to the [0, 1] range
X_test_['goal'] = X_test_['goal'].pipe(
    lambda col: (col - col.min()) / (col.max() - col.min())
)

In [None]:
X_test_ = np.array(X_test_)

In [None]:
X_test_.shape

(28921, 5)

In [None]:
X_test_scaled = []
for arr in X_test_:
    X_test_scaled.append(np.expand_dims(arr, axis=1))

In [None]:
X_test_scaled = np.array(X_test_scaled)

In [None]:
# Stop training once validation accuracy has gone 8 epochs without improving,
# and restore the weights of the best epoch.
early_stop = EarlyStopping(
    monitor="val_accuracy",
    min_delta=0,
    patience=8,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True,
)

# Smaller metadata-only network over the 5 tabular features.
model3 = Sequential()

model3.add(Dense(64, activation='relu', input_dim=5))
model3.add(Dense(32, activation='relu'))
model3.add(Dense(16, activation='relu'))
model3.add(Dense(8, activation='relu'))
model3.add(Dense(1, activation='sigmoid'))

model3.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

model3.fit(
    # NOTE(review): X_meta is the metadata array built from df without any
    # scaling, while scaled copies are prepared in the cells above - confirm
    # which input was intended. The recorded output for this cell is a
    # NameError, so it was presumably run before one of its inputs existed.
    x=X_meta,
    y=y,
    validation_split=0.2,
    shuffle=True,
    batch_size=16,
    epochs=50,
    class_weight={0: 0.34, 1: 0.66},
    # `callbacks` is documented as a list of callbacks. `workers` was dropped:
    # it only applies to generator / Sequence input, not to in-memory arrays.
    callbacks=[early_stop],
)

NameError: ignored

In [None]:
def create_model(units=128, activation= "relu", lr=0.001, opt=Adam, input_dim=5):
  """
  Build and compile a funnel-shaped binary classifier.

  The network has four hidden Dense layers of width units, units/2, units/4
  and units/8 (integer division, never below 1), followed by a single
  sigmoid output unit.

  Parameters
  ----------
  units: int
      Width of the first hidden layer.
  activation: str
      Activation used by every hidden layer.
  lr: float
      Learning rate handed to the optimizer.
  opt: callable
      Optimizer class; it is called as opt(learning_rate=lr).
  input_dim: int
      Number of input features (defaults to the 5 metadata columns).

  Returns
  -------
  The compiled Sequential model (binary cross-entropy loss, accuracy metric).
  """
  units = int(units)

  model = Sequential()

  model.add(Dense(units=units, input_dim=input_dim, activation=activation))
  # Layer widths must be whole numbers: true division (units/2) produces
  # floats, so use floor division and keep at least one unit per layer.
  for divisor in (2, 4, 8):
    model.add(Dense(units=max(1, units // divisor), activation=activation))
  model.add(Dense(units=1, activation='sigmoid'))

  model.compile(loss="binary_crossentropy",
                  optimizer=opt(learning_rate=lr),
                  metrics=["accuracy"])

  return model

In [None]:
# Wrap the Keras builder so that it exposes the scikit-learn estimator API.
model = KerasClassifier(build_fn=create_model)

# Standardize the features, then fit the wrapped network. The step names
# ('scaler', 'model') are the prefixes used by the search-space keys.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', model),
])

In [None]:
# Search space for the Keras pipeline; every key is routed to the 'model' step
# through the 'model__' prefix.
params = dict(
    model__units=[256, 128, 64, 32],
    model__batch_size=[256, 128, 64, 32],
    model__epochs=[40, 50, 60, 70, 80, 90, 100],
    model__activation=['relu', 'elu', 'selu', 'sigmoid'],
    model__lr=[0.0001, 0.001, 0.01],
)

In [None]:
# Fix the seed so the split (and every score computed from it) is reproducible,
# and stratify on y so both partitions keep the same class ratio.
# NOTE(review): X_test_ was min-max scaled on the full dataset in the cells
# above, i.e. before this split, so the held-out rows influenced the scaling.
train_X, test_X, train_y, test_y = train_test_split(
    X_test_, y, random_state=42, stratify=y
)

In [None]:
# Randomized search over the Keras pipeline: 10 sampled configurations, each
# scored with 5-fold cross-validation on the training partition.
rs = RandomizedSearchCV(
    pipe,
    params,
    n_iter=10,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

rs.fit(train_X, train_y)

In [None]:
rs.best_score_

0.7960811376571655

In [None]:
rs.best_params_

{'model__activation': 'relu',
 'model__batch_size': 32,
 'model__epochs': 100,
 'model__lr': 0.001,
 'model__units': 64}

In [None]:
rs.score(test_X, test_y)



0.7844005227088928

In [None]:
# Gradient-boosted tree baseline, behind the same standardization step as the
# Keras pipeline. The step names ('scaler', 'xgb') are the prefixes used by
# the search-space keys.
xgb_estimator = XGBClassifier(random_state=42, n_jobs=-1, verbosity=1)

pipe2 = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('xgb', xgb_estimator),
])

In [None]:
# Search space for the XGBoost pipeline; every key is routed to the 'xgb' step
# through the 'xgb__' prefix.
params2 = {
    'xgb__n_estimators': list(range(30, 75, 10)),
    # 'eta' is XGBoost's alias for learning_rate, so searching both at once
    # tunes the same knob twice with conflicting values; only learning_rate
    # is kept.
    'xgb__learning_rate': [0.1, 0.01, 0.001, 0.0001],
    'xgb__max_depth': list(range(20, 39, 1)),
    'xgb__min_child_weight': list(range(4, 9, 1)),
    # subsample and colsample_bytree are sampling FRACTIONS with a documented
    # range of (0, 1]. The integer grid 0, 1, 2 left 1 as the only usable
    # value, so most sampled candidates were wasted.
    'xgb__subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
    'xgb__colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
}

In [None]:
# Randomized search over the XGBoost pipeline: 100 sampled configurations,
# each scored with 5-fold cross-validation on the training partition.
rs_xgb = RandomizedSearchCV(
    pipe2,
    params2,
    n_iter=100,
    cv=5,
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

rs_xgb.fit(train_X, train_y)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 306 tasks      | elapsed:   53.8s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:  2.0min finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('scaler',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('xgb',
                                              XGBClassifier(base_score=0.5,
                                                            booster='gbtree',
                                                            colsample_bylevel=1,
                                                            colsample_bynode=1,
                                                            colsample_bytree=1,
                                                            gamma=0,
                                                            learning_rate=0.1,
                            

In [None]:
rs_xgb.best_score_

0.8203780544029506

In [None]:
rs_xgb.best_params_

{'xgb__colsample_bytree': 1,
 'xgb__eta': 0.2,
 'xgb__learning_rate': 0.1,
 'xgb__max_depth': 22,
 'xgb__min_child_weight': 6,
 'xgb__n_estimators': 70,
 'xgb__subsample': 1}

In [None]:
rs_xgb.score(test_X, test_y)

0.8312819803623288