In [None]:
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline, Pipeline
# NOTE: `plot_confusion_matrix` was removed in scikit-learn 1.2; drop it from this import on 1.2+.
from sklearn.metrics import ConfusionMatrixDisplay, plot_confusion_matrix, classification_report, accuracy_score, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# Load the first Kickstarter CSV export; it seeds `df`, which later cells extend and clean
df = pd.read_csv('Kickstarter.csv')

In [None]:
# Append the 8 numbered export files (Kickstarter001.csv ... Kickstarter008.csv) to the
# frame loaded from the first file.
# All files are read first and concatenated in a single call: calling pd.concat inside
# the loop re-copies the ever-growing frame on every iteration.
df = pd.concat([df, *(pd.read_csv(f'Kickstarter00{n}.csv') for n in range(1, 9))])

In [None]:
def clean_data(df):
    """Turn the raw Kickstarter export into a modelling-ready frame.

    The input frame is never modified: every step produces a new object, so the
    function can safely be called again on the same raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw (concatenated) Kickstarter export. It must contain an 'id' column
        plus every column that is dropped or used below; a missing column
        raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'id', restricted to campaigns whose state is 'successful'
        or 'failed', with exactly these columns, in this order:
        ['blurb', 'cat', 'word_count', 'campaign_length', 'goal',
        'staff_pick', 'state'].
    """
    # Remove duplicate entries, then use 'id' as the index (non in-place, so the
    # caller's frame keeps its 'id' column and all of its rows)
    df = df.drop_duplicates(subset='id').set_index('id')

    # Drop columns with 99% null values
    df = df.drop(columns=['friends', 'is_backing', 'is_starred', 'permissions'])

    # Keep only rows whose state is 'successful' or 'failed': we model a binary outcome
    df = df[df['state'].isin(['successful', 'failed'])]

    # Drop high-cardinality, redundant, and uninteresting columns
    df = df.drop(columns=['country_displayable_name', 'creator', 'currency_symbol', 'name',
                          'photo', 'profile', 'source_url', 'urls', 'usd_type'])

    # Drop columns with only 1 unique value
    df = df.drop(columns=['disable_communication', 'is_starrable'])

    # Drop leaky columns and currency exchange columns
    df = df.drop(columns=['converted_pledged_amount', 'currency', 'currency_trailing_code',
                          'current_currency', 'fx_rate', 'pledged', 'static_usd_rate',
                          'usd_exchange_rate', 'usd_pledged'])

    df = df.assign(
        # Campaign duration, in whatever unit 'deadline' / 'launched_at' use
        # (presumably Unix seconds -- TODO confirm)
        campaign_length=df['deadline'] - df['launched_at'],
        # 'category' holds a JSON object per row; keep only its 'name' field
        cat=df['category'].map(lambda raw: json.loads(raw)['name']),
        # Number of whitespace-separated words in the blurb; a missing blurb
        # (NaN / None) counts as 0 words instead of raising AttributeError
        word_count=df['blurb'].map(
            lambda text: len(text.split()) if isinstance(text, str) else 0
        ),
        # Make 'staff_pick' integers
        staff_pick=df['staff_pick'].astype('int64'),
    )

    # Drop columns which can't be tinkered with by the user, plus the raw 'category' JSON
    df = df.drop(columns=['country', 'created_at', 'deadline', 'launched_at',
                          'state_changed_at', 'spotlight', 'location', 'slug',
                          'backers_count', 'category'])

    # Re-order columns
    return df[['blurb', 'cat', 'word_count', 'campaign_length', 'goal', 'staff_pick', 'state']]

In [None]:
# Clean the concatenated frame; `df` is rebound to the frame returned by clean_data
df = clean_data(df)

In [None]:
# Label-encode 'cat': every distinct category gets an integer code,
# numbered in the order the categories first appear in the column
cat_dict = {name: code for code, name in enumerate(df['cat'].unique())}

df['cat'] = df['cat'].map(cat_dict)

In [None]:
# Pull out the target variable: the campaign outcome stored in the 'state' column
y = df['state']

In [None]:
# Establish the baseline accuracy: the share of the majority class, i.e. the score of a
# model that always predicts it (the original run reported 66.5% 'successful').
# value_counts() sorts by frequency in descending order, so the first entry is the
# majority class. Use .iloc[0] for positional access: `[0]` on a string-labelled Series
# relies on the deprecated integer-key positional fallback.
baseline_accuracy = y.value_counts(normalize=True).iloc[0]
print('Baseline Accuracy:', baseline_accuracy)

In [None]:
# Convert target variable to numeric labels (1 = successful, 0 = failed).
# Map from df['state'] rather than from `y` itself so the cell is idempotent:
# re-running `y = y.map(...)` on already-numeric labels would turn every value into NaN.
y = df['state'].map({'successful': 1, 'failed': 0})

In [None]:
# Feature matrix: every column of the cleaned frame except the target
X = df.drop(columns=['state'])

In [None]:
### BUILD RF MODEL

# The free-text blurb is not used as a feature for the RF model
X_rf = X.drop(columns='blurb')

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_rf,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest configured with hyperparameters that were tuned beforehand
# NOTE(review): class_weight gives the larger weight (0.665) to class 1; the two values
# look like class frequencies rather than inverse frequencies -- confirm this is intended.
model = RandomForestClassifier(
    n_estimators=140,
    max_depth=20,
    max_features=5,
    min_samples_split=7,
    min_samples_leaf=5,
    class_weight={0: 0.335, 1: 0.665},
    random_state=42,
)

In [None]:
# Fit the random forest on the training split
model.fit(X_train, y_train)

In [None]:
# Plot the confusion matrix for the held-out test set.
# `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2, so the
# display is built from the computed confusion matrix instead, which works on both
# older and current releases.
ConfusionMatrixDisplay(
    confusion_matrix=confusion_matrix(y_test, model.predict(X_test)),
    display_labels=['failure', 'success'],
).plot(values_format='.0f')

In [None]:
# Per-class precision / recall / F1 on the test set
print(
    classification_report(
        y_true=y_test,
        y_pred=model.predict(X_test),
        target_names=['failure', 'success'],
    )
)

In [None]:
# Horizontal bar chart of the fitted forest's feature_importances_,
# sorted ascending and limited to the ten largest values
features = X_train.columns
importances = model.feature_importances_
(
    pd.Series(importances, index=features)
    .sort_values()
    .tail(10)
    .plot.barh()
)