# Kickstarter ML Project

## Preparation

In [1]:
import pandas as pd
import numpy as np
import glob

import sweetviz as sv
import json


RSEED = 42

### Load Data

In [2]:
# Load csv's and merge to a single dataframe

#path = "/Users/bur.oez/neuefische/Kickstarter-ML-Project/data/*.csv" # Burak's path
path = "data/*.csv" # Christian's path
#path = ".../*.csv"" # Matthias's path

all_files = glob.glob(path)
df_raw = pd.concat((pd.read_csv(f) for f in all_files))

### Overview

In [3]:
## Execute Sweetviz 
#my_report = sv.analyze(df_raw)
#my_report.show_html() # Default arguments will generate to "SWEETVIZ_REPORT.html"

In [4]:
df_raw.columns

Index(['backers_count', 'blurb', 'category', 'converted_pledged_amount',
       'country', 'created_at', 'creator', 'currency', 'currency_symbol',
       'currency_trailing_code', 'current_currency', 'deadline',
       'disable_communication', 'friends', 'fx_rate', 'goal', 'id',
       'is_backing', 'is_starrable', 'is_starred', 'launched_at', 'location',
       'name', 'permissions', 'photo', 'pledged', 'profile', 'slug',
       'source_url', 'spotlight', 'staff_pick', 'state', 'state_changed_at',
       'static_usd_rate', 'urls', 'usd_pledged', 'usd_type'],
      dtype='object')

## Data Cleaning

For safety, we operate on a copy of the data frame.

In [5]:
df = df_raw.copy()

For a basic feature analysis, we may focus on easily accessible features. Consequently, we drop the cols
- `blurb`, `creator`, `slug`, `name` and `photo`, because they might only be exploitable through a semantic or context analysis,
- `currency_symbol`, `currency_trailing_code`, because they are redundant,
r a basic feature analysis, we may focus on easily accessible features. Consequently, we drop the cols
- `blurb`, `creator`, `slug`, `name` and `photo`, because they might only be exploitable through a semantic or context analysis,
- `currency_symbol`, `currency_trailing_code`, because they are redundant,
- `friends`, ìs_starrable` and `permission`, because they do not contain any information,
- `disable_communication`, `is_backing` and `is_starred`, because they have an entry only for the same 300 data points and it is questionable whether the missing entries may be treated as one category; later on we might try this with one of them and drop the other two,  
- `urls`, `source_url` and `profile` because they do not contain additional information.
- `spotlight` and `staff_pick` will be dropped, because the creator has no influence on Kickstarter Staff picks
- `created_at`, `launched_at` and `state_changed_at` will be kept to calculate time-deltas for potential new features.

In [6]:
# dropping columns
df = df.drop(["blurb", "creator", "slug", "name", "photo", "currency_symbol", "currency_trailing_code", "friends", "permissions", "disable_communication", "is_backing", "is_starred","urls", "source_url", "profile", "usd_type", "spotlight", "staff_pick","is_starrable"], axis = 1)
df.head()

Unnamed: 0,backers_count,category,converted_pledged_amount,country,created_at,currency,current_currency,deadline,fx_rate,goal,id,launched_at,location,pledged,state,state_changed_at,static_usd_rate,usd_pledged
0,315,"{""id"":266,""name"":""Footwear"",""slug"":""fashion/fo...",28645,US,1541459205,USD,USD,1552539775,1.0,28000.0,2108505034,1548223375,"{""id"":2462429,""name"":""Novato"",""slug"":""novato-c...",28645.0,live,1548223375,1.0,28645.0
1,47,"{""id"":273,""name"":""Playing Cards"",""slug"":""games...",1950,US,1501684093,USD,USD,1504976459,1.0,1000.0,928751314,1502384459,"{""id"":2400549,""name"":""Euless"",""slug"":""euless-t...",1950.0,successful,1504976459,1.0,1950.0
2,271,"{""id"":43,""name"":""Rock"",""slug"":""music/rock"",""po...",22404,US,1348987533,USD,USD,1371013395,1.0,15000.0,928014092,1368421395,"{""id"":2423474,""name"":""Hollywood"",""slug"":""holly...",22404.0,successful,1371013395,1.0,22404.0
3,3,"{""id"":273,""name"":""Playing Cards"",""slug"":""games...",165,GB,1483780271,GBP,USD,1489425776,1.308394,10000.0,596091328,1484245376,"{""id"":475457,""name"":""Kaunas"",""slug"":""kaunas-ka...",136.0,failed,1489425776,1.216066,165.384934
4,3,"{""id"":48,""name"":""Nonfiction"",""slug"":""publishin...",2820,US,1354817071,USD,USD,1357763527,1.0,2800.0,998516049,1355171527,"{""id"":2507703,""name"":""Traverse City"",""slug"":""t...",2820.0,successful,1357763527,1.0,2820.0


Extract field and subfield ids from `category` (id and parent_id):

In [7]:
import ast
df.category =[ ast.literal_eval(x) for x in df.iloc[:,df.columns.tolist().index("category")]]

In [8]:
# checking which rows do not contain "parent_id" inside category dictionary
cat_list_parent_id = np.array(["parent_id" not in df.category.iloc[x].keys() for x in range(0,len(df.category))])
cat_list_parent_id.sum()

9041

In [9]:
df[cat_list_parent_id].iloc[0:3]

Unnamed: 0,backers_count,category,converted_pledged_amount,country,created_at,currency,current_currency,deadline,fx_rate,goal,id,launched_at,location,pledged,state,state_changed_at,static_usd_rate,usd_pledged
7,33,"{'id': 14, 'name': 'Music', 'slug': 'music', '...",660,US,1546965483,USD,USD,1550067305,1.0,400.0,1481360049,1547475305,"{""id"":2430903,""name"":""Kaysville"",""slug"":""kaysv...",660.0,successful,1550067307,1.0,660.0
25,74,"{'id': 15, 'name': 'Photography', 'slug': 'pho...",2808,GB,1542202969,GBP,USD,1544200381,1.308394,2000.0,865110410,1542385981,"{""id"":12056,""name"":""Bath"",""slug"":""bath-gb"",""sh...",2199.0,successful,1544200381,1.301163,2861.258251
27,41,"{'id': 9, 'name': 'Fashion', 'slug': 'fashion'...",21161,GB,1551345787,GBP,CAD,1554490790,1.748586,20000.0,1889386358,1551898790,"{""id"":40611,""name"":""Winchester"",""slug"":""winche...",12102.0,live,1551898791,1.315996,15926.177904


In [10]:
df.category.iloc[0]

{'id': 266,
 'name': 'Footwear',
 'slug': 'fashion/footwear',
 'position': 5,
 'parent_id': 9,
 'color': 16752598,
 'urls': {'web': {'discover': 'http://www.kickstarter.com/discover/categories/fashion/footwear'}}}

The `id` column represents each subcategory. `parent_id` represents each parent category, since the values for `id` start after `parent_id` values, we will use `id` to seperate each project into categories.

e `id` column represents each subcategory. `parent_id` represents each parent category, since the values for `id` start after `parent_id` values, we will use `id` to seperate each project into categories.

To projects without proper subcategorisation the `parent_id` is assigned. In addition `slug`is added for increased readability.

In [11]:
df["cat_id"] = [df.category.iloc[x]["id"] for x in range(0,len(df.category))]
df["slug"] = [df.category.iloc[x]["slug"] for x in range(0,len(df.category))]
categories = df.pop("category")

In [12]:
df.head(3)
print(df.shape)

(209222, 19)


In [13]:
# check if currency - USD conversion with fx_rate and static_usd_rate matches usd_pledged value
print("pledged * static_usd_rate == usd_pledged")
print(df.eval("pledged * static_usd_rate == usd_pledged").value_counts())
print("pledged * fx_rate == usd_pledged")
print(df.eval("pledged * fx_rate == usd_pledged").value_counts())
print("converted_pledged_amount == usd_pledged")
print(df.eval("converted_pledged_amount == usd_pledged").value_counts())
print("pledged * static_usd_rate == converted_pledged_amount")
print(df.eval("pledged * static_usd_rate == converted_pledged_amount").value_counts())
print("pledged * fx_rate == converted_pledged_amount")
print(df.eval("pledged * fx_rate == converted_pledged_amount").value_counts())

pledged * static_usd_rate == usd_pledged
True     190548
False     18674
dtype: int64
pledged * fx_rate == usd_pledged
True     153959
False     55263
dtype: int64
converted_pledged_amount == usd_pledged
True     140490
False     68732
dtype: int64
pledged * static_usd_rate == converted_pledged_amount
True     140490
False     68732
dtype: int64
pledged * fx_rate == converted_pledged_amount
True     140473
False     68749
dtype: int64


In [14]:
print(df.eval("converted_pledged_amount == pledged + usd_pledged").value_counts())

False    190860
True      18362
dtype: int64


The values in `usd_pledged` do not match with the conversion of `pledged` using `fx_rate` or `static_usd_rate`. 

We decided to take the max value for these columns as the correct amount.

In [15]:
# checking whether pledged, usd_pledged or converted_pledged_amount has the highest values
df["max_pledged"] = [max(df.pledged.iloc[x], 
df.usd_pledged.iloc[x], 
df.converted_pledged_amount.iloc[x]) for x in range(0,len(df))]
df = df.drop(["usd_pledged", "pledged", "converted_pledged_amount"], axis = 1)

In [16]:
# in addition all other currency info can also be dropped
df = df.drop(["static_usd_rate", "fx_rate", "currency", "current_currency"], axis = 1)

In [17]:
# set checkpoint
df2 = df.copy()
df.head(2)
print(df.shape)

(209222, 13)


Now we will check whether `live` projects have met their goal and can be considered as `succesful`


In [18]:
df.loc[df['max_pledged'] >= df.goal, 'state'] = "successful"
#check whether there are live:successsful projects left
df.query("state == 'live' and max_pledged >= goal").state

Series([], Name: state, dtype: object)

In [19]:
df = df2.copy()
df.shape

(209222, 13)

Now we will drop all entries, which are not `successful` or `failed`


In [20]:
df.state.value_counts()

successful    117465
failed         75199
canceled        8624
live            7311
suspended        623
Name: state, dtype: int64

In [21]:
df = df[df.state != "canceled"]
df = df[df.state != "suspended"]
df = df[df.state != "live"]
df.state.unique()

array(['successful', 'failed'], dtype=object)

In [22]:
df.shape

(192664, 13)

Sort `countries` into categories US, NA, SEA, GB, ANZ, JP, EU:

In [23]:
pd.unique(df.country)

array(['US', 'GB', 'FR', 'AU', 'NZ', 'ES', 'IT', 'NO', 'NL', 'CA', 'SG',
       'MX', 'SE', 'IE', 'DE', 'BE', 'HK', 'AT', 'JP', 'DK', 'CH', 'LU'],
      dtype=object)

In [24]:
df.country.replace({'HK':"SEA", 'NL':"EU", 'AU':"ANZ", 'DE':"EU", "CA":"NA", 'SE':"EU",
                    'BE':"EU", 'MX':"NA", 'CH':"EU", 'SG':"SEA", 'FR':"EU", 'IT':"EU",
                    'DK':"EU", 'LU':"EU", 'NO':"EU", 'ES':"EU", 'IE':"EU", 'NZ':"ANZ", 
                    'AT':"EU"}, inplace = True)

Turn entries of `created_at`, `launched`, `deadline` into datetime:

In [25]:
df.created_at = pd.to_datetime(df.iloc[:,df.columns.tolist().index("created_at")],unit='s')
df.launched_at = pd.to_datetime(df.iloc[:,df.columns.tolist().index("launched_at")],unit='s')
df.deadline = pd.to_datetime(df.iloc[:,df.columns.tolist().index("deadline")],unit='s')
df.state_changed_at = pd.to_datetime(df.iloc[:,df.columns.tolist().index("state_changed_at")],unit='s')
df.head()

Unnamed: 0,backers_count,country,created_at,deadline,goal,id,launched_at,location,state,state_changed_at,cat_id,slug,max_pledged
1,47,US,2017-08-02 14:28:13,2017-09-09 17:00:59,1000.0,928751314,2017-08-10 17:00:59,"{""id"":2400549,""name"":""Euless"",""slug"":""euless-t...",successful,2017-09-09 17:00:59,273,games/playing cards,1950.0
2,271,US,2012-09-30 06:45:33,2013-06-12 05:03:15,15000.0,928014092,2013-05-13 05:03:15,"{""id"":2423474,""name"":""Hollywood"",""slug"":""holly...",successful,2013-06-12 05:03:15,43,music/rock,22404.0
3,3,GB,2017-01-07 09:11:11,2017-03-13 17:22:56,10000.0,596091328,2017-01-12 18:22:56,"{""id"":475457,""name"":""Kaunas"",""slug"":""kaunas-ka...",failed,2017-03-13 17:22:56,273,games/playing cards,165.384934
4,3,US,2012-12-06 18:04:31,2013-01-09 20:32:07,2800.0,998516049,2012-12-10 20:32:07,"{""id"":2507703,""name"":""Traverse City"",""slug"":""t...",successful,2013-01-09 20:32:07,48,publishing/nonfiction,2820.0
5,35,US,2014-10-24 17:35:50,2015-05-02 02:25:46,3500.0,1224600291,2015-04-02 02:25:46,"{""id"":2354877,""name"":""Annapolis"",""slug"":""annap...",successful,2015-05-02 02:25:46,36,music/classical music,3725.0


In [26]:
df.shape

(192664, 13)

In [27]:
# set checkpoint
df2 = df.copy()

Check for `id` copies and remove if they refer to the same data point: multiplicities of entries:

In [28]:
print(*df.id.value_counts().unique())

2 1


List of multiply used `id` entries:

In [29]:
dic = df.id.value_counts()
multiples = np.array([key for key in dic.keys() if dic[key] > 1])
len(multiples)

23685

They don't differ in any features, except one.

In [30]:
#for id in multiples:
#    print((id, [c for c in df.columns if df.query('id == '+str(id))[c].nunique() != 1]))


In that case, only one of the rows with the same `id` entry need to be kept.

In [31]:
df.drop_duplicates(subset = "id", keep = "first", inplace = True)

In [32]:
df.shape

(168979, 13)

Now we will take a look in to `location` and try to extract some meaningful information.

Fist we will drop NaN rows and then convert the `json-str` into a dictonary to extract keys and values.

In [33]:
df.location.isnull().sum()

213

In [34]:
df.dropna(axis = 0, subset = ["location"], inplace = True)

In [35]:
df.location =[json.loads(x) for x in df.location]

In [36]:
df["location_type"] = [df.location.iloc[x]["type"] for x in range(0,len(df.location))]
df["location_city"] = [df.location.iloc[x]["short_name"] for x in range(0,len(df.location))]
df["location_state"] = [df.location.iloc[x]["state"] for x in range(0,len(df.location))]

In [37]:
df.location_type.value_counts()

Town             156447
County             6384
Suburb             4295
LocalAdmin          986
Zip                 409
Island              209
Country              17
Miscellaneous        15
Estate                4
Name: location_type, dtype: int64

Remaining questions:
- What to do with location? Extract cities and try to assign some score to each? To test this, extract a few big cities and check for correlation with target.

In [38]:
df.shape

(168766, 16)

## Break // CHRISTIAN :: Preparing the Data

In [39]:
# Import of relevant packages
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate
from sklearn.metrics import roc_curve, confusion_matrix, accuracy_score, recall_score, precision_score

from sklearn.linear_model import LogisticRegression

# Set random seed 
RSEED = 42

In [40]:
df.head();

In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168766 entries, 1 to 3791
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   backers_count     168766 non-null  int64         
 1   country           168766 non-null  object        
 2   created_at        168766 non-null  datetime64[ns]
 3   deadline          168766 non-null  datetime64[ns]
 4   goal              168766 non-null  float64       
 5   id                168766 non-null  int64         
 6   launched_at       168766 non-null  datetime64[ns]
 7   location          168766 non-null  object        
 8   state             168766 non-null  object        
 9   state_changed_at  168766 non-null  datetime64[ns]
 10  cat_id            168766 non-null  int64         
 11  slug              168766 non-null  object        
 12  max_pledged       168766 non-null  float64       
 13  location_type     168766 non-null  object        
 14  locati

In [42]:
df_cd = df.drop(['backers_count', 'state_changed_at', 'max_pledged', "launched_at", "created_at", "deadline"], axis = 1);

In [43]:

# Transform type objects to string

df_cd['country'] = df_cd['country'].astype(str)
df_cd['slug'] = df_cd['slug'].astype(str)
df_cd['location'] = df_cd['location'].astype(str)
df_cd['location_city'] = df_cd['location_city'].astype(str)
df_cd['location_state'] = df_cd['location_state'].astype(str)

In [44]:
# Creating list for categorical predictors/features 
# (dates are also objects so if you have them in your data you would deal with them first)
cat_features = list(df_cd.columns[df_cd.dtypes==object])
cat_features.remove('state')
cat_features.remove('location')
cat_features

['country', 'slug', 'location_type', 'location_city', 'location_state']

In [45]:
# Creating list for numerical predictors/features
# Since 'Survived' is our target variable we will exclude this feature from this list of numerical predictors 
num_features = list(df_cd.columns[df_cd.dtypes!=object])
num_features

['goal', 'id', 'cat_id']

In [46]:
sum(df_cd["state"].value_counts())

168766

In [47]:
print("In our cleaned data set are", df_cd.state.value_counts()[0], "successful campaigns and", df_cd.state.value_counts()[1],"failed campaigns. Hereby, the Success-Ratio is:", round((df_cd.state.value_counts()[0]/sum(df_cd["state"].value_counts()))*100,2),"%" )

In our cleaned data set are 94614 successful campaigns and 74152 failed campaigns. Hereby, the Success-Ratio is: 56.06 %


In [48]:
#Somehow, I get an error here. Compare Stackoverflow => https://stackoverflow.com/questions/22470690/get-list-of-pandas-dataframe-columns-based-on-data-type

#print(df_cd.columns.to_series().groupby(df_cd.dtypes).groups

## Train-Test-Split

In [49]:
# Define predictors and target variable
features = df_cd.drop('state', axis=1)
target = df_cd['state']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=RSEED)

In [51]:
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

X_train shape: (135012, 9)
X_test shape: (33754, 9)
y_train shape: (135012,)
y_test shape: (33754,)


## Data Preprocessing

In [52]:

# TODO: One-hot encode the 'features_raw' data using pandas.get_dummies()
features = pd.get_dummies(features)

# TODO: Encode the 'income'/'target' data to numerical values
target = target.apply(lambda x: 1 if x == 'successful' else 0)

# Print the number of features after one-hot encoding
encoded = list(features.columns)
print ("{} total features after one-hot encoding.".format(len(encoded)))

29964 total features after one-hot encoding.


## Shuffle and Split Data

In [53]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size = 0.2, random_state = 0)

# Show the results of the split
print ("Training set has {} samples.".format(X_train.shape[0]))
print ("Testing set has {} samples.".format(X_test.shape[0]))

Training set has 135012 samples.
Testing set has 33754 samples.


## Normalizing Numerical Features

In [54]:
df_cd.head(1)

Unnamed: 0,country,goal,id,location,state,cat_id,slug,location_type,location_city,location_state
1,US,1000.0,928751314,"{'id': 2400549, 'name': 'Euless', 'slug': 'eul...",successful,273,games/playing cards,Town,"Euless, TX",TX


In [55]:
# Import sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the features
scaler = MinMaxScaler()
numerical = ['goal']
X_train[numerical] = scaler.fit_transform(X_train[numerical])
X_test[numerical] = scaler.transform(X_test[numerical])

# Show an example of a record with scaling applied
X_train[numerical].sample(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[numerical] = scaler.fit_transform(X_train[numerical])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[numerical] = scaler.transform(X_test[numerical])
A value is trying to be set on a copy of a slice from a DataFrame.
T

Unnamed: 0,goal
1711,9.2e-05
2429,7e-06
1827,5e-05


# Creating a Training and Predicting Pipeline

In [57]:
# TODO: Import two metrics from sklearn - fbeta_score and accuracy_score
from sklearn.metrics import fbeta_score, accuracy_score

# Define a train and evaluation function 
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): 
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    
    results = {}
    
    # TODO: Fit the learner to the training data using slicing with 'sample_size'
    start = time() # Get start time
    learner = learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time() # Get end time
    
    # TODO: Calculate the training time
    results['train_time'] = end - start
        
    # TODO: Get the predictions on the test set,
    #       then get predictions on the first 300 training samples
    start = time() # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time() # Get end time
    
    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start
            
    # TODO: Compute accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train)
        
    # TODO: Compute accuracy on test set
    results['acc_test'] = accuracy_score(y_test,predictions_test)
    
    # TODO: Compute F-score on the the first 300 training samples
    results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.5)
        
    # TODO: Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)
       
    # Success
    print ("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))
        
    # Return the results
    return results

## Model Evaluation

In [58]:
from time import time
# TODO: Import the three supervised learning models from sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
#-------- Org. Setup -----#
# TODO: Initialize the three models, the random states are set to 101 so we know how to reproduce the model later
#clf_A = DecisionTreeClassifier(random_state=101)
#clf_B = SVC(random_state=101)
#clf_C = AdaBoostClassifier(random_state=101)

# TODO: Calculate the number of samples for 1%, 10%, and 100% of the training data
#samples_1 = int(round(len(X_train) / 100))
#samples_10 = int(round(len(X_train) / 10))
#samples_100 = len(X_train)
#-------


# TODO: Initialize the three models, the random states are set to 101 so we know how to reproduce the model later
clf_A = DecisionTreeClassifier(random_state=101)
clf_B = SVC(random_state=101)
clf_C = AdaBoostClassifier(random_state=101)





samples_1 = int(round(len(X_train) / 10000))
samples_10 = int(round(len(X_train) / 5000))
samples_100 = int(round(len(X_train) / 2500))


# Collect results on the learners
results = {}
for clf in [clf_A, clf_B, clf_C]:
    clf_name = clf.__class__.__name__
    results[clf_name] = {}
    for i, samples in enumerate([samples_1, samples_10, samples_100]):
        results[clf_name][i] = \
        train_predict(clf, samples, X_train, y_train, X_test, y_test)

DecisionTreeClassifier trained on 14 samples.
DecisionTreeClassifier trained on 14 samples.
DecisionTreeClassifier trained on 14 samples.
SVC trained on 14 samples.
SVC trained on 14 samples.
SVC trained on 14 samples.
AdaBoostClassifier trained on 14 samples.
AdaBoostClassifier trained on 14 samples.
AdaBoostClassifier trained on 14 samples.


## Stats and Visualizations

In [60]:
# Import functions from own Python-File (see visuals_script.py in Repo)
import visuals_script as vs
# Run metrics visualization for the three supervised learning models chosen
vs.evaluate(results, accuracy, fscore)

NameError: name 'accuracy' is not defined

In [61]:

## => https://github.com/neuefische/ds-ensemble-methods/blob/main/4_SOLUTON_Comparison_Classification_Algorithms.ipynb
#Printing out the values
for i in results.items():
    print (i[0])
    display(pd.DataFrame(i[1]).rename(columns={0:'1%', 1:'10%', 2:'100%'}))

DecisionTreeClassifier


Unnamed: 0,1%,10%,100%
train_time,0.280683,0.155423,0.176187
pred_time,15.356366,13.773053,13.739727
acc_train,0.713333,0.713333,0.713333
acc_test,0.655507,0.655507,0.655507
f_train,0.747198,0.747198,0.747198
f_test,0.69782,0.69782,0.69782


SVC


Unnamed: 0,1%,10%,100%
train_time,0.218095,0.176049,0.168406
pred_time,62.779582,61.648006,61.254126
acc_train,0.443333,0.443333,0.443333
acc_test,0.438052,0.438052,0.438052
f_train,0.0,0.0,0.0
f_test,0.0,0.0,0.0


AdaBoostClassifier


Unnamed: 0,1%,10%,100%
train_time,0.612042,0.640214,0.402226
pred_time,513.179559,870.846713,821.463609
acc_train,0.723333,0.723333,0.723333
acc_test,0.662736,0.662736,0.662736
f_train,0.759898,0.759898,0.759898
f_test,0.706847,0.706847,0.706847


### Lazy Predict

In [None]:
#import lazypredict

#from lazypredict.Supervised import LazyClassifier


'''Approach to avoid errors in LazyPredict'''
#import os
#os.environ['KMP_DUPLICATE_LIB_OK']='True'


''''':: ERROR Message :: 
At a progress of 38%, this error message appears.
#Kernel Restarting
#The kernel for kickstarter_ml_Christian_ensemble.ipynb appears to have died. It will restart automatically.


'''

#clf = LazyClassifier(verbose=0,ignore_warnings=True, custom_metric=None)
#models, predictions = clf.fit(X_train, X_test, y_train, y_test)
#models