In [128]:
import kaggle
import pandas as pd
import numpy as np
import itertools
from collections import Counter
from collections import defaultdict
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import coverage_error
from random import choices
from random import seed


In [2]:
# Authenticate with the Kaggle API, then download the strains dataset and unzip it
# into ./cannabis-strains-features (the directory the CSV is read from afterwards).
kaggle.api.authenticate()
kaggle.api.dataset_download_files('nvisagan/cannabis-strains-features', path='./cannabis-strains-features', unzip=True)

In [3]:
# Load the downloaded CSV into the module-level frame `data`; cols_for_ranks and
# binarize_data read this global directly.
data = pd.read_csv('./cannabis-strains-features/Cannabis_Strains_Features.csv')


In [4]:
# Show the raw frame (the rendered output is truncated to its first and last rows).
data


Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%..."
...,...,...,...,...,...,...
2346,Zeus-Og,hybrid,4.7,"Happy,Uplifted,Relaxed,Euphoric,Energetic","Earthy,Woody,Pine",Zeus OG is a hybrid cross between Pineapple OG...
2347,Zkittlez,indica,4.6,"Relaxed,Happy,Euphoric,Uplifted,Sleepy","Sweet,Berry,Grape",Zkittlez is an indica-dominant mix of Grape Ap...
2348,Zombie-Kush,indica,5.0,"Relaxed,Sleepy,Talkative,Euphoric,Happy","Earthy,Sweet,Spicy/Herbal",Zombie Kush by Ripper Seeds comes from two dif...
2349,Zombie-Og,indica,4.4,"Relaxed,Sleepy,Euphoric,Happy,Hungry","Sweet,Earthy,Pungent",If you’re looking to transform into a flesh-ea...


In [5]:
def cols_for_ranks(col_name, frame=None):
    """Append one column per comma-separated position of ``col_name``.

    The string column is split on ',' and each position becomes a new column
    named ``<col_name>_<i>`` (``Flavor_0``, ``Flavor_1``, ...). Rows with fewer
    entries than the widest row are padded with missing values.

    Parameters
    ----------
    col_name : str
        Name of the comma-separated string column to expand.
    frame : pandas.DataFrame, optional
        Frame to expand. Defaults to the module-level ``data`` frame, which is
        what the original one-argument call used.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by the positional columns.
        The input frame is not modified.
    """
    if frame is None:
        # Backward-compatible default: fall back to the notebook-global frame.
        frame = data
    positional = frame[col_name].str.split(',', expand=True).add_prefix(col_name + '_')
    return pd.concat([frame, positional], axis=1)

In [6]:
# Preview the frame with Flavor expanded into positional Flavor_<i> columns.
cols_for_ranks('Flavor')

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Flavor_0,Flavor_1,Flavor_2,Flavor_3
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,Earthy,Sweet,Citrus,
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,Flowery,Violet,Diesel,
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,Spicy/Herbal,Sage,Woody,
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Apricot,Citrus,Grapefruit,
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",Citrus,Earthy,Orange,
...,...,...,...,...,...,...,...,...,...,...
2346,Zeus-Og,hybrid,4.7,"Happy,Uplifted,Relaxed,Euphoric,Energetic","Earthy,Woody,Pine",Zeus OG is a hybrid cross between Pineapple OG...,Earthy,Woody,Pine,
2347,Zkittlez,indica,4.6,"Relaxed,Happy,Euphoric,Uplifted,Sleepy","Sweet,Berry,Grape",Zkittlez is an indica-dominant mix of Grape Ap...,Sweet,Berry,Grape,
2348,Zombie-Kush,indica,5.0,"Relaxed,Sleepy,Talkative,Euphoric,Happy","Earthy,Sweet,Spicy/Herbal",Zombie Kush by Ripper Seeds comes from two dif...,Earthy,Sweet,Spicy/Herbal,
2349,Zombie-Og,indica,4.4,"Relaxed,Sleepy,Euphoric,Happy,Hungry","Sweet,Earthy,Pungent",If you’re looking to transform into a flesh-ea...,Sweet,Earthy,Pungent,


In [7]:
# Tally the values in the fourth positional column; anything other than None means a
# row split into more than three flavors.
Counter(cols_for_ranks('Flavor')['Flavor_3']).most_common()

[(None, 2308),
 ('Fruit', 16),
 ('Cheese', 5),
 ('Earthy', 4),
 ('Sweet', 3),
 ('Pine', 3),
 ('Flowery', 2),
 ('Chemical', 1),
 ('Blueberry', 1),
 ('Pineapple', 1),
 ('Pungent', 1),
 ('Berry', 1),
 ('Lemon', 1),
 ('Spicy/Herbal', 1),
 ('Orange', 1),
 ('Grape', 1),
 ('Apricot', 1)]

In [8]:
# First five rows whose raw Flavor string mentions 'Fruit' (expanded frame built once).
cols_for_ranks('Flavor').loc[lambda expanded: expanded['Flavor'].str.contains('Fruit')].head(5)

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Flavor_0,Flavor_1,Flavor_2,Flavor_3
32,Acid-Dough,sativa,5.0,"Talkative,Giggly,Happy,Hungry,Relaxed","Earthy,Woody,Tree Fruit",Acid Dough by Ripper Seeds is a sativa-dominan...,Earthy,Woody,Tree Fruit,
156,Bc-Sweet-Tooth,indica,4.3,"Uplifted,Happy,Relaxed,Sleepy,Euphoric","Sweet,Honey,Tree,Fruit","Developed in British Columbia by BC Bud Depot,...",Sweet,Honey,Tree,Fruit
163,Banana-Candy,indica,4.2,"Relaxed,Euphoric,Uplifted,Creative,Happy","Tree,Fruit,Earthy,Sweet",Banana Candy is classified as a Indica cannabi...,Tree,Fruit,Earthy,Sweet
166,Banana-Kush,hybrid,4.3,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Sweet,Tropical,Tree,Fruit",This legendary West Coast strain crosses Ghost...,Sweet,Tropical,Tree,Fruit
171,Bangi-Haze,sativa,0.0,"Aroused,Uplifted,Euphoric,Hungry","Berry,Tree,Fruit,Cheese",Bangi Haze by Ace Seeds is an energetic sativa...,Berry,Tree,Fruit,Cheese


In [9]:
# Rows whose raw Flavor string contains a space character (expanded frame built once).
cols_for_ranks('Flavor').loc[lambda expanded: expanded['Flavor'].str.contains(' ')]

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Flavor_0,Flavor_1,Flavor_2,Flavor_3
5,3-Bears-Og,indica,4.4,"Relaxed,Happy,Sleepy,Creative,Euphoric\n","Sweet, Pungent, Earthy",3 Bears OG by Mephisto Genetics is an autoflow...,Sweet,Pungent,Earthy,
32,Acid-Dough,sativa,5.0,"Talkative,Giggly,Happy,Hungry,Relaxed","Earthy,Woody,Tree Fruit",Acid Dough by Ripper Seeds is a sativa-dominan...,Earthy,Woody,Tree Fruit,
106,Amnesia-Ganja-Haze,sativa,5.0,"Euphoric, Relaxed","Spicy/Herbal, Sweet",Amnesia Ganja Haze is another award-winning st...,Spicy/Herbal,Sweet,,
1237,Las-Vegas-Purple-Kush-Bx,indica,5.0,"Sleepy,Happy,Relaxed,Aroused,Creative","Sweet, Berry, Spicy/Herbal",Las Vegas Purple Kush BX is a clone-only strai...,Sweet,Berry,Spicy/Herbal,


In [10]:
# Remove spaces so entries such as 'Sweet, Pungent' and 'Tree Fruit' normalise, then
# re-join the two-word flavor that the comma delimiter had split into 'Tree,Fruit'.
# regex=False pins literal matching instead of relying on the pandas-version-dependent
# default of `str.replace`.
data['Flavor'] = (
    data['Flavor']
    .str.replace(' ', '', regex=False)
    .str.replace('Tree,Fruit', 'TreeFruit', regex=False)
)
cols_for_ranks('Flavor')

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Flavor_0,Flavor_1,Flavor_2,Flavor_3
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,Earthy,Sweet,Citrus,
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,Flowery,Violet,Diesel,
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,Spicy/Herbal,Sage,Woody,
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Apricot,Citrus,Grapefruit,
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",Citrus,Earthy,Orange,
...,...,...,...,...,...,...,...,...,...,...
2346,Zeus-Og,hybrid,4.7,"Happy,Uplifted,Relaxed,Euphoric,Energetic","Earthy,Woody,Pine",Zeus OG is a hybrid cross between Pineapple OG...,Earthy,Woody,Pine,
2347,Zkittlez,indica,4.6,"Relaxed,Happy,Euphoric,Uplifted,Sleepy","Sweet,Berry,Grape",Zkittlez is an indica-dominant mix of Grape Ap...,Sweet,Berry,Grape,
2348,Zombie-Kush,indica,5.0,"Relaxed,Sleepy,Talkative,Euphoric,Happy","Earthy,Sweet,Spicy/Herbal",Zombie Kush by Ripper Seeds comes from two dif...,Earthy,Sweet,Spicy/Herbal,
2349,Zombie-Og,indica,4.4,"Relaxed,Sleepy,Euphoric,Happy,Hungry","Sweet,Earthy,Pungent",If you’re looking to transform into a flesh-ea...,Sweet,Earthy,Pungent,


In [11]:
# Re-tally the fourth positional column after the space / 'Tree,Fruit' clean-up.
Counter(cols_for_ranks('Flavor')['Flavor_3']).most_common()

[(None, 2342),
 ('Cheese', 4),
 ('Earthy', 2),
 ('Chemical', 1),
 ('Blueberry', 1),
 ('Pungent', 1)]

In [12]:
# Rows whose fourth positional flavor is 'Cheese' (expanded frame built once).
cols_for_ranks('Flavor').loc[lambda expanded: expanded['Flavor_3'] == 'Cheese']

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Flavor_0,Flavor_1,Flavor_2,Flavor_3
195,Bettie-Page,hybrid,4.2,"Creative,Uplifted,Energetic,Euphoric,Relaxed","Earthy,Sweet,Blue,Cheese","Bettie Page, grown by Liberty Reach Farms in W...",Earthy,Sweet,Blue,Cheese
466,Cat-Piss,sativa,3.9,"Happy,Uplifted,Euphoric,Relaxed,Talkative","Earthy,Woody,Blue,Cheese",Originally a clone-only phenotype of Super Sil...,Earthy,Woody,Blue,Cheese
1141,Josh-D-Og,indica,3.7,"Aroused,Tingly,Uplifted,Creative,Euphoric","Berry,Blueberry,Blue,Cheese",Josh D OG by Karma Genetics is a handcrafted O...,Berry,Blueberry,Blue,Cheese
2330,X-Tra-Chz,hybrid,4.0,"Sleepy,Uplifted,Euphoric,Happy,Hungry","Pungent,Skunk,Blue,Cheese","X-tra Chz, bred by MTG Seeds, is a hybrid cros...",Pungent,Skunk,Blue,Cheese


In [13]:
# 'Blue,Cheese' is one flavor that the comma delimiter split in two; merge it back.
# regex=False pins literal matching regardless of the pandas default.
data['Flavor'] = data['Flavor'].str.replace('Blue,Cheese', 'BlueCheese', regex=False)
cols_for_ranks('Flavor')

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Flavor_0,Flavor_1,Flavor_2
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,Earthy,Sweet,Citrus
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,Flowery,Violet,Diesel
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,Spicy/Herbal,Sage,Woody
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Apricot,Citrus,Grapefruit
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",Citrus,Earthy,Orange
...,...,...,...,...,...,...,...,...,...
2346,Zeus-Og,hybrid,4.7,"Happy,Uplifted,Relaxed,Euphoric,Energetic","Earthy,Woody,Pine",Zeus OG is a hybrid cross between Pineapple OG...,Earthy,Woody,Pine
2347,Zkittlez,indica,4.6,"Relaxed,Happy,Euphoric,Uplifted,Sleepy","Sweet,Berry,Grape",Zkittlez is an indica-dominant mix of Grape Ap...,Sweet,Berry,Grape
2348,Zombie-Kush,indica,5.0,"Relaxed,Sleepy,Talkative,Euphoric,Happy","Earthy,Sweet,Spicy/Herbal",Zombie Kush by Ripper Seeds comes from two dif...,Earthy,Sweet,Spicy/Herbal
2349,Zombie-Og,indica,4.4,"Relaxed,Sleepy,Euphoric,Happy,Hungry","Sweet,Earthy,Pungent",If you’re looking to transform into a flesh-ea...,Sweet,Earthy,Pungent


In [14]:
# Distinct values in the second positional column, used to spot typos and stray characters.
set(cols_for_ranks('Flavor')['Flavor_1'])

{'Ammonia',
 'Apple',
 'Apricot',
 'Berry',
 'Berry\n',
 'Bluberry',
 'BlueCheese',
 'Blueberry',
 'Butter',
 'Cheese',
 'Chemical',
 'Chestnut',
 'Citrus',
 'Citrus\n',
 'Coffee',
 'Diesel',
 'Earthy',
 'Earthy\n',
 'Flowery',
 'Grape',
 'Grapefruit',
 'Honey',
 'Lavender',
 'Lemon',
 'Lime',
 'Mango',
 'Menthol',
 'Mint',
 'Minty',
 None,
 'Nutty',
 'Orange',
 'Pear',
 'Pepper',
 'Pine',
 'Pineapple',
 'Plum',
 'Pungent',
 'Rose',
 'Sage',
 'Skunk',
 'Spicy/Herbal',
 'Strawberry',
 'Sweet',
 'Sweet\n',
 'Tea',
 'Tobacco',
 'TreeFruit',
 'Tropical',
 'Vanilla',
 'Violet',
 'Woody'}

In [15]:
# Strip embedded newlines and fold spelling variants into one label each.
# regex=False pins literal matching instead of relying on the pandas-version-dependent
# default of `str.replace`.
data['Flavor'] = (
    data['Flavor']
    .str.replace('\n', '', regex=False)
    .str.replace('Bluberry', 'Blueberry', regex=False)
    .str.replace('Grapes', 'Grape', regex=False)
    .str.replace('Minty', 'Mint', regex=False)
)
# Drop rows whose Flavor string contains the literal text 'None'.
data.drop(data[data.Flavor.str.contains('None')].index, axis = 0, inplace = True)
cols_for_ranks('Flavor')

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Flavor_0,Flavor_1,Flavor_2
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,Earthy,Sweet,Citrus
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,Flowery,Violet,Diesel
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,Spicy/Herbal,Sage,Woody
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Apricot,Citrus,Grapefruit
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",Citrus,Earthy,Orange
...,...,...,...,...,...,...,...,...,...
2346,Zeus-Og,hybrid,4.7,"Happy,Uplifted,Relaxed,Euphoric,Energetic","Earthy,Woody,Pine",Zeus OG is a hybrid cross between Pineapple OG...,Earthy,Woody,Pine
2347,Zkittlez,indica,4.6,"Relaxed,Happy,Euphoric,Uplifted,Sleepy","Sweet,Berry,Grape",Zkittlez is an indica-dominant mix of Grape Ap...,Sweet,Berry,Grape
2348,Zombie-Kush,indica,5.0,"Relaxed,Sleepy,Talkative,Euphoric,Happy","Earthy,Sweet,Spicy/Herbal",Zombie Kush by Ripper Seeds comes from two dif...,Earthy,Sweet,Spicy/Herbal
2349,Zombie-Og,indica,4.4,"Relaxed,Sleepy,Euphoric,Happy,Hungry","Sweet,Earthy,Pungent",If you’re looking to transform into a flesh-ea...,Sweet,Earthy,Pungent


In [16]:
# Preview the frame with Effects expanded into positional Effects_<i> columns.
cols_for_ranks('Effects')

Unnamed: 0,Strain,Type,Rating,Effects,Flavor,Description,Effects_0,Effects_1,Effects_2,Effects_3,Effects_4
0,100-Og,hybrid,4.0,"Creative,Energetic,Tingly,Euphoric,Relaxed","Earthy,Sweet,Citrus",$100 OG is a 50/50 hybrid strain that packs a ...,Creative,Energetic,Tingly,Euphoric,Relaxed
1,98-White-Widow,hybrid,4.7,"Relaxed,Aroused,Creative,Happy,Energetic","Flowery,Violet,Diesel",The ‘98 Aloha White Widow is an especially pot...,Relaxed,Aroused,Creative,Happy,Energetic
2,1024,sativa,4.4,"Uplifted,Happy,Relaxed,Energetic,Creative","Spicy/Herbal,Sage,Woody",1024 is a sativa-dominant hybrid bred in Spain...,Uplifted,Happy,Relaxed,Energetic,Creative
3,13-Dawgs,hybrid,4.2,"Tingly,Creative,Hungry,Relaxed,Uplifted","Apricot,Citrus,Grapefruit",13 Dawgs is a hybrid of G13 and Chemdawg genet...,Tingly,Creative,Hungry,Relaxed,Uplifted
4,24K-Gold,hybrid,4.6,"Happy,Relaxed,Euphoric,Uplifted,Talkative","Citrus,Earthy,Orange","Also known as Kosher Tangie, 24k Gold is a 60%...",Happy,Relaxed,Euphoric,Uplifted,Talkative
...,...,...,...,...,...,...,...,...,...,...,...
2346,Zeus-Og,hybrid,4.7,"Happy,Uplifted,Relaxed,Euphoric,Energetic","Earthy,Woody,Pine",Zeus OG is a hybrid cross between Pineapple OG...,Happy,Uplifted,Relaxed,Euphoric,Energetic
2347,Zkittlez,indica,4.6,"Relaxed,Happy,Euphoric,Uplifted,Sleepy","Sweet,Berry,Grape",Zkittlez is an indica-dominant mix of Grape Ap...,Relaxed,Happy,Euphoric,Uplifted,Sleepy
2348,Zombie-Kush,indica,5.0,"Relaxed,Sleepy,Talkative,Euphoric,Happy","Earthy,Sweet,Spicy/Herbal",Zombie Kush by Ripper Seeds comes from two dif...,Relaxed,Sleepy,Talkative,Euphoric,Happy
2349,Zombie-Og,indica,4.4,"Relaxed,Sleepy,Euphoric,Happy,Hungry","Sweet,Earthy,Pungent",If you’re looking to transform into a flesh-ea...,Relaxed,Sleepy,Euphoric,Happy,Hungry


In [17]:
# Normalise the Effects strings: strip newlines and spaces, re-join the two-word label
# that the comma delimiter split ('Dry,Mouth'), and fix the 'Energentic' typo.
# regex=False pins literal matching instead of relying on the pandas-version-dependent
# default of `str.replace`.
data['Effects'] = (
    data['Effects']
    .str.replace('\n', '', regex=False)
    .str.replace(' ', '', regex=False)
    .str.replace('Dry,Mouth', 'DryMouth', regex=False)
    .str.replace('Energentic', 'Energetic', regex=False)
)

# Drop rows whose Effects string contains the literal text 'None'.
data.drop(data[data.Effects.str.contains('None')].index, axis = 0, inplace = True)

In [18]:
def binarize_data(col_name, var_type):
    """Multi-hot encode a comma-separated column of the module-level ``data`` frame.

    Parameters
    ----------
    col_name : str
        Column of ``data`` holding comma-separated labels.
    var_type : str
        Prefix for the output columns; each column is named ``<var_type>_<label>``.

    Returns
    -------
    pandas.DataFrame
        One 0/1 column per distinct label, indexed like ``data``.
    """
    label_lists = data[col_name].str.split(',')
    encoder = MultiLabelBinarizer()
    encoded = encoder.fit_transform(label_lists)
    encoded_frame = pd.DataFrame(encoded, columns=encoder.classes_, index=data.index)
    return encoded_frame.add_prefix(var_type + '_')


In [19]:
# Flavors are the model inputs and effects are the labels to predict.
features = binarize_data('Flavor', 'feature')
# The DryMouth indicator is excluded from the prediction targets.
targets = binarize_data('Effects', 'target').drop(columns=['target_DryMouth'])

In [None]:
#SINGLE-OUTPUT

In [25]:
# Column sums of the 0/1 target matrix, i.e. how many rows carry each effect label.
targets.sum()

target_Aroused       205
target_Creative      763
target_Energetic     657
target_Euphoric     1658
target_Focused       605
target_Giggly        307
target_Happy        1888
target_Hungry        483
target_Relaxed      1749
target_Sleepy        752
target_Talkative     365
target_Tingly        350
target_Uplifted     1524
dtype: int64

In [84]:
def null_rand_draw_single(test_samp_size, target_column):
    """Draw random 0/1 predictions weighted by the label's frequency in ``targets``.

    A 1 is drawn with probability (positives in ``targets[target_column]``) divided by
    (rows in ``targets``), and a 0 otherwise. Uses the stdlib ``random`` generator.

    Parameters
    ----------
    test_samp_size : int
        Number of predictions to draw.
    target_column : str
        Column of the module-level ``targets`` frame that sets the weighting.

    Returns
    -------
    list of int
        ``test_samp_size`` values, each 0 or 1.
    """
    positive_count = targets[target_column].values.sum()
    row_count = len(targets)
    # cum_weights are cumulative: [weight of 1, weight of 1 + weight of 0].
    return choices([1, 0], cum_weights=[positive_count, row_count], k=test_samp_size)

In [85]:
def null_coin_flip_single(test_samp_size):
    """Draw ``test_samp_size`` fair coin-flip predictions (0 or 1, each with p = 0.5).

    Uses the stdlib ``random`` generator, so results follow ``random.seed``.

    Returns
    -------
    list of int
        ``test_samp_size`` values, each 0 or 1.
    """
    outcomes = [1, 0]
    # Cumulative weights 1 and 2 give both outcomes an equal weight of 1.
    even_cum_weights = [1, 2]
    return choices(outcomes, cum_weights=even_cum_weights, k=test_samp_size)

In [86]:
def null_model_single(model, target_column, targets_test):
    """Score one of the single-label null (baseline) models against true labels.

    Parameters
    ----------
    model : str
        ``'null_rand_draw_single'`` (frequency-weighted random draw) or
        ``'null_coin_flip_single'`` (fair coin flip).
    target_column : str
        Target column name; only used by the frequency-weighted model.
    targets_test : sequence of int
        True 0/1 labels to score against.

    Returns
    -------
    float
        Percentage (0-100) of positions where the random prediction equals the label.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. Previously this case only
        printed a message and then failed on an unbound local variable.
    """
    if model == 'null_rand_draw_single':
        targets_test_pred = null_rand_draw_single(len(targets_test), target_column)
    elif model == 'null_coin_flip_single':
        targets_test_pred = null_coin_flip_single(len(targets_test))
    else:
        raise ValueError('unsupported model, please try again.')

    matches = sum(1 for predicted, actual in zip(targets_test_pred, list(targets_test)) if predicted == actual)
    accuracy = 100 * matches / len(targets_test)

    return accuracy

In [115]:
def RF_single(feats_train, targets_train, feats_test, targets_test):
    """Fit a default random forest (``random_state=10``) and score it on the test split.

    Parameters
    ----------
    feats_train, targets_train
        Training features and labels passed to ``fit``.
    feats_test, targets_test
        Held-out features and labels passed to ``score``.

    Returns
    -------
    float
        ``100 * rf.score(feats_test, targets_test)``, i.e. accuracy as a percentage.
    """
    rf = RandomForestClassifier(random_state=10)
    rf.fit(feats_train, targets_train)
    # The hand-rolled accuracy that used to be computed here was never used and cost a
    # second prediction pass; `score` already reports the same quantity.
    accuracy = 100 * rf.score(feats_test, targets_test)

    return accuracy


In [116]:
#select target column
target_column = 'target_Creative'
##########################################################################
#leave out test data for final evaluation
feats_train, feats_final_test, targets_train, targets_final_test = train_test_split(features, targets[target_column], test_size=0.09, random_state=10)
print('size of final test set: ' + str(len(feats_final_test)))

size of final test set: 207


In [143]:
# Seed the stdlib RNG that backs the null models' `choices` draws.
seed(10)
kf = KFold(n_splits=10)

# One row of cv_results per model, in the order the scores are appended for each fold.
models = ['model_0', 'model_00', 'model_1']
cv_results = pd.DataFrame()

for fold_num, (cvtrain_index, cvtest_index) in enumerate(kf.split(feats_train), start=1):
    print("fold " + str(fold_num))
    feats_cvtrain, feats_cvtest = feats_train.iloc[cvtrain_index], feats_train.iloc[cvtest_index]
    targets_cvtrain, targets_cvtest = targets_train.iloc[cvtrain_index], targets_train.iloc[cvtest_index]

    print('size of test set: ' + str(len(feats_cvtest)))

    fold_scores = []

    print('-')
    print('NULL MODEL - SAMPLE-WEIGHTED RANDOM')
    fold_scores.append(null_model_single('null_rand_draw_single', target_column, targets_cvtest))

    print('-')
    print('NULL MODEL - COIN FLIP')
    fold_scores.append(null_model_single('null_coin_flip_single', target_column, targets_cvtest))

    print('-')
    print('RANDOM FOREST - DEFAULT')
    fold_scores.append(RF_single(feats_cvtrain, targets_cvtrain, feats_cvtest, targets_cvtest))

    # Each fold contributes one column: scores ordered like `models`.
    cv_results['fold_' + str(fold_num)] = fold_scores
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

# Per-model summary across folds. Mean and std are taken over the fold_* columns only,
# so the string-valued label columns never take part in the arithmetic.
cv_results = cv_results.assign(
    models=models,
    measure=['accuracy'] * len(models),
    mean=lambda df: df.filter(like='fold').mean(axis=1).round(2),
    std=lambda df: df.filter(like='fold').std(axis=1).round(2),
)

fold 1
size of test set: 209
-
NULL MODEL - SAMPLE-WEIGHTED RANDOM
-
NULL MODEL - COIN FLIP
-
RANDOM FOREST - DEFAULT
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fold 2
size of test set: 209
-
NULL MODEL - SAMPLE-WEIGHTED RANDOM
-
NULL MODEL - COIN FLIP
-
RANDOM FOREST - DEFAULT
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fold 3
size of test set: 209
-
NULL MODEL - SAMPLE-WEIGHTED RANDOM
-
NULL MODEL - COIN FLIP
-
RANDOM FOREST - DEFAULT
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fold 4
size of test set: 209
-
NULL MODEL - SAMPLE-WEIGHTED RANDOM
-
NULL MODEL - COIN FLIP
-
RANDOM FOREST - DEFAULT
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fold 5
size of test set: 209
-
NULL MODEL - SAMPLE-WEIGHTED RANDOM
-
NULL MODEL - COIN FLIP
-
RANDOM FOREST - DEFAULT
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
fold 6
size of test set: 209
-
NULL MODEL - SAMPLE-WEIGHTED RANDOM
-
NULL MODEL - COIN FLIP
-
RANDOM FOREST - DEFAU

In [144]:
# Per-fold scores for each model, with the mean and std across folds.
cv_results

Unnamed: 0,fold_1,fold_2,fold_3,fold_4,fold_5,fold_6,fold_7,fold_8,fold_9,fold_10,models,measure,mean,std
0,54.066986,56.45933,56.45933,64.114833,59.330144,50.239234,49.760766,60.76555,58.173077,52.884615,model_0,accuracy,56.23,4.59
1,49.282297,48.803828,52.631579,46.889952,48.803828,53.110048,49.760766,50.239234,45.192308,49.038462,model_00,accuracy,49.38,2.35
2,64.114833,59.808612,59.808612,67.942584,63.157895,63.636364,63.157895,59.808612,65.865385,66.346154,model_1,accuracy,63.36,2.88


In [119]:
# Models as rows, measures as columns, cell = mean score. Each (model, measure) pair
# has a single row, so 'sum' simply passes the value through. The string alias avoids
# the deprecation warning recent pandas raises for `aggfunc=np.sum`.
pd.pivot_table(cv_results, values='mean', index='models',
                    columns='measure', aggfunc='sum')

measure,accuracy
models,Unnamed: 1_level_1
model_0,56.66
model_00,52.25
model_1,63.36


In [120]:
def RF_single_hps(feats_train, targets_train, feats_test, targets_test, hyperparameters):
    """Fit a random forest with explicit hyperparameters and score it on the test split.

    Parameters
    ----------
    feats_train, targets_train
        Training features and labels passed to ``fit``.
    feats_test, targets_test
        Held-out features and labels passed to ``score``.
    hyperparameters : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.

    Returns
    -------
    float
        ``100 * rf.score(feats_test, targets_test)``, i.e. accuracy as a percentage.
    """
    rf = RandomForestClassifier(**hyperparameters)
    rf.fit(feats_train, targets_train)
    # The hand-rolled accuracy that used to be computed here was never used and cost a
    # second prediction pass; `score` already reports the same quantity.
    accuracy = 100 * rf.score(feats_test, targets_test)

    return accuracy

In [158]:
# Earlier, wider candidate grid, kept commented out for reference only.
# bootstrap_vals = [True]
# ccp_alpha_vals = [0.0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
# class_weight_vals = [None, 'balanced', 'balanced_subsample']
# criterion_vals = ['gini', 'entropy']
# max_depth_vals = [None, 1, 2, 3, 5, 8]
# max_features_vals = [3, 5, 8, 12]
# max_leaf_nodes_vals = [None, 25, 50, 100, 200, 500, 1000]
# max_samples_vals = [None]
# min_impurity_decrease_vals = [0.0]
# min_impurity_split_vals = [None]
# min_samples_leaf_vals = [1, 3, 5]
# min_samples_split_vals = [2, 6, 10, 20]
# min_weight_fraction_leaf_vals = [0.0]
# n_estimators_vals = [50, 100, 500, 1000, 5000]
# n_jobs_vals = [6]
# oob_score_vals = [False]
# random_state_vals = [10]
# verbose_vals = [0]
# warm_start_vals = [False]

# Active grid: one list of candidate values per RandomForestClassifier parameter.
# Single-element lists hold that parameter fixed; the multi-valued lists are what the
# search varies (4 * 3 * 4 * 4 * 2 = 384 combinations).
bootstrap_vals = [True]
ccp_alpha_vals = [0, 0.1, 1, 10]
class_weight_vals = [None, 'balanced', 'balanced_subsample']
criterion_vals = ['entropy']
max_depth_vals = [None, 1, 2, 10]
max_features_vals = [24, 12, 6, 3]
max_leaf_nodes_vals = [None]
max_samples_vals = [None]
min_impurity_decrease_vals = [0.0]
min_impurity_split_vals = [None]
min_samples_leaf_vals = [1]
min_samples_split_vals = [2]
min_weight_fraction_leaf_vals = [0.0]
n_estimators_vals = [100, 1000]
n_jobs_vals = [6]
oob_score_vals = [False]
random_state_vals = [10]
verbose_vals = [0]
warm_start_vals = [False]

# ORDER MATTERS: the search cell zips each product tuple positionally against the keys
# of `get_params()`, so this list must stay in the same order (and length) as those keys.
# NOTE(review): the key set depends on the installed scikit-learn version; confirm the
# alignment before trusting results.
hyperparameters_vals = [bootstrap_vals, ccp_alpha_vals, class_weight_vals, 
                        criterion_vals, max_depth_vals, max_features_vals, 
                        max_leaf_nodes_vals, max_samples_vals, min_impurity_decrease_vals, 
                        min_impurity_split_vals, min_samples_leaf_vals, min_samples_split_vals, 
                        min_weight_fraction_leaf_vals, n_estimators_vals, n_jobs_vals, 
                        oob_score_vals, random_state_vals, verbose_vals, warm_start_vals]

In [159]:
# Number of hyperparameter combinations (Cartesian product of the value lists).
len(list(itertools.product(*hyperparameters_vals)))

384

In [None]:
# Seed the stdlib RNG that backs the null models' `choices` draws.
seed(10)
kf = KFold(n_splits=10)

# Parameter names in the order get_params() reports them; each product tuple from
# `hyperparameters_vals` is zipped against this order. A throwaway estimator supplies
# the names: the previous code read them from `rf`, which only exists inside the
# RF_single* functions and so raised NameError on a fresh kernel.
hps = list(RandomForestClassifier().get_params().keys())
if len(hps) != len(hyperparameters_vals):
    # zip() would silently truncate and misconfigure every model; fail loudly instead.
    raise ValueError(
        'hyperparameters_vals has ' + str(len(hyperparameters_vals)) + ' value lists but '
        'RandomForestClassifier exposes ' + str(len(hps)) + ' parameters'
    )

# (model name, value tuple) per combination. Numbering starts at 2 because model_1 is
# the default forest. The list is the same for every fold, so it is built once here.
model_hp_record = [
    ('model_' + str(model_num), model_val_set)
    for model_num, model_val_set in enumerate(itertools.product(*hyperparameters_vals), start=2)
]

# One row of cv_results per model, in the order the scores are appended for each fold.
models = ['model_0', 'model_00', 'model_1'] + [model_name for model_name, _ in model_hp_record]
cv_results = pd.DataFrame()

for fold_num, (cvtrain_index, cvtest_index) in enumerate(kf.split(feats_train), start=1):
    print("fold " + str(fold_num))
    feats_cvtrain, feats_cvtest = feats_train.iloc[cvtrain_index], feats_train.iloc[cvtest_index]
    targets_cvtrain, targets_cvtest = targets_train.iloc[cvtrain_index], targets_train.iloc[cvtest_index]

    print('size of test set: ' + str(len(feats_cvtest)))

    fold_scores = []

    print('-')
    print('NULL MODEL - SAMPLE-WEIGHTED RANDOM')
    fold_scores.append(null_model_single('null_rand_draw_single', target_column, targets_cvtest))

    print('-')
    print('NULL MODEL - COIN FLIP')
    fold_scores.append(null_model_single('null_coin_flip_single', target_column, targets_cvtest))

    print('-')
    print('RANDOM FOREST - DEFAULT')
    fold_scores.append(RF_single(feats_cvtrain, targets_cvtrain, feats_cvtest, targets_cvtest))

    for model_name, model_val_set in model_hp_record:
        hyperparams = dict(zip(hps, model_val_set))
        print('-')
        print('RANDOM FOREST - ' + model_name)
        fold_scores.append(RF_single_hps(feats_cvtrain, targets_cvtrain, feats_cvtest, targets_cvtest, hyperparams))

    # Each fold contributes one column: scores ordered like `models`.
    cv_results['fold_' + str(fold_num)] = fold_scores

    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

# Per-model summary across folds. Mean and std are taken over the fold_* columns only,
# so the string-valued label columns never take part in the arithmetic.
cv_results = cv_results.assign(
    models=models,
    measure=['accuracy'] * len(models),
    mean=lambda df: df.filter(like='fold').mean(axis=1).round(2),
    std=lambda df: df.filter(like='fold').std(axis=1).round(2),
)


fold 1
size of test set: 209
-
NULL MODEL - SAMPLE-WEIGHTED RANDOM
-
NULL MODEL - COIN FLIP
-
RANDOM FOREST - DEFAULT
-
RANDOM FOREST - model_2
-
RANDOM FOREST - model_3
-
RANDOM FOREST - model_4
-
RANDOM FOREST - model_5
-
RANDOM FOREST - model_6
-
RANDOM FOREST - model_7
-
RANDOM FOREST - model_8
-
RANDOM FOREST - model_9
-
RANDOM FOREST - model_10
-
RANDOM FOREST - model_11
-
RANDOM FOREST - model_12
-
RANDOM FOREST - model_13
-
RANDOM FOREST - model_14
-
RANDOM FOREST - model_15
-
RANDOM FOREST - model_16
-
RANDOM FOREST - model_17
-
RANDOM FOREST - model_18
-
RANDOM FOREST - model_19
-
RANDOM FOREST - model_20
-
RANDOM FOREST - model_21
-
RANDOM FOREST - model_22
-
RANDOM FOREST - model_23
-
RANDOM FOREST - model_24
-
RANDOM FOREST - model_25
-
RANDOM FOREST - model_26
-
RANDOM FOREST - model_27
-
RANDOM FOREST - model_28
-
RANDOM FOREST - model_29
-
RANDOM FOREST - model_30
-
RANDOM FOREST - model_31
-
RANDOM FOREST - model_32
-
RANDOM FOREST - model_33
-
RANDOM FOREST - model_34

-
RANDOM FOREST - model_295
-
RANDOM FOREST - model_296
-
RANDOM FOREST - model_297
-
RANDOM FOREST - model_298
-
RANDOM FOREST - model_299
-
RANDOM FOREST - model_300
-
RANDOM FOREST - model_301
-
RANDOM FOREST - model_302
-
RANDOM FOREST - model_303
-
RANDOM FOREST - model_304
-
RANDOM FOREST - model_305
-
RANDOM FOREST - model_306
-
RANDOM FOREST - model_307
-
RANDOM FOREST - model_308
-
RANDOM FOREST - model_309
-
RANDOM FOREST - model_310
-
RANDOM FOREST - model_311
-
RANDOM FOREST - model_312
-
RANDOM FOREST - model_313
-
RANDOM FOREST - model_314
-
RANDOM FOREST - model_315
-
RANDOM FOREST - model_316
-
RANDOM FOREST - model_317
-
RANDOM FOREST - model_318
-
RANDOM FOREST - model_319
-
RANDOM FOREST - model_320
-
RANDOM FOREST - model_321
-
RANDOM FOREST - model_322
-
RANDOM FOREST - model_323
-
RANDOM FOREST - model_324
-
RANDOM FOREST - model_325
-
RANDOM FOREST - model_326
-
RANDOM FOREST - model_327
-
RANDOM FOREST - model_328
-
RANDOM FOREST - model_329
-
RANDOM FOREST - mo

-
RANDOM FOREST - model_202
-
RANDOM FOREST - model_203
-
RANDOM FOREST - model_204
-
RANDOM FOREST - model_205
-
RANDOM FOREST - model_206
-
RANDOM FOREST - model_207
-
RANDOM FOREST - model_208
-
RANDOM FOREST - model_209
-
RANDOM FOREST - model_210
-
RANDOM FOREST - model_211
-
RANDOM FOREST - model_212
-
RANDOM FOREST - model_213
-
RANDOM FOREST - model_214
-
RANDOM FOREST - model_215
-
RANDOM FOREST - model_216
-
RANDOM FOREST - model_217
-
RANDOM FOREST - model_218
-
RANDOM FOREST - model_219
-
RANDOM FOREST - model_220
-
RANDOM FOREST - model_221
-
RANDOM FOREST - model_222
-
RANDOM FOREST - model_223
-
RANDOM FOREST - model_224
-
RANDOM FOREST - model_225
-
RANDOM FOREST - model_226
-
RANDOM FOREST - model_227
-
RANDOM FOREST - model_228
-
RANDOM FOREST - model_229
-
RANDOM FOREST - model_230
-
RANDOM FOREST - model_231
-
RANDOM FOREST - model_232
-
RANDOM FOREST - model_233
-
RANDOM FOREST - model_234
-
RANDOM FOREST - model_235
-
RANDOM FOREST - model_236
-
RANDOM FOREST - mo

In [None]:
# Rank models by mean cross-validated accuracy, best first. The column label must be
# the string 'mean'; the bare name `mean` is undefined and raised NameError.
cv_results.sort_values('mean', ascending=False)

In [157]:
# Lookup table: one column per searched model, one row per hyperparameter position
# (rows follow the order of the value tuples stored in model_hp_record).
# Built in a single constructor call; inserting hundreds of columns one at a time
# fragments the frame and triggers pandas' PerformanceWarning.
hp_lookup = pd.DataFrame({model_name: list(hp_values) for model_name, hp_values in model_hp_record})
hp_lookup

Unnamed: 0,model_2,model_3,model_4,model_5,model_6,model_7,model_8,model_9
0,True,True,True,True,True,True,True,True
1,0,0,0.1,0.1,1,1,10,10
2,,,,,,,,
3,gini,entropy,gini,entropy,gini,entropy,gini,entropy
4,,,,,,,,
5,auto,auto,auto,auto,auto,auto,auto,auto
6,,,,,,,,
7,,,,,,,,
8,0,0,0,0,0,0,0,0
9,,,,,,,,


In [155]:
# First field of the first record, i.e. its model name.
# NOTE(review): the saved output below shows a whole (name, values) record, which
# corresponds to `model_hp_record[0]` rather than this expression - confirm which was intended.
model_hp_record[0][0]

('model_2',
 (True,
  0,
  None,
  'gini',
  None,
  'auto',
  None,
  None,
  0.0,
  None,
  1,
  2,
  0.0,
  100,
  6,
  False,
  10,
  0,
  False))

In [None]:
# MULTI-OUTPUT

In [None]:
#leave out test data for final evaluation
feats_train, feats_final_test, targets_train, targets_final_test = train_test_split(features, targets, test_size=0.09, random_state=10)
print('size of final test set: ' + str(len(feats_final_test)))

In [None]:
def null_rand_draw(test_samp_size, n_labels=5):
    """Null multi-label model: per sample, draw labels at random weighted by frequency.

    For each sample, ``n_labels`` distinct target columns are drawn without
    replacement via ``np.random.choice``, with probability proportional to each
    column's total in the module-level ``targets`` frame.

    Parameters
    ----------
    test_samp_size : int
        Number of prediction rows to generate.
    n_labels : int, optional
        Labels switched on per row. Defaults to 5, the previously hard-coded value.

    Returns
    -------
    list of list of int
        ``test_samp_size`` rows, each a 0/1 list aligned with ``targets.columns``.
    """
    # Explicit per-column sums: np.sum(DataFrame) relies on a deprecated reduction.
    # These are loop-invariant, so they are computed once rather than per sample.
    label_counts = targets.sum(axis=0)
    label_names = list(label_counts.index)
    label_probs = list((label_counts / label_counts.sum()).values)
    column_order = list(targets.columns)

    preds = []
    for _ in range(test_samp_size):
        randpred = list(np.random.choice(a=label_names, size=n_labels, replace=False, p=label_probs))
        preds.append([1 if target in randpred else 0 for target in column_order])
    return preds

In [None]:
def null_top_5(test_samp_size):
    """Null multi-label model: always predict the five most frequent target labels.

    Frequency is the column total in the module-level ``targets`` frame.

    Parameters
    ----------
    test_samp_size : int
        Number of prediction rows to generate.

    Returns
    -------
    list of list of int
        ``test_samp_size`` identical rows, each a 0/1 list aligned with
        ``targets.columns``. Rows are independent list objects.
    """
    # Explicit per-column sums: np.sum(DataFrame) relies on a deprecated reduction.
    top5 = set(targets.sum(axis=0).sort_values(ascending=False).head(5).index)
    row = [1 if target in top5 else 0 for target in targets.columns]
    # Copy per sample; `[row] * n` would alias one list object across all rows.
    return [list(row) for _ in range(test_samp_size)]

In [None]:
def null_model(model, targets_test):
    """Score one of the multi-label null (baseline) models against true labels.

    Parameters
    ----------
    model : str
        ``'null_rand_draw'`` (frequency-weighted random labels) or ``'null_top_5'``
        (always the five most frequent labels).
    targets_test : pandas.DataFrame
        True 0/1 label matrix, one row per sample and one column per label.

    Returns
    -------
    tuple
        ``(subset_accuracy, coverage_error_score, avg_num_correct_labels)``:
        percentage of rows predicted exactly, sklearn's ``coverage_error`` with the
        0/1 predictions used as scores, and the mean number of labels per row that
        are 1 in both truth and prediction.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names. Previously this case only
        printed a message and then failed on an unbound local variable.
    """
    if model == 'null_rand_draw':
        targets_test_pred = null_rand_draw(len(targets_test))
    elif model == 'null_top_5':
        targets_test_pred = null_top_5(len(targets_test))
    else:
        raise ValueError('unsupported model, please try again.')

    true_labels = np.asarray(targets_test)
    pred_labels = np.asarray(targets_test_pred)

    # A row counts only if every label matches. The row width comes from targets_test
    # itself rather than the global `targets` frame.
    subset_accuracy = float((true_labels == pred_labels).all(axis=1).mean()) * 100

    coverage_error_score = coverage_error(targets_test, targets_test_pred)

    # truth + prediction == 2 exactly where both are 1 (a correctly predicted label).
    avg_num_correct_labels = ((true_labels + pred_labels) == 2).sum(axis=1).mean()

    return subset_accuracy, coverage_error_score, avg_num_correct_labels

In [None]:
def RF_3x_metrics(feats_train, targets_train, feats_test, targets_test):
    """Fit a default random forest and score it with three multi-label metrics.

    Parameters
    ----------
    feats_train, targets_train
        Training features and 0/1 label indicators used to fit the forest.
    feats_test : array-like
        Features of the evaluation samples.
    targets_test : pandas.DataFrame
        True 0/1 label indicators of the evaluation samples.

    Returns
    -------
    tuple
        ``(subset_accuracy, coverage_error_score, avg_num_correct_labels)``:

        * percentage of samples whose predicted row matches the true row exactly,
        * ``sklearn.metrics.coverage_error`` with the 0/1 predictions passed as scores,
        * mean number of labels per sample that are 1 in both truth and prediction.
    """
    rf = RandomForestClassifier(random_state=10)
    rf.fit(feats_train, targets_train)

    # Predict once and derive every metric from it. rf.score() would run a second
    # full prediction pass only to compute the same exact-match rate.
    targets_test_pred = rf.predict(feats_test)
    y_true = targets_test.values

    subset_accuracy = 100 * np.mean((y_true == targets_test_pred).all(axis=1))

    coverage_error_score = coverage_error(targets_test, targets_test_pred)

    # truth + prediction == 2 exactly where both are 1, i.e. a correctly predicted label.
    avg_num_correct_labels = np.mean(((y_true + targets_test_pred) == 2).sum(axis=1))

    return subset_accuracy, coverage_error_score, avg_num_correct_labels


In [None]:
# 10-fold cross-validation on the training split, comparing the two null models
# with the default random forest. Each fold contributes one column of scores
# (3 measures per model, in the order of `models`).
kf = KFold(n_splits=10)

models = ['model_0', 'model_00', 'model_1']
fold_results = {}

for fold_num, (cvtrain_index, cvtest_index) in enumerate(kf.split(feats_train), start=1):
    fold_scores = []
    print("fold " + str(fold_num))
    feats_cvtrain, feats_cvtest = feats_train.iloc[cvtrain_index], feats_train.iloc[cvtest_index]
    targets_cvtrain, targets_cvtest = targets_train.iloc[cvtrain_index], targets_train.iloc[cvtest_index]

    print('size of test set: ' + str(len(feats_cvtest)))

    print('-')
    print('NULL MODEL - SAMPLE-WEIGHTED RANDOM')
    fold_scores.extend(list(null_model('null_rand_draw', targets_cvtest)))

    print('-')
    print('NULL MODEL - SELECT TOP 5 TARGETS')
    fold_scores.extend(list(null_model('null_top_5', targets_cvtest)))

    print('-')
    print('RANDOM FOREST - DEFAULT')
    fold_scores.extend(list(RF_3x_metrics(feats_cvtrain, targets_cvtrain, feats_cvtest, targets_cvtest)))

    fold_results['fold_' + str(fold_num)] = fold_scores
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

cv_results = pd.DataFrame(fold_results)

# Summarise across the numeric fold columns only. Taking the snapshot before the
# string columns are added avoids reducing over a transposed object-dtype frame.
fold_cols = cv_results.filter(like='fold')

cv_results['models'] = [m for mods in models for m in 3*[mods]]
cv_results['measure'] = ['subset_accuracy', 'coverage_error', 'avg_num_correct_labels'] * len(models)
cv_results['mean'] = fold_cols.mean(axis=1).round(2)
cv_results['std'] = fold_cols.std(axis=1).round(2)

In [None]:
# Per-fold scores plus their mean and std, one row per (model, measure) pair.
cv_results

In [None]:
# One row per model, one column per measure. Each (model, measure) pair occurs
# once in cv_results, so the aggregation just carries the mean through. The
# string 'sum' is used because recent pandas versions warn when np.sum is passed
# as the aggregation callable.
pd.pivot_table(cv_results, values='mean', index='models',
               columns='measure', aggfunc='sum')

The random forest has 19 parameters! Luckily, we do not need to optimize some of them:

- `'n_jobs'`: `None` or `-1` (all cores); only controls running in parallel
- `'oob_score': False`; the out-of-bag score is a measure of generalization accuracy
- `'random_state': 10`
- `'verbose': 0`
- `'warm_start': False`
- `'min_weight_fraction_leaf': 0.0`
- `'bootstrap': True`; use bootstrap sampling to select which samples are used in each tree




In [None]:
# List the parameters of the default forest configuration (random_state=10, as
# used by RF_3x_metrics). Building the estimator here keeps the cell from
# depending on a global `rf` left over from another cell.
RandomForestClassifier(random_state=10).get_params()

In [None]:
def RF_3x_metrics_hps(feats_train, targets_train, feats_test, targets_test, hyperparameters):
    """Fit a random forest with the given hyperparameters and score it.

    Parameters
    ----------
    feats_train, targets_train
        Training features and 0/1 label indicators used to fit the forest.
    feats_test : array-like
        Features of the evaluation samples.
    targets_test : pandas.DataFrame
        True 0/1 label indicators of the evaluation samples.
    hyperparameters : dict
        Keyword arguments passed unchanged to ``RandomForestClassifier``.

    Returns
    -------
    tuple
        ``(subset_accuracy, coverage_error_score, avg_num_correct_labels)``:

        * percentage of samples whose predicted row matches the true row exactly,
        * ``sklearn.metrics.coverage_error`` with the 0/1 predictions passed as scores,
        * mean number of labels per sample that are 1 in both truth and prediction.
    """
    rf = RandomForestClassifier(**hyperparameters)
    rf.fit(feats_train, targets_train)

    # Predict once and derive every metric from it. rf.score() would run a second
    # full prediction pass only to compute the same exact-match rate.
    targets_test_pred = rf.predict(feats_test)
    y_true = targets_test.values

    subset_accuracy = 100 * np.mean((y_true == targets_test_pred).all(axis=1))

    coverage_error_score = coverage_error(targets_test, targets_test_pred)

    # truth + prediction == 2 exactly where both are 1, i.e. a correctly predicted label.
    avg_num_correct_labels = np.mean(((y_true + targets_test_pred) == 2).sum(axis=1))

    return subset_accuracy, coverage_error_score, avg_num_correct_labels

In [None]:
# Earlier, coarser search grid, kept for reference (not executed):
# bootstrap_vals = [True]
# ccp_alpha_vals = [0.0, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
# class_weight_vals = [None, 'balanced', 'balanced_subsample']
# criterion_vals = ['gini', 'entropy']
# max_depth_vals = [None, 1, 2, 3, 5, 8]
# max_features_vals = [3, 5, 8, 12]
# max_leaf_nodes_vals = [None, 25, 50, 100, 200, 500, 1000]
# max_samples_vals = [None]
# min_impurity_decrease_vals = [0.0]
# min_impurity_split_vals = [None]
# min_samples_leaf_vals = [1, 3, 5]
# min_samples_split_vals = [2, 6, 10, 20]
# min_weight_fraction_leaf_vals = [0.0]
# n_estimators_vals = [50, 100, 500, 1000, 5000]
# n_jobs_vals = [6]
# oob_score_vals = [False]
# random_state_vals = [10]
# verbose_vals = [0]
# warm_start_vals = [False]

# Current grid: only ccp_alpha is varied, every other hyperparameter has one value.
bootstrap_vals = [True]
ccp_alpha_vals = [0.0005, 0.0007, 0.001, 0.0015, 0.002, 20, 25, 30]
class_weight_vals = [None]
criterion_vals = ['entropy']
max_depth_vals = [None]
max_features_vals = ['auto']
max_leaf_nodes_vals = [None]
max_samples_vals = [None]
min_impurity_decrease_vals = [0.0]
min_impurity_split_vals = [None]
min_samples_leaf_vals = [1]
min_samples_split_vals = [2]
min_weight_fraction_leaf_vals = [0.0]
n_estimators_vals = [100]
n_jobs_vals = [6]
oob_score_vals = [False]
random_state_vals = [10]
verbose_vals = [0]
warm_start_vals = [False]

# Candidate values per hyperparameter, listed in alphabetical order of the
# hyperparameter names. The grid-search cell pairs these lists positionally with
# the keys of RandomForestClassifier().get_params(), so the order matters.
# No backslashes are needed inside brackets, and none may follow the closing one.
hyperparameters_vals = [
    bootstrap_vals, ccp_alpha_vals, class_weight_vals,
    criterion_vals, max_depth_vals, max_features_vals,
    max_leaf_nodes_vals, max_samples_vals, min_impurity_decrease_vals,
    min_impurity_split_vals, min_samples_leaf_vals, min_samples_split_vals,
    min_weight_fraction_leaf_vals, n_estimators_vals, n_jobs_vals,
    oob_score_vals, random_state_vals, verbose_vals, warm_start_vals,
]


In [None]:
# Number of hyperparameter combinations in the grid. The product of the list
# lengths equals the size of the Cartesian product without having to build it.
int(np.prod([len(vals) for vals in hyperparameters_vals]))


In [None]:
# 10-fold cross-validation comparing the two null models, the default random
# forest (model_1) and one random forest per hyperparameter combination
# (model_2, model_3, ...). Each fold contributes one column of scores
# (3 measures per model, in the order of `models`).
kf = KFold(n_splits=10)

# `rf` is only used to look up the hyperparameter names. get_params() returns
# them sorted by name, which is the order hyperparameters_vals has to follow.
rf = RandomForestClassifier()
hps = list(rf.get_params().keys())
if len(hps) != len(hyperparameters_vals):
    # zip() below would otherwise silently drop or mis-assign values.
    raise ValueError(
        'hyperparameters_vals has ' + str(len(hyperparameters_vals)) +
        ' entries but RandomForestClassifier has ' + str(len(hps)) + ' parameters'
    )

# The grid is the same for every fold, so enumerate it once. Numbering starts at
# 2 because model_1 is the default random forest.
model_hp_record = [
    ('model_' + str(hp_search_mod_num), model_val_set)
    for hp_search_mod_num, model_val_set in enumerate(itertools.product(*hyperparameters_vals), start=2)
]

models = ['model_0', 'model_00', 'model_1'] + [model_name for model_name, _ in model_hp_record]
fold_results = {}

for fold_num, (cvtrain_index, cvtest_index) in enumerate(kf.split(feats_train), start=1):
    fold_scores = []
    print("fold " + str(fold_num))
    feats_cvtrain, feats_cvtest = feats_train.iloc[cvtrain_index], feats_train.iloc[cvtest_index]
    targets_cvtrain, targets_cvtest = targets_train.iloc[cvtrain_index], targets_train.iloc[cvtest_index]

    print('size of test set: ' + str(len(feats_cvtest)))

    print('-')
    print('NULL MODEL - SAMPLE-WEIGHTED RANDOM')
    fold_scores.extend(list(null_model('null_rand_draw', targets_cvtest)))

    print('-')
    print('NULL MODEL - SELECT TOP 5 TARGETS')
    fold_scores.extend(list(null_model('null_top_5', targets_cvtest)))

    print('-')
    print('RANDOM FOREST - DEFAULT')
    fold_scores.extend(list(RF_3x_metrics(feats_cvtrain, targets_cvtrain, feats_cvtest, targets_cvtest)))

    for model_name, model_val_set in model_hp_record:
        hyperparams = dict(zip(hps, model_val_set))
        print('-')
        print('RANDOM FOREST - ' + model_name)
        fold_scores.extend(list(RF_3x_metrics_hps(feats_cvtrain, targets_cvtrain, feats_cvtest, targets_cvtest, hyperparams)))

    fold_results['fold_' + str(fold_num)] = fold_scores
    print('~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~')

cv_results = pd.DataFrame(fold_results)

# Summarise across the numeric fold columns only. Taking the snapshot before the
# string columns are added avoids reducing over a transposed object-dtype frame.
fold_cols = cv_results.filter(like='fold')

cv_results['models'] = [m for mods in models for m in 3*[mods]]
cv_results['measure'] = ['subset_accuracy', 'coverage_error', 'avg_num_correct_labels'] * len(models)
cv_results['mean'] = fold_cols.mean(axis=1).round(2)
cv_results['std'] = fold_cols.std(axis=1).round(2)

In [None]:
# Per-fold scores plus their mean and std, one row per (model, measure) pair.
cv_results

In [None]:
# One row per model, one column per measure, best average number of correct
# labels first. The string 'sum' is used because recent pandas versions warn
# when np.sum is passed as the aggregation callable.
pd.pivot_table(
    cv_results, values='mean', index='models', columns='measure', aggfunc='sum'
).sort_values(by='avg_num_correct_labels', ascending=False)

In [None]:
# Empty frame; the next cells add one column of hyperparameter values per model.
cv_results_top = pd.DataFrame()

In [None]:
# model_1 is the default forest trained in RF_3x_metrics, i.e.
# RandomForestClassifier(random_state=10). Record that configuration rather than
# the unseeded global `rf`, which exists only to look up parameter names.
cv_results_top['model_1'] = list(RandomForestClassifier(random_state=10).get_params().values())

In [None]:
model_number = 7
print('model_' + str(model_number))

# Grid-search models are numbered from model_2 (model_0 / model_00 / model_1 are
# the null models and the default forest), hence the offset of 2 into the grid.
# Anything below 2 would turn into a negative index and silently pick a
# combination from the end of the grid.
if model_number < 2:
    raise ValueError('model_number must be >= 2 to refer to a grid-search model')

hp_combinations = list(itertools.product(*hyperparameters_vals))
cv_results_top['model_' + str(model_number)] = hp_combinations[model_number - 2]
# dict(zip(hps, hp_combinations[model_number - 2])) gives the same values keyed by name.


In [None]:
# Label the rows with the hyperparameter names (`hps` is set in the grid-search cell).
cv_results_top.index = hps

In [None]:
# Hyperparameter values of the selected models, one column per model.
cv_results_top

In [None]:
# (model name, hyperparameter value tuple) pairs recorded by the grid-search cell.
model_hp_record

In [None]:
# Full cross-validation table again, for reference next to the hyperparameter record.
cv_results