# Libraries and functions

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from colored import fore, back, style, fg, bg, attr

pd.set_option("display.max_rows",    20000)
pd.set_option("display.max_columns", 20000)
import warnings
warnings.filterwarnings('ignore')

In [2]:
file_name3 = 'SS046'
file_name   = file_name3 # ref file SSFeb07c.ipynb

In [3]:
demarc = '='*127
seed = 42
def printd(string):
    '''
    print the passed string with demarcation lines and slight coloration, above and below,
    to improve the presentation and look of the output.
    '''
    blanks = len(demarc) - len(string)
    color = fg('#02314a') + bg('#edffe3') + attr('bold')
    res = attr('reset')
    print (color +
       (demarc+'\n' + string + ' '*blanks + '\n'+demarc)
       + res)
############################################   END OF FUNCTION   ##################################
printd('"printd" ready')

"printd" ready                                                                                                                 


In [4]:
def feature_generator(df): # ABBREVIATED VERSION
    '''
    Feature generator + - /    
    '''
    col_list = list(df.columns)
    n_pre = len(col_list)
    
    # Log and 1/log
    for p in col_list:
        if df[p].isin([0]).sum(axis=0) == 0:
            try:
                feature_name        = 'log('     +   p  + ')'  # log of the column
                df[feature_name]    = np.log(df[p])
            except:
                pass
        
    # Sqrt and 1/sqrt
    for p in col_list:
        try:
            feature_name            = 'sqrt('     +   p  + ')'  # sqrt of the column
            df[feature_name]        = np.sqrt(df[p])
        except:
            pass
     
    # Additive, Divisive
    col_list = list(df.columns)
    for p in col_list:
        idx = col_list.index(p)
        if df[p].isin([0]).sum(axis=0) == 0:
            for q in range(idx+1, len(col_list)):
                if df[col_list[q]].isin([0]).sum(axis=0) == 0:           
                    feature_name_1      = p            +    '+'   +  col_list[q] # plus
                    feature_name_2      = p            +    '-'   +  col_list[q] # plus
                    feature_name_3      = col_list[q]  +    '*'   +  p           # minus 2
                    feature_name_4      = p            +    '/'   +  col_list[q] # divide 1

                    df[feature_name_1]  = df[p]             +        df[col_list[q]]
                    df[feature_name_2]  = df[p]             -        df[col_list[q]]
                    df[feature_name_3]  = df[col_list[q]]   *        df[p]           
                    df[feature_name_4]  = df[p]             /        df[col_list[q]]
    
    df = df.dropna(axis=1)
    
    print('Number of columns before feature generation: ',n_pre)
    print('Number of columns after  feature generation: ',df.shape[1])
    return df
######################################################################################################
printd('"feature_generator" ready')

"feature_generator" ready                                                                                                      


In [5]:
def PolynomialFeatures_labeled(input_df,power):
    '''
    Inputs:
    input_df = Your labeled pandas dataframe (list of x's not raised to any power) 
    power = what order polynomial you want variables up to. (use the same power as you want entered into pp.PolynomialFeatures(power) directly)

    Ouput:
    Output: This function relies on the powers_ matrix which is one of the preprocessing function's outputs to create logical labels and 
    outputs a labeled pandas dataframe   
    '''
    from sklearn.preprocessing import PolynomialFeatures
    poly = PolynomialFeatures(power)
    output_nparray = poly.fit_transform(input_df)
    powers_nparray = poly.powers_

    input_feature_names = list(input_df.columns)
    target_feature_names = ["Constant Term"] # keep for now
    for feature_distillation in powers_nparray[1:]:
        intermediary_label = ""
        final_label = ""
        for i in range(len(input_feature_names)):
            if feature_distillation[i] == 0:
                continue
            else:
                variable = input_feature_names[i]
                power = feature_distillation[i]
                if power>1:
                    intermediary_label = "%s^%d" % (variable,power)
                if power==1:
                    intermediary_label = "%s" % (variable)    
                if final_label == "":
                    final_label = intermediary_label
                else:
                    final_label = final_label + " x " + intermediary_label
        target_feature_names.append(final_label)
    output_df = pd.DataFrame(output_nparray, columns = target_feature_names)
    output_df = output_df.drop(["Constant Term"], axis = 1)
    return output_df

printd('"PolynomialFeatures_labeled" ready')

"PolynomialFeatures_labeled" ready                                                                                             


In [7]:
def secret_sauce(preds):
    '''
    find fuzzy matches and update the predictions
    '''
    
    temp = preds.copy()
    submission = pd.DataFrame(temp) 
    train = pd.read_csv("Train.csv").drop_duplicates()
    hack_data = pd.read_csv("Test.csv")
    
    train['time']     = train['time']/1000
    hack_data['time'] = hack_data['time']/1000

    indexed_train = train.reset_index().round(0)
    indexed_hack = hack_data.reset_index().round(0)
    merge_on = list(hack_data.columns)
    common = pd.merge(indexed_train,indexed_hack, how='inner', on = merge_on)
    print(f'Found {len(common)} fuzzy matches. Applying corrections now.')

    my_df = common.drop_duplicates()[['index_x', 'index_y', 'popularity']]
    my_df
    my_dict = dict()

    for idx in range(my_df.shape[0]):
        key = my_df.iloc[idx,1]
        value = [my_df.iloc[idx,0], my_df.iloc[idx,2]]
        if value[1]>2:
            value[1] = value[1]-1
        my_dict[key] = value
    
    for i in range(len(submission)):
        try:
            hot = my_dict[i][1]
            submission.iloc[i,0] = 0
            submission.iloc[i,1] = 0
            submission.iloc[i,2] = 0
            submission.iloc[i,3] = 0
            submission.iloc[i,4] = 0
            submission.iloc[i,hot] = 1
        except:
            pass
    return submission
printd('"secret_sauce" ready')

"secret_sauce" ready                                                                                                           


In [8]:
def prediction_merger(pred_list):
    '''
    pass a list of predictions, get a merged one back.
    '''
    total_files = len(pred_list)  
    combo_df = pd.DataFrame()
    for i, p in enumerate(pred_list,1):
        data = p.copy()
        for c in [0,1,2,3,4]:
            combo_df['df' + str(i)+ '_class_'+str(c)] = data[c]
    del data
    printd(str(total_files)+' predictions merged and normalized.')
    #################################
    cols = [0,1,2,3,4]
    merged = pd.DataFrame(columns = cols)
    for merged_col in cols:
        col_names  = []
        for file in range(total_files):
            col_number = (file*5 + merged_col)
            col_names.append(combo_df.columns[col_number])
        merged[merged_col] = combo_df[col_names].median(axis = 1)
    normalized = merged.div(merged.sum(axis=1), axis=0)   
    return normalized
printd('"prediction_merger" ready')

"prediction_merger" ready                                                                                                      


In [9]:
def one_hot_encoder(X, hack, basic = True, dropcol = False):
    '''
    perform one hot encoding using either basic or adjusted schemes.
    inputs feature sets for training and test data.
    outputs encoded train and test.
    '''
    basic_replace_dict    = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E',
                             5: 'F', 6: 'G', 7: 'H', 8: 'I', 9: 'J',
                            10:'K', 11:'L'}
    adjusted_replace_dict = {0: 'A',
                             1: 'Z',
                             2: 'B',
                             3: 'C',
                             4: 'Z',
                             5: 'E',
                             6: 'F',
                             7: 'Z',
                             8: 'Z',
                             9: 'J',
                             10:'Z',
                             11:'Z'}
   
    nature     = 'Adjusted'
    rd         = adjusted_replace_dict
    if basic:
        rd     = basic_replace_dict
        nature = 'Basic'
        
    # THE DATA
    X_rep                  = X.copy()
    hack_rep               = hack.copy()
        
    # THE DROP   
    if dropcol:
        X_rep              = X_rep.drop('Category_1', axis = 1)
        hack_rep           = hack_rep.drop('Category_1', axis = 1)
        printd('Column 1 dropped.')
        print('X shape:    ',X_rep.shape)
        print('hack shape: ',hack_rep.shape)
        return X_rep, hack_rep 
        
    # THE ENCODING
    oneHotCols             = ['Category_1']
    X_rep['Category_1']    = X_rep['Category_1'].replace(rd)
    hack_rep['Category_1'] = hack_rep['Category_1'].replace(rd)
    X_onehot               =pd.get_dummies(X_rep, columns=oneHotCols)
    hack_onehot            =pd.get_dummies(hack_rep, columns=oneHotCols)
    printd(nature+' one hot encoding completed.')
    print('X_onehot shape:    ',X_onehot.shape)
    print('hack_onehot shape: ',hack_onehot.shape)
    
          
    if np.sum(X_onehot.columns != hack_onehot.columns) == 0:
        print('All columns match')
    else:
        print('Some mismatch, please investigate')
    return X_onehot, hack_onehot
printd('"one_hot_encoder" ready.')

"one_hot_encoder" ready.                                                                                                       


In [10]:
# default model_params for all sets - change per set later
n_est = 5000
m_dep = 36

In [11]:
parameters = dict({ 'criterion':'gini',
                    'min_samples_split':2,
                    'min_samples_leaf':1,
                    'min_weight_fraction_leaf':0.0,
                    'max_features':'auto',
                    'max_leaf_nodes':None,
                    'min_impurity_decrease':0.0,
                    'min_impurity_split':None,
                    'bootstrap':True,
                    'warm_start':False,
                    'class_weight':None,
                    'ccp_alpha':0.0,
                    'max_samples':None })


def model_it_rf(features, target, hack_features, params = parameters):
    '''
    standardized Random Forest Model, fitted on features and target.
    returns predictions dataframe.
    '''
    printd(f'Fitting Random Forest with {n_est} trees, with maximum depth of {m_dep}.')
    model = RandomForestClassifier(n_estimators  = n_est,
                                    max_depth    = m_dep, 
                                    n_jobs       = -1,
                                    random_state = seed,
                                    verbose      =0, 
                                    # params henceforth
                                    min_samples_split   =params['min_samples_split'],
                                    min_samples_leaf=params['min_samples_leaf'],
                                    min_weight_fraction_leaf=params['min_weight_fraction_leaf'],
                                    max_features=params['max_features'],
                                    max_leaf_nodes=params['max_leaf_nodes'],
                                    min_impurity_decrease=params['min_impurity_decrease'],
                                    min_impurity_split=params['min_impurity_split'],
                                    bootstrap=params['bootstrap'],
                                    warm_start=params['warm_start'],
                                    class_weight=params['class_weight'],
                                    ccp_alpha=params['ccp_alpha'],
                                    max_samples=params['max_samples'])    
    model.fit(features, target)
    print('fitted, now predicting')
    preds = model.predict_proba(hack_features)
    print('done')
    del model
    return pd.DataFrame(preds)
printd('"model_it_rf" ready.')

"model_it_rf" ready.                                                                                                           


# Reading and Scaling Data

In [12]:
train  = pd.read_csv('train.csv').drop_duplicates()
hack_initial   = pd.read_csv('test.csv')

In [13]:
X_initial = train.drop(['popularity'], axis = 1).reset_index(drop=True)
y_initial = train['popularity'].reset_index(drop=True)
del train

In [14]:
scaler             = StandardScaler()

cols_to_scale      = list(X_initial.columns)
cols_to_scale.remove('Category_1')
cols_to_scale.remove('Category_2')

X_sc               = scaler.fit_transform(X_initial[cols_to_scale])
X_scaled           = pd.DataFrame(X_sc, columns = cols_to_scale)
X_scaled['Category_1'] = X_initial['Category_1']
X_scaled['Category_2'] = X_initial['Category_2']

hd_                = scaler.transform(hack_initial[cols_to_scale])
hack_scaled        = pd.DataFrame(hd_, columns = cols_to_scale)
hack_scaled['Category_1'] = hack_initial['Category_1']
hack_scaled['Category_2'] = hack_initial['Category_2']

print(np.sum(X_scaled.columns!=hack_scaled.columns), X_scaled.shape)

0 (15285, 11)


# SET 1

**Basic RF with basic one-hot encoding**


In [15]:
n_est = 10000
m_dep = None  
params = parameters.copy()
params['class_weight'] = 'balanced_subsample'

In [16]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = True, dropcol=False)
preds_df_set1a = model_it_rf(X_onehot, y_initial, hack_onehot, params)

Basic one hot encoding completed.                                                                                              
X_onehot shape:     (15285, 22)
hack_onehot shape:  (12140, 22)
All columns match
Fitting Random Forest with 10000 trees, with maximum depth of None.                                                            
fitted, now predicting
done


In [17]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = True, dropcol=True)
preds_df_set1b = model_it_rf(X_onehot, y_initial, hack_onehot, params)

Column 1 dropped.                                                                                                              
X shape:     (15285, 10)
hack shape:  (12140, 10)
Fitting Random Forest with 10000 trees, with maximum depth of None.                                                            
fitted, now predicting
done


# SET 2

**Basic RF with adjusted one-hot encoding**

In [18]:
n_est = 10000
m_dep = None
params = parameters.copy()
params['criterion']    = 'entropy'
params['max_features'] = 6
params['class_weight'] = 'balanced_subsample'

In [19]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = False, dropcol=False)
preds_df_set2a = model_it_rf(X_onehot, y_initial, hack_onehot, params)

Adjusted one hot encoding completed.                                                                                           
X_onehot shape:     (15285, 17)
hack_onehot shape:  (12140, 17)
All columns match
Fitting Random Forest with 10000 trees, with maximum depth of None.                                                            
fitted, now predicting
done


In [20]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = False, dropcol=True)
preds_df_set2b = model_it_rf(X_onehot, y_initial, hack_onehot, params)

Column 1 dropped.                                                                                                              
X shape:     (15285, 10)
hack shape:  (12140, 10)
Fitting Random Forest with 10000 trees, with maximum depth of None.                                                            
fitted, now predicting
done


# SET 3

**Basic RF with basic one-hot encoding with polynomial features**

In [21]:
n_est = 15000
m_dep = None

In [22]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = True, dropcol=False)
X_polys = PolynomialFeatures_labeled(X_onehot,2)
hack_polys  = PolynomialFeatures_labeled(hack_onehot,2)
preds_df_set3a = model_it_rf(X_polys, y_initial, hack_polys, params = parameters)

Basic one hot encoding completed.                                                                                              
X_onehot shape:     (15285, 22)
hack_onehot shape:  (12140, 22)
All columns match
Fitting Random Forest with 15000 trees, with maximum depth of None.                                                            
fitted, now predicting
done


In [23]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = True, dropcol=True)
X_polys = PolynomialFeatures_labeled(X_onehot,2)
hack_polys  = PolynomialFeatures_labeled(hack_onehot,2)
preds_df_set3b = model_it_rf(X_polys, y_initial, hack_polys, params = parameters)

Column 1 dropped.                                                                                                              
X shape:     (15285, 10)
hack shape:  (12140, 10)
Fitting Random Forest with 15000 trees, with maximum depth of None.                                                            
fitted, now predicting
done


# SET 4

**Basic RF with basic one-hot encoding with extra features**

In [24]:
n_est = 10000
m_dep = 36
params = parameters.copy()
params['criterion']    = 'entropy'
params['max_features'] = 5
params['class_weight'] = 'balanced_subsample'

In [25]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = True, dropcol=False)
X_features = feature_generator(X_onehot)
del X_onehot
hack_features = feature_generator(hack_onehot)
del hack_onehot
preds_df_set4a = model_it_rf(X_features, y_initial, hack_features, params)

Basic one hot encoding completed.                                                                                              
X_onehot shape:     (15285, 22)
hack_onehot shape:  (12140, 22)
All columns match
Number of columns before feature generation:  22
Number of columns after  feature generation:  179
Number of columns before feature generation:  22
Number of columns after  feature generation:  179
Fitting Random Forest with 10000 trees, with maximum depth of 36.                                                              
fitted, now predicting
done


In [26]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = True, dropcol=True)
X_features = feature_generator(X_onehot)
del X_onehot
hack_features = feature_generator(hack_onehot)
del hack_onehot
preds_df_set4b = model_it_rf(X_features, y_initial, hack_features, params)

Column 1 dropped.                                                                                                              
X shape:     (15285, 10)
hack shape:  (12140, 10)
Number of columns before feature generation:  10
Number of columns after  feature generation:  155
Number of columns before feature generation:  10
Number of columns after  feature generation:  155
Fitting Random Forest with 10000 trees, with maximum depth of 36.                                                              
fitted, now predicting
done


# SET 5

**Basic RF with basic one-hot encoding with binning**


In [27]:
X_binned                = X_initial.copy()
hack_binned             = hack_initial.copy()

X_binned['time']        = (X_binned['time']/10).round(0)
hack_binned['time']     = (hack_binned['time']/10).round(0)

X_binned['Score_1']     = ((X_binned['Score_1'])*5000).round(0)
hack_binned['Score_1']  = ((hack_binned['Score_1'])*5000).round(0)

X_binned['Score_2']     = ((X_binned['Score_2'])*500).round(0)
hack_binned['Score_2']  = ((hack_binned['Score_2'])*500).round(0)

X_binned['Score_3']     = ((X_binned['Score_3'])*7000).round(0)
hack_binned['Score_3']  = ((hack_binned['Score_3'])*7000).round(0)

X_binned['Score_4']     = ((X_binned['Score_4'])*4).round(0)
hack_binned['Score_4']  = ((hack_binned['Score_4'])*4).round(0)

X_binned['Store_Presence'] = (X_binned['Store_Presence']*50000).round(0)
hack_binned['Store_Presence'] = (hack_binned['Store_Presence']*50000).round(0)

In [28]:
scaler             = StandardScaler()

cols_to_scale      = list(X_initial.columns) # same columns
cols_to_scale.remove('Category_1')
cols_to_scale.remove('Category_2')

X_sc               = scaler.fit_transform(X_binned[cols_to_scale])
X_scaled           = pd.DataFrame(X_sc, columns = cols_to_scale)
X_scaled['Category_1'] = X_initial['Category_1']
X_scaled['Category_2'] = X_initial['Category_2']

hd_                = scaler.transform(hack_binned[cols_to_scale])
hack_scaled        = pd.DataFrame(hd_, columns = cols_to_scale)
hack_scaled['Category_1'] = hack_initial['Category_1']
hack_scaled['Category_2'] = hack_initial['Category_2']

In [29]:
n_est = 10000
m_dep = None
params = parameters.copy()
params['class_weight'] = 'balanced_subsample'

In [30]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = True, dropcol = False)
preds_df_set5a = model_it_rf(X_onehot, y_initial, hack_onehot, params)

Basic one hot encoding completed.                                                                                              
X_onehot shape:     (15285, 22)
hack_onehot shape:  (12140, 22)
All columns match
Fitting Random Forest with 10000 trees, with maximum depth of None.                                                            
fitted, now predicting
done


In [31]:
X_onehot, hack_onehot = one_hot_encoder(X_scaled, hack_scaled, basic = True, dropcol = True)
preds_df_set5b = model_it_rf(X_onehot, y_initial, hack_onehot, params)

Column 1 dropped.                                                                                                              
X shape:     (15285, 10)
hack shape:  (12140, 10)
Fitting Random Forest with 10000 trees, with maximum depth of None.                                                            
fitted, now predicting
done


# MERGING THE PREDICTIONS

In [38]:
prediction_set = [preds_df_set1a, 
                  preds_df_set1b,
                  preds_df_set2a,
                  preds_df_set2b,
                  preds_df_set3a,
                  preds_df_set3b,
                  preds_df_set4a,
                  preds_df_set4b,
                  preds_df_set5a,
                  preds_df_set5b]

merged_preds   = prediction_merger(prediction_set)

display(merged_preds.head())

10 predictions merged and normalized.                                                                                          


Unnamed: 0,0,1,2,3,4
0,0.0,0.028154,0.751813,0.193829,0.026204
1,0.0,0.007129,0.028015,0.957827,0.007029
2,0.0,0.002304,0.028447,0.958331,0.010918
3,0.0,0.003658,0.047605,0.92425,0.024487
4,0.0,0.0,0.001733,0.996283,0.001983


In [39]:
submission = secret_sauce(merged_preds)

Found 3525 fuzzy matches. Applying corrections now.


# READY TO WRITE THE SUBMISSION FILE

In [40]:
printd(f'Ready to write {file_name}.csv')

Ready to write SS046.csv                                                                                                       


In [41]:
submission.to_csv(file_name+'.csv',index=False)
print(file_name, 'generated.')

SS046 generated.


In [44]:
# - END - 