<b>Data mining project - 2020/21</b><br>
<b>Authors</b>: [Alexandra Bradan](https://github.com/alexandrabradan), [Alice Graziani](https://github.com/alicegraziani25) and [Eleonora Cocciu](https://github.com/eleonoracocciu)<br>
<b>Python version</b>: 3.x<br>
<b>Last update: 21/05/2021<b>

In [8]:
# system library
import os
import sys
import json
from tqdm.notebook import tqdm

# useful libraries
import math
import operator
import itertools
import statistics
import collections
from collections import Counter
from collections import OrderedDict

# pandas
import pandas as pd

# numpy
import numpy as np
from numpy import std
from numpy import mean
from numpy import percentile

# visualisarion
import pydotplus
import seaborn as sns
from matplotlib import colors
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from IPython.display import Image

# sklearn
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import classification_report, confusion_matrix

# dimensional reducers
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif  # classification
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression  # regression

# scalers
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MaxAbsScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import KBinsDiscretizer

# performance visualisation 
from sklearn import tree
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall
from scikitplot.metrics import plot_cumulative_gain
from scikitplot.metrics import plot_lift_curve
from sklearn.model_selection import learning_curve
from mlxtend.plotting import plot_decision_regions
from yellowbrick.model_selection import LearningCurve

# tree classifiers
from sklearn.tree import DecisionTreeClassifier

# linear classifiers
from sklearn.linear_model import LassoCV
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression

# neighbors classifiers
from sklearn.neighbors import KNeighborsClassifier

# naive_bayes classifiers
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# ensemble classifiers
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

# svm
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# based_ruled
import Orange
from Orange.data import *
# import wittgenstein as lw

plt.rcParams["patch.force_edgecolor"] = True
%matplotlib inline

from yellowbrick.style import set_palette
set_palette('bold')

<h6> Global parameters </h6>

In [9]:
n_iter = 10
scoring = 'f1_weighted'
random_state = 42

# test_n_splits = 9
test_n_splits = 3

cn2_learner = Orange.classification.rules.CN2Learner() 
model_name = "CN2"  

learning_curve_flag = False
v_or_t_flag = "TST"
cmap = plt.cm.spring_r
color = "fuchsia"

<h6> Datasets loading </h6>

In [10]:
X_train = pd.read_csv('../../data/fma_metadata/X_train_merged.csv', index_col=0)
X_test = pd.read_csv('../../data/fma_metadata/X_test.csv', index_col=0)

y_train = pd.read_csv('../../data/fma_metadata/y_train_merged.csv', index_col=0)
y_test = pd.read_csv('../../data/fma_metadata/y_test.csv', index_col=0)

print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

X_train_vt = pd.read_csv('../../data/fma_metadata/X_train_merged_vt.csv', index_col=0)
X_test_vt = pd.read_csv('../../data/fma_metadata/X_test_vt.csv', index_col=0)

print(X_train_vt.shape, X_test_vt.shape)
print(y_train.shape, y_test.shape)

(92834, 55) (10874, 55)
(92834, 1) (10874, 1)
(92834, 28) (10874, 28)
(92834, 1) (10874, 1)


<h6>Continous, categorical/ordinal column retrieval</h6>

In [11]:
numeric_columns = []  # continous variables
for column_name in X_train.columns:
    if ("track_genre_top" not in column_name) and  \
          ("track_date_created_year" not in column_name) and \
            ("track_date_created_season" not in column_name):
                numeric_columns.append(column_name)
print("numeric_columns", len(numeric_columns))

numeric_columns 37


In [12]:
categoric_columns = []  # ordinal or categorical variables
for column_name in X_train.columns:
    if ("track_genre_top" in column_name) or  \
          ("track_date_created_year" in column_name) or \
            ("track_date_created_season" in column_name):
                categoric_columns.append(column_name)
print("categoric_columns", len(categoric_columns))

categoric_columns 18


In [13]:
numeric_columns_vt = list(set(numeric_columns).intersection(set(X_train_vt.columns)))
print("numeric_columns_vt", len(numeric_columns_vt))

numeric_columns_vt 15


In [14]:
categoric_columns_vt = list(set(categoric_columns).intersection(set(X_train_vt.columns)))
print("categoric_columns_vt", len(categoric_columns_vt))

categoric_columns_vt 13


<h6> Discretize training and test</h6>

In [15]:
X_tr = X_train.copy()
y_tr = y_train.copy()
X_ts = X_test.copy()
y_ts = y_test.copy()

X_tr_vt = X_train_vt.copy()
X_ts_vt = X_test_vt.copy()

print(X_tr.shape, X_ts.shape, X_tr_vt.shape, X_ts_vt.shape)
print(y_tr.shape, y_ts.shape)

(92834, 55) (10874, 55) (92834, 28) (10874, 28)
(92834, 1) (10874, 1)


In [16]:
# reunite together track_genre_top and make it ordinal
"""genres_columns = [x for x in X_train.columns if "genre_top" in x]

i = 1
genres_map = {}
for column_name in genres_columns:
    genres_map[column_name] = i
    i += 1
    
ordinal_genre_top_column_tr = []
for row_idx in X_train.index:
    for column_name in genres_columns:
        if X_train.at[row_idx, column_name] == 1:
            ordinal_genre_top_column_tr.append(genres_map[column_name])
            
ordinal_genre_top_column_ts = []
for row_idx in X_test.index:
    for column_name in genres_columns:
        if X_test.at[row_idx, column_name] == 1:
            ordinal_genre_top_column_ts.append(genres_map[column_name])
            
if ((len(ordinal_genre_top_column_tr) == X_train.shape[0]) == False) or \
        ((len(ordinal_genre_top_column_ts) == X_test.shape[0]) == False):
    print(len(ordinal_genre_top_column_tr), X_train.shape[0])
    print(len(ordinal_genre_top_column_ts), X_test.shape[0])
    sys.exit(-1)
    
# removing onehotencoded columns and inserting new, ordinal one
genres_columns_indeces = []
for column_name in genres_columns:
    idx = X_train.columns.get_loc(column_name)
    genres_columns_indeces.append(idx)
    ()
for column_name in genres_columns:
    del X_tr[column_name]
    del X_ts[column_name]
X_tr.insert(genres_columns_indeces[0], 'track_genre_top', ordinal_genre_top_column_tr)
X_ts.insert(genres_columns_indeces[0], 'track_genre_top', ordinal_genre_top_column_ts)

print(X_tr.shape, X_ts.shape)
print(y_tr.shape, y_ts.shape)"""

'genres_columns = [x for x in X_train.columns if "genre_top" in x]\n\ni = 1\ngenres_map = {}\nfor column_name in genres_columns:\n    genres_map[column_name] = i\n    i += 1\n    \nordinal_genre_top_column_tr = []\nfor row_idx in X_train.index:\n    for column_name in genres_columns:\n        if X_train.at[row_idx, column_name] == 1:\n            ordinal_genre_top_column_tr.append(genres_map[column_name])\n            \nordinal_genre_top_column_ts = []\nfor row_idx in X_test.index:\n    for column_name in genres_columns:\n        if X_test.at[row_idx, column_name] == 1:\n            ordinal_genre_top_column_ts.append(genres_map[column_name])\n            \nif ((len(ordinal_genre_top_column_tr) == X_train.shape[0]) == False) or         ((len(ordinal_genre_top_column_ts) == X_test.shape[0]) == False):\n    print(len(ordinal_genre_top_column_tr), X_train.shape[0])\n    print(len(ordinal_genre_top_column_ts), X_test.shape[0])\n    sys.exit(-1)\n    \n# removing onehotencoded columns and in

In [17]:
# Place the DataFrames side by side
X_tr = pd.concat([X_tr, y_tr], axis=1)
X_ts = pd.concat([X_ts, y_ts], axis=1)
print(X_tr.shape, X_ts.shape)

(92834, 56) (10874, 56)


In [18]:
def print_performed_encoding(column_name, train_encoded):
    encoding_info = {}
    for enc, i in zip(train_encoded, X_tr.index):
        try:
            tmp_list = encoding_info[str(enc)]
            tmp_list.append(X_tr.at[i, column_name])
            encoding_info[str(enc)] = tmp_list
        except KeyError:
            encoding_info[str(enc)] = [X_tr.at[i, column_name]]
            
    for key, value in encoding_info.items():
        min_value = min(value)
        max_value = max(value)
        print(column_name, key, "[%s-%s]" %(min_value, max_value),sep="\t")

In [19]:
# uniform 10 bins discretisation
best_k = 10
for column_name in numeric_columns:
    discretizer = KBinsDiscretizer(n_bins=best_k, encode='ordinal', strategy='uniform')
    train_encoded = discretizer.fit_transform(X_train[[column_name]]).astype(int)
    test_encoded = discretizer.transform(X_test[[column_name]]).astype(int)
    X_tr[column_name] = train_encoded
    X_ts[column_name] = test_encoded
    # print_performed_encoding(column_name, train_encoded)
    
# check
for column_name in numeric_columns:
    if len(X_tr[column_name].unique()) > best_k or len(X_ts[column_name].unique()) > best_k:
        print(column_name)

<h6>Load Orange table</h6>

In [20]:
domain_list = []
class_vars = None
for column_name in X_tr.columns:
    values = []
    for elem in X_tr[column_name].unique():
        values.append(column_name + "=" + str(elem))
    var_instance = Orange.data.DiscreteVariable(name=column_name, values=tuple(values))
    if column_name == "album_type":
        class_vars = var_instance
    else:
         domain_list.append(var_instance)
domain = Orange.data.Domain(domain_list, class_vars=class_vars)
X_tr_table = Orange.data.Table(domain, X_tr.values)
X_ts_table = Orange.data.Table(domain, X_ts.values)

<h2> CN2Learner </h2>

Induction of rules works by finding a rule that covers some learning instances, removing these instances, and repeating this until all instances are covered. Rules are scored by heuristics such as impurity of class distribution of covered instances. The module includes common rule-learning algorithms, and allows for replacing rule search strategies, scoring and other components.

**Default parameters**:


**Tuned parameters**:


In [21]:
# PLOT FUNCTIONS
def plot_roc_curve(y_ts, y_prob):
    plt.figure(figsize=(8, 5))
    plot_roc(y_ts, y_prob)
    plt.title("%s\'s %s ROC curve" % (model_name, v_or_t_flag.upper()))
    plt.show()
    
def plot_precision_recall_curve(y_ts, y_prob):
    plt.figure(figsize=(8, 5))
    plot_precision_recall(y_ts, y_prob)
    plt.title("%s\'s %s Precision-Recall curve" % (model_name, v_or_t_flag.upper()))
    plt.show()
    
def plot_cumulative_gain_curve(y_ts, y_prob):
    plt.figure(figsize=(8, 5))
    plot_cumulative_gain(y_ts, y_prob)
    plt.title("%s\'s %s Cumulative Gains curve" % (model_name, v_or_t_flag.upper()))
    plt.show()
    
def plot_lift_curve_curve(y_ts, y_prob):
    plt.figure(figsize=(8, 5))
    plot_lift_curve(y_ts, y_prob)
    plt.title("%s\'s %s Lift curve" % (model_name, v_or_t_flag.upper()))
    plt.show()
    
def plot_confusion_matrix(cm, classes, normalize):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title("%s\'s %s classification report" % (model_name, v_or_t_flag.upper()))
    # plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)
    
    plt.grid(False)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
    
def plot_classification_report(y_ts, y_pred):
    model_report = classification_report(y_ts, 
                                       y_pred,
                                       # target_names=['Studio Recording', 'Live Recording'],
                                       output_dict=True)
    model_r = pd.DataFrame(model_report).iloc[:-1, :].T
    sns.heatmap(model_r, annot=True, cmap=cmap, cbar=False)
    plt.title("%s\'s %s confusion matrix" % (model_name, v_or_t_flag.upper()))
    plt.show()
    
def plot_decision_boundary(X_tr, y_tr, scaler, model):
    try:
        pca = PCA(n_components=2)
        best_visualisation_scaler = StandardScaler()
        scaled_X_tr = best_visualisation_scaler.fit_transform(X_tr)
        X = pca.fit_transform(scaled_X_tr)
        # X = pca.fit_transform(X_tr)
        y = y_tr.values.ravel()

        model.fit(X, y)
        plt.figure(figsize=(8, 5))
        fig = plot_decision_regions(X=X, y=y, clf=model, legend=2)
        plt.xlabel("PCA component 1")
        plt.ylabel("PCA component 2")
        plt.title("%s's %s decision boundary" % (model_name, v_or_t_flag))
        plt.legend(loc='best')
        plt.grid(False)
        plt.show()
    except ValueError:
        return

def spot_errors(test_label, test_pred):  
    spot_errors = []
    label_errors = []
    for i in range(len(test_label)):
        if test_label[i] != test_pred[i]:
            spot_errors.append('darkred')
            label_errors.append("wrong prediction")
        else:
            spot_errors.append('darkgray')
            label_errors.append("correct prediction")
    return spot_errors, label_errors

def classification_visualizer(test_set, test_label, test_pred):
    test_label = test_label.values
    
    f, axs = plt.subplots(nrows=1, ncols=3, figsize=(24,8))
    errors, label_errors = spot_errors(test_label, test_pred)
    labels = [test_label, test_pred, errors]
    titles = ['True Labels', 'Predicted Labels', 'Misclassifications']
    
    for i in range(0, 3):
        axs[i].scatter(test_set[missclassif_column_name1], test_set[missclassif_column_name2], c=labels[i], cmap='cividis')
        axs[i].set_title(titles[i])
        axs[i].set_xlabel(missclassif_column_name1, fontdict={'fontsize': 'large'})
        axs[i].set_ylabel(missclassif_column_name2, fontdict={'fontsize': 'large'})
        
    plt.suptitle('Visualization of the ' + model_name + ' classifier on the %s' % v_or_t_flag)
    plt.show()
    

def error_visualizer(test_set, test_label, test_pred, column_name1, column_name2):
    test_label = test_label.values
    errors, label_errors = spot_errors(test_label, test_pred)
    
    palette = ['darkgray', 'darkred']
    if errors[0] == 'darkred':
        palette = ['darkred', 'darkgray']
    
    fig = plt.figure(figsize=(5, 6))
    sns.scatterplot(x=test_set[column_name1], y=test_set[column_name2], hue=label_errors, palette=palette)
    plt.title('%s\'s %s misclassifications' % (model_name, v_or_t_flag))
    plt.xlabel(column_name1)
    plt.ylabel(column_name2)

    plt.legend()
    plt.show()
    
def plot_learning_curve(X_tr, y_tr, model, v_or_t_flag):
    second_score = ""
    if v_or_t_flag == 'VAL':
        cv = StratifiedKFold(n_splits=validation_n_splits)
        second_score = "Validation score"
    else:
        cv = StratifiedKFold(n_splits=test_n_splits)
        second_score = "Test score"
    sizes = np.linspace(0.3, 1.0, 10)
    plt.figure(figsize=(8, 5))
    visualizer = LearningCurve(model, cv=cv, scoring=scoring, train_sizes=sizes, 
                                                                        random_state=random_state)

    visualizer.fit(X_tr, y_tr.values.ravel())    
    # visualizer.ax.get_lines()[1].set_label(second_score)
    visualizer.show() 

In [22]:
def get_best_features_grid_cv(X_tr, y_tr, results, key):
    
    if key == 'anova':
        best_k = results.best_params_['anova__k']
        select_k_best = SelectKBest(score_func=f_classif, k=best_k)
        fit = select_k_best.fit(X_tr, y_tr.values.ravel())
        df_scores = pd.DataFrame(fit.scores_)
        df_columns = pd.DataFrame(X_tr.columns)
    elif key == 'rfe':
        best_k = results.best_params_['rfe__n_features_to_select']
        estimator = results.best_params_['rfe__estimator']
        select_rfe = RFE(estimator=estimator, n_features_to_select=best_k)  # best_k=estimator
        fit = select_rfe.fit(X_tr, y_tr.values.ravel())
        df_scores = pd.DataFrame(fit.ranking_)
    else:
        print("wrong key=%s" % key)
        sys.exit(-1)
  
    df_columns = pd.DataFrame(X_tr.columns)
    feature_scores = pd.concat([df_columns, df_scores],axis=1) # concatenate dataframes
    feature_scores.columns = ['features','scores']  # name output columns
    feature_scores = feature_scores[feature_scores['scores'] != 0]  # keeping only non-zero scoring features

    # plot feature importance
    fig = plt.figure(figsize=(8, 10))
    # keeping ongly best_k features, ordered in descending score
    ordered_k_feature_scores = feature_scores.sort_values('scores', ascending=False).iloc[:best_k]
    sns.barplot(y='features', x='scores', data=feature_scores, color=color,
                                                            order=ordered_k_feature_scores.features)
    plt.grid(False)
    plt.title("%s's %s feature importance using %s" % (model_name, v_or_t_flag,  key.upper()))
    plt.show()

    # retrieve best features 
    # best_features = [column[0] for column in zip(X_tr.columns, select_k_best.get_support()) if column[1]]
    best_features = list(ordered_k_feature_scores.features)
    best_features_scores = list(ordered_k_feature_scores.scores)

    return best_features, best_features_scores

In [23]:
def get_feature_importances_or_coef(X_tr, y_tr, tuned_model):

    try:
        df_scores = pd.DataFrame(tuned_model.feature_importances_)
        best_k = tuned_model.n_features_
    except:
        try:
            df_scores = pd.DataFrame(tuned_model.coef_)

            best_k = len([x for x in list(df_scores.values)])
        except:
            print("Wrong curr_model's retrieval feature importance")
            sys.exit(-1)

    df_columns = pd.DataFrame(X_tr.columns)   
    feature_scores = pd.concat([df_columns, df_scores], axis=1) # concatenate dataframes
    feature_scores.columns = ['features','scores']  # name output columns
    feature_scores = feature_scores[feature_scores['scores'] != 0]  # keeping only non-zero scoring features

    # plot feature importance
    fig = plt.figure(figsize=(8, 10))
    # keeping ongly best_k features, ordered in descending score
    ordered_k_feature_scores = feature_scores.sort_values('scores', ascending=False).iloc[:best_k]
    sns.barplot(y='features', x='scores', data=feature_scores, color=color,
                                                            order=ordered_k_feature_scores.features)
    plt.grid(False)
    plt.title("%s's %s feature importance" % (model_name, v_or_t_flag))
    plt.show()

    # retrieve best features 
    # best_features = [column[0] for column in zip(X_tr.columns, select_k_best.get_support()) if column[1]]
    best_features = list(ordered_k_feature_scores.features)
    best_features_scores = list(ordered_k_feature_scores.scores)


    return best_features, best_features_scores

In [24]:
def replace_categorical_feature_with_dummy_ones(df, column_name, categories_list, dummy_features):
    """
    Function which replaces the nominal feature passed by argument with dummy ones, 
    to convert nominal column's M values in M new binary (dummy) features.
    """
    # retrive nominal feature's index. It is used to know where to insert the new M binary features
    index = df.columns.get_loc(column_name)
    for i in range(0, dummy_features.shape[1]):
        index += 1
        df.insert(index, column_name + "_" + str(categories_list[i]), 
                                                              dummy_features[:, i].todense().astype(int), True)
    # remove categorical feature
    del df[column_name]
    
    return df

In [25]:
def get_tuned_model(X_tr, y_tr, params, numeric_features):
    
    X_tr_curr = X_tr.copy()
    
    model_params = list(model_grid(X_tr_curr).keys())
    try:
        n_bins = params['preprocessor__numeric__discretizer__n_bins'] 
        strategy = params['preprocessor__numeric__discretizer__strategy']
        encode = params['preprocessor__numeric__discretizer__encode']
        discretizer = KBinsDiscretizer(encode=encode, n_bins=n_bins, strategy=strategy)
        # scale data
        X_tr_curr = discretizer.fit_transform(X_tr_curr.values)
        
        # need to onehot continous and ordinal features
        ordinal_columns = ['track_date_created_season', 'track_date_created_year'] 
        numeric_features = numeric_features + ordinal_columns
        for column_name in numeric_features:
            try:
                categories_list = sorted(list(X_tr_curr[column_name].unique()))
                encoder = OneHotEncoder(categories=[categories_list])   
                dummy_features = encoder.fit_transform(X_tr_curr[column_name].values.reshape(-1,1))  
                X_tr_curr = replace_categorical_feature_with_dummy_ones(X_tr_curr, column_name, categories_list, dummy_features)
            except:
                continue
    except KeyError:
        pass


    try:
        scaler = params['preprocessor__numeric__discretizer__scaler']
        # scale data
        X_tr_curr = scaler.fit_transform(X_tr_curr.values)
    except KeyError:
        pass
    
    # retrieve best hyperameters
    tmp_model_hyperparameters = dict((k, params[k]) for k in model_params if k in params)
    model_hyperparameters = {}
    for key, value in tmp_model_hyperparameters.items():
        key = key.split('model__')[1].replace("'", "")
        model_hyperparameters[key] = value
        
    tuned_model =  model.set_params(**model_hyperparameters)
    tuned_model.fit(X_tr_curr, y_tr.values.ravel())
    
    plot_decision_boundary(X_tr_curr, y_tr, MinMaxScaler(), tuned_model)  # passing random scaler
    
    if learning_curve_flag:
        plot_learning_curve(curr_X_tr_vt, y_tr, tuned_model, v_or_t_flag)
    
    return tuned_model

In [26]:
def grid_search(X_tr, y_tr, X_ts, y_ts, numeric_features, categorical_features, discretizer_flag, 
                                                            scaler_flag, feature_filter_key, feature_flag):
    
    # define the evaluation method
    cv = StratifiedKFold(n_splits=test_n_splits)

    # construct the pipeline to evaluate
    # scaler = RobustScaler()
    grid=model_grid(X_tr)
    steps = [('model', model)]
        
    if feature_filter_key == 'anova':
        anova = SelectKBest(score_func=f_classif)
        steps.insert(0, ('anova', anova))
        grid['anova__k'] = [i+1 for i in range(X_tr.shape[1])]
    elif feature_filter_key == 'rfe':
        rfe = RFE(estimator=DecisionTreeClassifier())
        steps.insert(0, ('rfe', rfe))
        grid['rfe__estimator'] = [DecisionTreeClassifier(), LogisticRegression(max_iter=10000)]
        grid['rfe__n_features_to_select'] = [i+1 for i in range(X_tr.shape[1])]
    
    # construct feature type's column transformer
    numeric_steps = []
    if scaler_flag:      # continous variable normalisation/standardisation
        numeric_steps.insert(0, ('scaler', None))
        grid['preprocessor__numeric__scaler'] = [MinMaxScaler(), MaxAbsScaler(), StandardScaler(), RobustScaler()]
        
    ordinal_features = None
    ordinal_transformer = None
    if discretizer_flag:  # continous variable binning
        numeric_steps.insert(0, ('discretizer', KBinsDiscretizer(encode='ordinal')))  # ordinal bins
        grid['preprocessor__numeric__discretizer__n_bins'] = list(range(2, 11))
        grid['preprocessor__numeric__discretizer__strategy'] = ['uniform', 'quantile', 'kmeans']
        
        # onehot continous, discretized features
        numeric_steps.append(OneHotEncoder())
        
        # onehot ordinal features
        ordinal_features = ["track_date_created_year", "track_date_created_season"]
        ordinal_transformer = OneHotEncoder()                 
        
    numeric_transformer = None
    if len(numeric_steps) > 0:
        numeric_transformer = Pipeline(steps=numeric_steps)
        preprocessor = ColumnTransformer(
        transformers=[('numeric', numeric_transformer, numeric_features),
                      ('ordinal', ordinal_transformer, ordinal_features)])
        # add numeric ColumnTransformer to global Pipeline
        steps.insert(0, ('preprocessor', preprocessor))
        
    # define the pipeline to evaluate
    pipeline = Pipeline(steps=steps)
    
    # define the grid search
    # search = GridSearchCV(pipeline, grid, scoring='f1_weighted',  cv=cv, verbose=1)
    search = RandomizedSearchCV(pipeline, grid, scoring=scoring,  
                                n_iter=n_iter, cv=cv, verbose=1, refit=scoring, random_state=random_state)
    
    # perform the search
    results = search.fit(X_tr, y_tr.values.ravel())
    
    # summarize best
    score = results.best_score_
    params = results.best_params_
    print('Best Mean F1_weighted: %.3f ' % score)
    print('Best Config: %s ' % params)
    
    # perform classification (linear model doesn't predict an integer value => no predict_proba)
    y_pred = search.predict(X_ts)
    y_prob = search.predict_proba(X_ts)
    
    # performance plots
    cm = confusion_matrix(y_ts, y_pred)
    plot_confusion_matrix(cm, results.classes_, True)
    plot_classification_report(y_ts, y_pred)
    
    plot_roc_curve(y_ts, y_prob)
    plot_precision_recall_curve(y_ts, y_prob)
    plot_cumulative_gain_curve(y_ts, y_prob)
    plot_lift_curve_curve(y_ts, y_prob)
    
    best_features, best_features_scores = [], []
    if feature_filter_key != "":
        best_features, best_features_scores = get_best_features_grid_cv(X_tr, y_tr, results, feature_filter_key)
        X_tr = X_tr[best_features]
    
    # retrieve the tuned model
    # tuned_model = get_tuned_model(X_tr, y_tr, params, numeric_features)
    tuned_model = pipeline['model']
    if tuned_model !=  pipeline['model']:
        print("Difference in tuned model and pipeline")
        print("tuned_model", tuned_model)
        print("pipe", pipeline['model'])
        # print("estimator", results.estimator)
        sys.exit(-1)
    
    # plots
    if feature_flag and (feature_filter_key == ""):
        best_features, best_features_scores = get_feature_importances_or_coef(X_tr, y_tr, tuned_model)
    elif (feature_flag) and (feature_filter_key != ""):
        _ , _ = get_feature_importances_or_coef(X_tr, y_tr, tuned_model)
    
    plot_decision_boundary(X_tr, y_tr, MinMaxScaler(), tuned_model)  # passing random scaler
    plot_learning_curve(X_tr, y_tr, tuned_model)
    # error_visualizer(not_scale_X_ts, y_ts, y_pred, 'chroma_cens_02', 'track_duration')
    
    return params, tuned_model, y_pred, y_prob, best_features, best_features_scores

In [None]:
"""params_tst, tuned_model_tst, y_pred_tst, y_prob_tst, best_features, best_features_scores = \
                                                    grid_search(X_tr=X_tr, 
                                                                y_tr=y_tr,
                                                                X_ts=X_ts, 
                                                                y_ts=y_ts,
                                                                numeric_features=numeric_columns, 
                                                                categorical_features=categoric_columns, 
                                                                discretizer_flag=False,
                                                                scaler_flag=False, 
                                                                feature_filter_key="", 
                                                                feature_flag=False)""""""

Fitting 3 folds for each of 10 candidates, totalling 30 fits


In [210]:
cn2_classifier = cn2_learner(X_tr_table)

In [246]:
set_rules = cn2_classifier.rule_list
print("num_rules", len(set_rules))

num_rules 13900


In [211]:
import pickle

with open('pickle/' + model_name + '_rules.pickle', 'wb') as handle:
    pickle.dump(cn2_classifier.rule_list, handle, protocol=pickle.HIGHEST_PROTOCOL)

<h2>Load rules and perform classification</h2>

In [27]:
import pickle

with open('pickle/' + model_name + '_rules.pickle', 'rb') as handle:
    set_rules = pickle.load(handle)

In [28]:
count = 0
count_records = 0
set_rules_class_dist = []
for rule in set_rules:
    try:
        rule_record_list = rule.curr_class_dist.tolist()
        # print(rule, rule_record_list)
        set_rules_class_dist.append(rule_record_list)
        count_records += sum(rule_record_list)
        count += 1
    except:
        continue
print("REAL num_rules", count)
print("rules equal train records?", count_records == X_tr.shape[0])

REAL num_rules 13900
rules equal train records? False


In [29]:
def decompose_rule_in_antecedents_and_conseguent(rule):
    tmp_list = str(rule).replace("IF", "").split("THEN")
    antecedents_dict = {}
    tmp_antecedent = tmp_list[0].split("AND")
    for a in tmp_antecedent:
        tmp_a = a.split("==")
        if len(tmp_a) == 1:
            # tmp_a = tmp_a.split("!=")
            continue
        antecedent = tmp_a[1].replace(" ", "")
        antecedents_dict[antecedent] = None
        """tmp_a = tmp_a[1].split("=")
        antecedent = tmp_a[0]
        antecedent_value = tmp_a[1]
        antecedents_dict[antecedent] = int(antecedent_value)"""
    conseguent_value = int(tmp_list[1].replace("album_type=album_type=", ""))
    return antecedents_dict, conseguent_value

In [30]:
# keeping only positive predictive rules
positive_set_rules = []
for rule in set_rules:
    try:
        antecedents_dict, conseguent_value = decompose_rule_in_antecedents_and_conseguent(rule)
        if conseguent_value == 1:
            positive_set_rules.append(rule)
    except:
        continue
print("positive_set_rules", len(positive_set_rules))

positive_set_rules 4219


In [31]:
# filter and keep only super-set rules
removed_positive_set_rules = []
for i in tqdm(range(0, len(positive_set_rules) - 1)):
    curr_rule = positive_set_rules[i]
    try:
        curr_antecedents_dict, curr_conseguent_value = decompose_rule_in_antecedents_and_conseguent(curr_rule)
        curr_keys = set(curr_antecedents_dict.keys())
    except:
        removed_positive_set_rules.append(curr_rule)
        continue
    for j in range(i + 1, len(positive_set_rules)):
        next_rule = positive_set_rules[j]
        if next_rule not in removed_positive_set_rules:
            try:
                next_antecedents_dict, next_conseguent_value = decompose_rule_in_antecedents_and_conseguent(next_rule)
                next_keys = set(next_antecedents_dict.keys())
                
                # check if curr is sub-rule of next
                if len(next_keys.intersection(curr_keys)) == len(curr_keys):
                    # disregard curr and keep next (for now)
                    removed_positive_set_rules.append(curr_rule)
                    break
                elif len(next_keys.intersection(curr_keys)) == len(next_keys):  # check if next is sub-rule of curr
                    # disregard next and keep xurr (for now)
                    removed_positive_set_rules.append(next_rule)
            except:
                 removed_positive_set_rules.append(next_rule)
                    
super_positive_set_rules = list(set(positive_set_rules).difference(set(removed_positive_set_rules)))
print("super_positive_set_rules", len(super_positive_set_rules))
print("removed_positive_set_rules", len(removed_positive_set_rules))

HBox(children=(FloatProgress(value=0.0, max=4218.0), HTML(value='')))




TypeError: unhashable type: 'Rule'

In [None]:
import pickle

with open('pickle/' + model_name + '_super_positive_rules.pickle', 'wb') as handle:
    pickle.dump(super_positive_set_rules, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# MAJORITY VOTE
majority_vote = {}
for tqdm(idx in X_ts.index):
    ts_antecedent = []
    for column_name in X_ts.columns():
        values = X_ts.at[idx, column_name]
        ts_antecedent.append(value)
    ts_antecedent = set(ts_antecedent)
    
    # iterate over each super-set rule and perfom majority vote uppon matching rules
    majority_vote[idx] = [0, 0]
    for curr_rule in super_positive_set_rules:
        curr_antecedents_dict, curr_conseguent_value = decompose_rule_in_antecedents_and_conseguent(curr_rule)
        curr_keys = set(curr_antecedents_dict.keys())
        if len(ts_antecedent.intersection(ts_antecedent)) == len(ts_antecedent):
            majority_vote[idx][curr_conseguent_value] += 1
            
y_pred = []
for idx in tqdm(X_ts.index):
    list_counter = majority_vote[idx]
    if list_counter[0] > list_counter[1]:
        y_pred.append(0)  # predicted negative class
    elif list_counter[0] < list_counter[1]:
        y_pred.append(1)  # predicted positive class
    elif (list_counter[0] == list_counter[1]) and (list_counter[0] != 0) else (list_counter[1] != 0):
         y_pred.append(1)  # favor positive class in ties, different than no prediction
    else:
        y_pred.append(0)   # predict negative class, by deafult

In [None]:
len(y_pred) == len(y_ts)

<h2>Learning curves </h2>

This plotting is done at the end of the notebook, beacuse for some reason yellowbrick library overwrite scikitplot.

<h6>Saving best model on file</h6>

In [27]:
import pickle

model_info = {'model_name': 'VarianceThreshold',
              'params': vt_params_tst,
              'tuned_model': vt_tuned_model_tst,
              'y_pred': vt_y_pred_tst,
              'y_prob': vt_y_prob_tst,
              'best_features': vt_best_features,
              'best_features_scores': vt_best_features_scores
             }

with open('pickle/' + model_name + '.pickle', 'wb') as handle:
    pickle.dump(model_info, handle, protocol=pickle.HIGHEST_PROTOCOL)