In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
from sklearn.pipeline import Pipeline
from tqdm import tqdm
import os
from os import listdir
from os.path import isfile, join
from datetime import datetime
from matplotlib.colors import ListedColormap
import pickle
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import sklearn2pmml

In [2]:
# `display` and `HTML` are part of the public IPython.display API; importing
# them from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render floats in DataFrame output as comma-grouped whole numbers (no decimals).
pd.options.display.float_format = '{:,.0f}'.format
# Widen the notebook page container to 95% of the browser window.
display(HTML("<style>.container { width:95% !important; }</style>"))

# Import and clean data

### Create Trade Labels

In [3]:
# Create function with threshold to modify labels
def make_regression_labels(threshold, df):
    """Turn continuous values into ternary labels -1 / 0 / 1.

    Values <= -threshold are labelled -1, values >= threshold are labelled 1
    and everything else is labelled 0. Exact zeros are always labelled 0,
    which matters when threshold is 0 (both comparisons are then true for a
    zero value).

    The result is allocated with np.zeros_like(df), so it keeps df's dtype.
    """
    labels = np.zeros_like(df)

    is_down = df <= -threshold
    is_up = df >= threshold
    is_flat = df == 0

    # Assignment order matters: "up" overrides "down", exact zeros override both.
    labels[is_down] = -1
    labels[is_up] = 1
    labels[is_flat] = 0

    return labels

### Process Data

In [4]:
def get_file_list(mypath = "C:/Barak/SignalToLocal/"):
    """Return the names of the regular files directly inside mypath.

    Sub-directories are skipped. Names are returned in the order listdir
    yields them; no sorting is applied.
    """
    file_names = []
    for entry in listdir(mypath):
        if isfile(join(mypath, entry)):
            file_names.append(entry)
    return file_names

### Load a single file of data

In [5]:
def load_data_regression(location, filename, latency_thresh):
    """Load one raw signal CSV and derive the modelling columns.

    Parameters
    ----------
    location : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside location.
    latency_thresh : number
        Rows whose ltcyAfter is below this value are flagged as reactions
        (the callers in this notebook default to 37000000).

    Returns
    -------
    pandas.DataFrame
        Indexed by signalId, with the added columns superClusterEvent,
        timeSinceSCStart, isReaction, scId, target and ltcyDiff, and with
        sendingTimeFullStr parsed into datetime values.
    """
    print("Loading " + filename + " ..." )

    # The generated files have a ---- row between the header and the data,
    # hence skiprows=[1]. join() also works when location has no trailing
    # separator.
    train = pd.read_csv(join(location, filename), header = 0, skiprows = [1])
    train = train.set_index('signalId')

    # Data cleaning: drop every row that has a missing value
    train = train.dropna()

    # Number the events within each superCluster (1-based)
    train['superClusterEvent'] = train.groupby(['superClusterId']).cumcount().add(1)

    # Time passed since the first event of the superCluster, divided by 1000
    first_sending_time = train.groupby('superClusterId')['sendingTime'].transform('first')
    train['timeSinceSCStart'] = (train['sendingTime'] - first_sending_time) / 1000

    train['isReaction'] = train['ltcyAfter'] < latency_thresh

    # Renumber the superClusters as consecutive ids in the scId column
    train = renumber_scId(train)

    # Create target based on the price movement in the Dax.
    train['target'] = train['simpleMid_afp'] - train['simpleMidBefore']

    # Calculate the latency difference
    train['ltcyDiff'] = train['ltcyAfterPlus'] - train['ltcyBefore']

    # Set the target to 0 wherever no reaction was flagged
    train.loc[~train['isReaction'], 'target'] = 0

    # Cast the sending times to datetimes; the last character of each string
    # is stripped before parsing.
    train['sendingTimeFullStr'] = train['sendingTimeFullStr'].apply(
        lambda x: datetime.strptime(x[0:-1], '%Y-%m-%d %H:%M:%S.%f'))

    return train

### Load many files in a range of dates

In [6]:
def load_range(fromDate = -1, toDate = -1, list_columns=[], save = False, my_path = "C:/Barak/SignalToLocal/", reaction_thresh = 37000000):
    """Load and concatenate the raw files whose date falls in a range.

    Parameters
    ----------
    fromDate, toDate : int
        Inclusive date bounds as YYYYMMDD. When both are -1 (the default)
        every file in the directory is loaded.
    list_columns : list of str
        Features to keep in addition to the default columns. An empty list
        (the default) keeps every column.
    save : bool
        When True, also write the result to
        Loaded_Data/loaded_<first>_<last>.csv relative to the current working
        directory (the directory is created if needed).
    my_path : str
        Directory holding the sigsToLocs.<YYYYMMDD>.csv files.
    reaction_thresh : number
        Passed to load_data_regression as the latency threshold.

    Returns
    -------
    pandas.DataFrame
        The concatenated, renumbered data; an empty DataFrame when no file
        qualifies.

    Only regular files larger than 10000 bytes are considered, and files whose
    name does not reduce to an integer date are skipped.
    """
    default_list = ['scId', 'superClusterEvent', 'timeSinceSCStart', 'sendingTimeFullStr', 'target']
    # dict.fromkeys drops duplicates while keeping order, so a requested column
    # that is already in the default list is not selected twice.
    my_columns = list(dict.fromkeys(default_list + list(list_columns)))

    load_all_dates = (fromDate == -1 and toDate == -1)
    # load_data_regression concatenates location + filename, so hand it a
    # location that is guaranteed to end with a separator.
    location = os.path.join(my_path, "")

    dated_files = []
    for f in listdir(my_path):
        full_path = os.path.join(my_path, f)
        # Skip directories and small files (10000 bytes or less)
        if not os.path.isfile(full_path) or os.stat(full_path).st_size <= 10000:
            continue
        try:
            date = int(f.replace("sigsToLocs.", "").replace(".csv", ""))
        except ValueError:
            # Not named sigsToLocs.<YYYYMMDD>.csv
            continue
        if load_all_dates or fromDate <= date <= toDate:
            dated_files.append((date, f))

    # listdir gives no ordering guarantee; load in date order so the running
    # scId offset grows with time.
    dated_files.sort()

    frames = []
    loaded_dates = []
    last_scId = 0
    for date, f in dated_files:
        new_data = load_data_regression(location, f, reaction_thresh)
        if new_data.empty:
            # Nothing left after cleaning; there is no last row to read below
            continue

        # Shift the per-file ids so they continue after the previous file
        new_data['scId'] += last_scId
        last_scId = new_data['scId'].iloc[-1]

        frames.append(new_data if not list_columns else new_data[my_columns])
        loaded_dates.append(date)

    if not frames:
        print("No files loaded from " + my_path)
        return pd.DataFrame()

    # One concat at the end instead of growing the frame inside the loop
    concat_data = renumber_scId(pd.concat(frames))

    # save all files concatenated
    if save:
        os.makedirs('Loaded_Data', exist_ok=True)
        concat_data.to_csv('Loaded_Data/loaded_' + str(min(loaded_dates)) + "_" + str(max(loaded_dates)) + ".csv")

    return concat_data

### Mini processing functions

In [7]:
def renumber_scId(data):
    """Rewrite data['scId'] as consecutive 1-based ids, following row order.

    The id goes up by one each time the grouping column changes from one row
    to the next. superClusterId is the grouping column when it exists,
    otherwise the current scId is used. The column is written into data
    itself and that same object is returned.
    """
    source_col = "superClusterId" if "superClusterId" in data.columns else "scId"

    # diff() is NaN on the first row; fillna(1) makes that row open cluster 1.
    step = data[source_col].diff().fillna(1)

    # Any non-zero step starts a new cluster; the running count is the new id.
    starts_new = step.ne(0).astype(int)
    data['scId'] = starts_new.cumsum().astype(int)

    return data

In [8]:
def filter_time_range(data, start_hour = 15, end_hour = 16):
    """Keep only the rows whose sending hour lies in [start_hour, end_hour].

    Both bounds are inclusive and only the hour component is compared, so the
    defaults (15, 16) keep everything from 15:00:00 up to 16:59:59.

    data.sendingTimeFullStr must hold values with an .hour attribute
    (datetime-like objects). Returns a new DataFrame with scId renumbered;
    data itself is left untouched.
    """
    sending_hours = data.sendingTimeFullStr.apply(lambda x: x.hour)
    in_range = (start_hour <= sending_hours) & (sending_hours <= end_hour)

    # Explicit copy: renumber_scId writes a column into the frame it receives,
    # and doing that on a boolean-indexed slice raises SettingWithCopyWarning.
    kept = data[in_range].copy()

    # Renumber the superCluster
    return renumber_scId(kept)

In [9]:
def filter_short_sc(data, sc_thresh):
    """Drop every super cluster that has too few events.

    A cluster is kept only when the largest superClusterEvent among its rows
    is strictly greater than sc_thresh.

    Note that data is renumbered first, and renumber_scId writes the scId
    column into the frame it is given, so the caller's frame gets fresh ids.
    The returned frame is a new, renumbered DataFrame.
    """
    # The original author hoped this first renumbering could be removed one
    # day; it is kept so the grouping below always sees consistent ids.
    data = renumber_scId(data)

    # Broadcast each cluster's largest event number back onto its rows. This
    # replaces a positional lookup table whose set_index result was discarded.
    largest_event = data.groupby('scId')['superClusterEvent'].transform('max')

    # Filter out the short clusters (copy so the renumbering below does not
    # write into a slice of data).
    kept = data[largest_event > sc_thresh].copy()

    # Renumber the superCluster
    return renumber_scId(kept)

In [10]:
def filter_no_moves(data, thresh = 0):
    """Remove all superclusters with insignificant target moves.

    A cluster (rows sharing an scId) is kept only when the largest absolute
    target among its rows is strictly greater than thresh.

    Returns a new DataFrame with scId renumbered. data is not modified; in
    particular no helper column is left behind on it.
    """
    # Largest |target| of each cluster, broadcast back onto its rows. Grouping
    # by the scId values themselves does not require the ids to be contiguous.
    largest_move = data['target'].abs().groupby(data['scId']).transform('max')

    # Filter out the super clusters where the target doesn't move enough
    kept = data[largest_move > thresh].copy()

    # Renumber the superClusters
    return renumber_scId(kept)

In [11]:
def filter_time_since_sc_start(data, time_since_sc_start_thresh):
    """Keep only the rows whose timeSinceSCStart is strictly below the threshold.

    Rows with timeSinceSCStart >= time_since_sc_start_thresh are removed.
    Returns a new DataFrame with scId renumbered; data is not modified.
    """
    is_early = data['timeSinceSCStart'] < time_since_sc_start_thresh

    # Copy so that renumber_scId does not write into a slice of data
    kept = data[is_early].copy()

    # Renumber the superClusters
    return renumber_scId(kept)

### Process a raw DataFrame of data

In [12]:
def process_data(data, max_sc_duration=3000, sc_thresh=3):
    """Run the four row filters on a loaded frame, printing progress.

    In order:
      1. filter_short_sc(data, sc_thresh)
      2. filter_no_moves(data) with its default threshold
      3. filter_time_range(data) with its default hours
      4. filter_time_since_sc_start(data, max_sc_duration)

    Returns the frame produced by the last filter.
    """
    print("Processing data...")

    # (progress message, filter to apply), listed in execution order
    steps = (
        ("Filtering super clusters shorter than " + str(sc_thresh),
         lambda frame: filter_short_sc(frame, sc_thresh)),
        ("Filtering non-move causing super clusters",
         lambda frame: filter_no_moves(frame)),
        ("Filtering hours",
         lambda frame: filter_time_range(frame)),
        ("Filter all events within super cluster after " + str(max_sc_duration),
         lambda frame: filter_time_since_sc_start(frame, max_sc_duration)),
    )

    for message, apply_filter in steps:
        print(message)
        data = apply_filter(data)

    return data

### Load and process a range of dates

In [13]:
def load_process_range(fromDate = -1, toDate = -1, list_columns=[], save = False, my_path = "C:/Barak/SignalToLocal/",  max_sc_duration=3000, sc_thresh=3, reaction_thresh = 37000000, save_data_path = "C:/Barak/Python/Processed_Data/"):
    """Load a date range of raw files and process each one before concatenating.

    Works like load_range, but every file goes through process_data
    (max_sc_duration, sc_thresh) right after it is loaded.

    Parameters
    ----------
    fromDate, toDate : int
        Inclusive date bounds as YYYYMMDD; both -1 (default) loads every file.
    list_columns : list of str
        Features to keep in addition to the default columns. An empty list
        keeps every column.
    save : bool
        When True, write the result to
        save_data_path/processed_<first>_<last>.csv.
    my_path : str
        Directory holding the sigsToLocs.<YYYYMMDD>.csv files.
    max_sc_duration, sc_thresh :
        Passed to process_data.
    reaction_thresh : number
        Passed to load_data_regression as the latency threshold.
    save_data_path : str
        Output directory used when save is True (created if missing).

    Returns
    -------
    pandas.DataFrame
        The concatenated processed data; empty when no file qualifies.

    Only regular files larger than 10000 bytes are considered, and files whose
    name does not reduce to an integer date are skipped.
    """
    default_list = ['scId', 'superClusterEvent', 'timeSinceSCStart', 'sendingTimeFullStr', 'target']

    # dict.fromkeys removes duplicates while preserving order, so asking for a
    # column that is already in the default list does not select it twice.
    my_columns = list(dict.fromkeys(default_list + list(list_columns)))

    load_all_dates = (fromDate == -1 and toDate == -1)
    # load_data_regression concatenates location + filename, so hand it a
    # location that is guaranteed to end with a separator.
    location = os.path.join(my_path, "")

    dated_files = []
    for f in listdir(my_path):
        full_path = os.path.join(my_path, f)
        # Filter out directories and small files (10000 bytes or less)
        if not os.path.isfile(full_path) or os.stat(full_path).st_size <= 10000:
            continue
        try:
            date = int(f.replace("sigsToLocs.", "").replace(".csv", ""))
        except ValueError:
            # Not named sigsToLocs.<YYYYMMDD>.csv
            continue
        if load_all_dates or fromDate <= date <= toDate:
            dated_files.append((date, f))

    # listdir gives no ordering guarantee; process in date order
    dated_files.sort()

    frames = []
    first_date = 99999999
    last_date = -1
    last_scId = 0
    for date, f in dated_files:
        new_data = load_data_regression(location, f, reaction_thresh)
        new_data = process_data(new_data, max_sc_duration, sc_thresh)

        first_date = min(first_date, date)
        last_date  = max(last_date , date)

        if new_data.empty:
            continue

        # The filters renumber scId from 1 for every file, so the running
        # offset has to be applied AFTER processing to keep the ids increasing
        # across files. assign() builds a new frame rather than writing into
        # whatever the last filter returned.
        new_data = new_data.assign(scId=new_data['scId'] + last_scId)
        last_scId = new_data['scId'].iloc[-1]

        # Select the requested columns from the loaded file only
        frames.append(new_data if not list_columns else new_data[my_columns])

    # One concat at the end instead of growing the frame inside the loop
    concat_data = pd.concat(frames) if frames else pd.DataFrame()

    # save all loaded concatenated data
    if save:
        save_file_name = "processed_" + str(first_date) + "_" + str(last_date) + ".csv"
        print("saving " + save_file_name+ " to " + save_data_path)

        os.makedirs(save_data_path, exist_ok=True)

        concat_data.to_csv(os.path.join(save_data_path, save_file_name))
        print("Done saving " + save_file_name)

    return concat_data

In [14]:
def remove_outliers(X_train, y_train, y_thresh = 2, sigMSP_thresh = 6, sigMND_thresh = 20, sigMDJ_thresh = 10, volMSP_thresh = 999, volMDJ_thresh = 120, volMND_thresh = 400):
    """Drop the rows where the target or any monitored feature is out of bounds.

    A row is kept only when every absolute value is <= its threshold:

      y_train                -> y_thresh
      X_train.signalMSP      -> sigMSP_thresh
      X_train.signalMND      -> sigMND_thresh
      X_train.signalMDJ      -> sigMDJ_thresh
      X_train.signedVolMSP   -> volMSP_thresh
      X_train.signedVolMDJ   -> volMDJ_thresh
      X_train.signedVolMND   -> volMND_thresh

    The three volume thresholds default to the values that used to be
    hard-coded in the function body.

    Returns (X_train_no_outliers, y_train_no_outliers), filtered with the same
    boolean mask.
    """
    # (values to test, largest absolute value allowed)
    bounds = (
        (y_train, y_thresh),
        (X_train.signalMSP, sigMSP_thresh),
        (X_train.signalMND, sigMND_thresh),
        (X_train.signalMDJ, sigMDJ_thresh),
        (X_train.signedVolMSP, volMSP_thresh),
        (X_train.signedVolMDJ, volMDJ_thresh),
        (X_train.signedVolMND, volMND_thresh),
    )

    in_bounds = None
    for values, limit in bounds:
        within = values.abs() <= limit
        in_bounds = within if in_bounds is None else (in_bounds & within)

    X_train_no_outliers = X_train[in_bounds]
    y_train_no_outliers = y_train[in_bounds]

    return X_train_no_outliers, y_train_no_outliers

### Fetch the data required for modeling

In [15]:
def get_modeling_data(my_path_stored_data = "C:/Barak/Python/Processed_Data/", fromDate = -1, toDate = -1, list_columns=[], save = False, my_path_fetch_data = "C:/Barak/SignalToLocal/",  max_sc_duration=3000, sc_thresh=3, reaction_thresh = 37000000):
    """Return the processed modelling data, reusing a saved CSV when possible.

    The cache file is my_path_stored_data/processed_<from>_<to>.csv. When all
    dates are requested (fromDate == toDate == -1) <from> and <to> are taken
    from characters [-12:-4] of the first and last file names in
    my_path_fetch_data.

    * save is False and the cache file exists -> it is read with pd.read_csv.
    * otherwise the data is rebuilt with load_process_range and, when save is
      True, written to the cache file.

    Parameters other than the two paths and save are forwarded to
    load_process_range. Returns the resulting DataFrame.
    """
    # The cache directory is created up front so it can always be listed.
    os.makedirs(my_path_stored_data, exist_ok=True)

    # Load all files in the processed_Data directory
    files_in_stored_directory = get_file_list(my_path_stored_data)

    # Load all files in the signalToLocal directory. Sorted because directory
    # listings carry no ordering guarantee; this presumes the names differ only
    # in their date part, so name order equals date order -- TODO confirm.
    files_in_fetch_directory = sorted(get_file_list(my_path_fetch_data))

    # Get file string of desired file
    processed_filename = "processed_" + str(fromDate) + "_" + str(toDate) +'.csv'

    # Condition if all files are requested (and there is at least one file)
    if fromDate == -1 and toDate == -1 and files_in_fetch_directory:
        first_date = files_in_fetch_directory[0][-12:-4]
        last_date = files_in_fetch_directory[-1][-12:-4]

        print("Processing files between " + str(first_date) + " and " + str(last_date))
        processed_filename = "processed_" + str(first_date) + "_" + str(last_date) + '.csv'

    stored_file = os.path.join(my_path_stored_data, processed_filename)

    if not save and processed_filename in files_in_stored_directory:
        print("Loading saved data " + processed_filename + " from directory " + my_path_stored_data + " ...")
        # NOTE(review): a frame read back from CSV has a default integer index
        # and plain-text timestamps, unlike a freshly processed one -- confirm
        # that downstream code copes with both.
        import_data = pd.read_csv(stored_file)
    else:
        print("Processing data between " + str(fromDate) + " and " + str(toDate) + " from directory " + my_path_fetch_data + " from scratch...")
        import_data = load_process_range(fromDate = fromDate, toDate = toDate, list_columns = list_columns, save = False, my_path = my_path_fetch_data, max_sc_duration = max_sc_duration, sc_thresh=sc_thresh, reaction_thresh = reaction_thresh)

        if save:
            # Save under exactly the name the lookup above uses. Rebuilding the
            # name from fromDate/toDate produced processed_-1_-1.csv for the
            # "all dates" case, which the lookup could never find again.
            print("Saving " + processed_filename + " to directory " + my_path_stored_data + " ...")
            import_data.to_csv(stored_file)
    print("Done loading files!")

    return import_data

### Split Data

In [16]:
def data_split_regression(df, test_ratio):
    """Split df into consecutive train / validation / test slices (no shuffling).

    With n = number of rows:
      train      = first int(n * (1 - 2 * test_ratio)) rows
      validation = next  int(n * test_ratio) rows
      test       = all remaining rows

    Each slice is returned as features (df without the "target" column) and
    as the "target" column.

    NOTE the return order: X_train, X_test, X_valid, y_train, y_valid, y_test
    -- the X part lists test before validation, the y part lists validation
    before test.
    """
    n_rows = df.shape[0]
    train_end = int(n_rows * (1 - 2 * test_ratio))
    valid_end = train_end + int(n_rows * test_ratio)

    train_part = df.iloc[:train_end, :]
    valid_part = df.iloc[train_end:valid_end, :]
    test_part = df.iloc[valid_end:, :]

    X_train, X_valid, X_test = (
        part.drop(columns="target") for part in (train_part, valid_part, test_part)
    )
    y_train, y_valid, y_test = (
        part['target'] for part in (train_part, valid_part, test_part)
    )

    return X_train, X_test, X_valid, y_train, y_valid, y_test

# Data Exploration

### Plot functions

In [17]:
def plot_sc(data, toPlot):
    """Plot the signals and signed volumes of one super cluster over time.

    data   : DataFrame with scId, timeSinceSCStart, sendingTimeFullStr and the
             signalMSP/MDJ/MND and signedVolMSP/MDJ/MND columns.
    toPlot : the scId to show.

    The upper panel shows the three signal columns, the lower panel the three
    signed-volume columns (red = MSP, green = MDJ, blue = MND), both against
    timeSinceSCStart. Nothing is drawn when no row has the requested scId.
    """
    toShow = data[data.scId == toPlot]
    if toShow.empty:
        print("No rows with scId " + str(toPlot))
        return

    # column suffix -> line colour, shared by both panels
    series_colors = (('MSP', 'red'), ('MDJ', 'green'), ('MND', 'blue'))

    plt.subplot(211)
    for suffix, color in series_colors:
        plt.plot(toShow.timeSinceSCStart, toShow['signal' + suffix], color=color, marker='.')
    # str() is required: sendingTimeFullStr may hold datetimes rather than
    # text, and str + datetime raises TypeError.
    plt.title(str(toPlot) + ' @ ' + str(toShow.sendingTimeFullStr.iloc[0]))

    plt.subplot(212)
    for suffix, color in series_colors:
        plt.plot(toShow.timeSinceSCStart, toShow['signedVol' + suffix], color=color, marker='.')
    plt.show()

In [18]:
def cap_by_quantile(toCap, quant, thresh = 0.1):
    """Drop near-zero values, cap the rest at a quantile and plot the result.

    toCap  : pandas Series of numeric values.
    quant  : quantile in [0, 1] of the absolute values used as the cap.
    thresh : values with |value| <= thresh are removed first (default 0.1,
             the value that used to be hard-coded).

    Values whose magnitude exceeds the cap are pulled back to -cap or +cap, so
    the sign of a negative outlier is preserved. Prints how many values were
    filtered and capped plus a describe() summary, draws the distribution,
    and returns the capped Series. toCap itself is not modified.
    """
    magnitude = toCap.abs()
    print('filtered ' + str((magnitude <= thresh).sum()) + ' out of ' + str(toCap.count()))

    kept = toCap[magnitude > thresh]
    cap = kept.abs().quantile(quant)
    capped = int((kept.abs() > cap).sum())

    # clip() returns a new Series (no write into a slice of toCap) and limits
    # both tails symmetrically; assigning `cap` directly turned negative
    # outliers into +cap.
    result = kept.clip(lower=-cap, upper=cap)

    sns.distplot(result)
    plt.title(str(quant) + '% -> ' + str(cap))
    print('capped ' + str(capped) + ' out of ' + str(result.count()))
    print(result.describe().apply(lambda x: format(x, 'f')))
    return result

In [19]:
# import seaborn as sns
# import matplotlib.pyplot as plt
# import sklearn
# from sklearn.cluster import DBSCAN
# from collections import Counter


# def dbscan_outlier_plot(X_train, y_train, field):
#     """data: pandas dataframe
#        field: field in dataframe with outliers to be removed."""
#     dbs_model = DBSCAN(eps=0.05, min_samples=50).fit(X_train[field])
#     colors = dbs_model.labels_
#     f = plt.figure()
#     plt.scatter(X_train[field], y_train, c = dbs_model, s=18)

### Correlation Matrix functions

In [20]:
def cor_nolabels(train):
    """Plot the correlation matrix of all the features, without the values on it.

    train : DataFrame; train.corr() is drawn as a square heatmap with a
    diverging palette on a 10x8 inch figure.
    """
    f, ax = plt.subplots(figsize=(10, 8))
    corr = train.corr()
    # An all-False mask hides nothing. The builtin bool is used because the
    # np.bool alias no longer exists in current NumPy releases.
    nothing_hidden = np.zeros_like(corr, dtype=bool)
    sns.heatmap(corr, mask=nothing_hidden, cmap=sns.diverging_palette(220, 10, as_cmap=True),
                square=True, ax=ax)

In [21]:
def cor_labels(train):
    """Plot the correlation matrix of all the features with the values on it.

    train : DataFrame; train.corr() is drawn as a heatmap on a new 15x7 inch
    figure, each cell annotated with its value to two decimals.
    """
    plt.figure(figsize=(15,7))
    correlations = train.corr()
    sns.heatmap(correlations, annot=True, fmt=".2f")

### Plot labeled data

In [22]:
def plot_labeled(X, Y, predict_labels, my_labels, colors, x_axis, y_axis, plot_title):
    """Scatter Y against X with every point coloured by its label.

    X = data for the x axis
    Y = data for the y axis
    predict_labels = numeric label of every point
    my_labels = text shown on the colorbar, one entry per colour, in ascending
                label order, e.g. [0, 1]
    colors = one colour per label value, e.g. ["red", "blue"]
    x_axis = title of the x axis
    y_axis = title of the y axis
    plot_title = title of the plot
    """
    plt.figure()

    plt.scatter(X, Y, c=predict_labels, cmap=ListedColormap(colors))
    # The on/off flag is passed positionally: its keyword was renamed from
    # `b` to `visible` in Matplotlib, positional use works with both.
    plt.grid(True, which='major', color='#666666', linestyle='-')
    plt.xlabel(x_axis)
    plt.ylabel(y_axis)
    plt.title(plot_title)
    cb = plt.colorbar()

    # The colorbar spans [min label, max label] split into len(colors) equal
    # bands; put one tick in the middle of each band. This also works for
    # negative labels and when the largest label is 0.
    lowest = min(predict_labels)
    highest = max(predict_labels)
    n_bands = len(colors)
    band_width = (highest - lowest) / float(n_bands)
    loc = lowest + (np.arange(n_bands) + 0.5) * band_width
    cb.set_ticks(loc)
    cb.set_ticklabels(my_labels)

### Plot distributions of each feature

In [23]:
def plot_continuous_distributions(X_train, continuous_features):
    """Draw one density plot per feature.

    X_train             : DataFrame holding the features.
    continuous_features : iterable of column names of X_train to plot.

    Each feature gets its own 10x5 inch figure, which is shown immediately.
    """
    for my_feature in tqdm(continuous_features):
        plt.figure(figsize=(10,5))
        # kdeplot is the replacement for the deprecated
        # distplot(hist=False, kde=True); missing values are dropped first.
        ax = sns.kdeplot(X_train[my_feature].dropna(), color='darkblue', linewidth=4)
        ax.set_title("Density plot of {}".format(my_feature))
        plt.ylabel('Density')
        plt.xlabel(my_feature,fontsize = 15)

        # Show the grid lines as dark grey lines. The on/off flag is passed
        # positionally: its keyword was renamed from `b` to `visible`.
        plt.grid(True, which='major', color='#666666', linestyle='-')

        plt.show()

In [24]:
def plot_categorical_distributions(X_train, categoric_features):
    """Draw one bar chart of value counts per feature.

    X_train            : DataFrame holding the features.
    categoric_features : iterable of column names of X_train to plot.

    Switches matplotlib to the 'ggplot' style (this stays in effect after the
    call). Each feature gets its own 10x5 inch figure, shown immediately.
    """
    # Applied once, before any figure exists, so the first figure is styled
    # like all the others.
    plt.style.use('ggplot')

    for my_feature in tqdm(categoric_features):
        plt.figure(figsize=(10,5))
        my_dist= X_train[my_feature].value_counts()
        x = my_dist.index
        my_counts = my_dist.values
        x_pos = list(range(len(x)))
        plt.bar(x_pos, my_counts, color='blue')
        plt.xlabel(my_feature,fontsize = 15)
        plt.ylabel("{} Frequency".format(my_feature), fontsize = 15)
        plt.title("Distribution of {}".format(my_feature),fontsize = 18)
        plt.xticks(x_pos, x, rotation='vertical')

        # Show the grid lines as dark grey lines. The on/off flag is passed
        # positionally: its keyword was renamed from `b` to `visible`.
        plt.grid(True, which='major', color='#666666', linestyle='-')
        plt.show()

### Plot of relationship between features and dependent variable

In [25]:
def plot_continuous_vs_target(X_train, y_train, continuous_features):
    """Scatter the target against every feature in continuous_features.

    X_train             : DataFrame holding the features.
    y_train             : target values, one per row of X_train.
    continuous_features : iterable of column names of X_train to plot.

    Each feature gets its own figure, which is shown immediately.
    """
    for my_feature in tqdm(continuous_features):
        plt.figure()
        plt.scatter(X_train[my_feature], y_train, color='blue')
        plt.ylabel('Target')
        plt.xlabel(my_feature)

        # Show the grid lines as dark grey lines. The on/off flag is passed
        # positionally: its keyword was renamed from `b` to `visible`.
        plt.grid(True, which='major', color='#666666', linestyle='-')
        plt.title("Target vs. {}".format(my_feature),fontsize = 18)

        plt.show()

### Plot with boundaries

In [26]:
# ax = plt.subplot(111)
# ax.plot(np.arange(0,1,0.005)[0:100], train_freq[0:100], label = "train")
# ax.plot(np.arange(0,1,0.005)[0:100], valid_freq[0:100], label = "valid")
# ax.figure.set_size_inches(10, 10)
# plt.xlabel('Cutoff')
# plt.ylabel('Trade Freq')
# plt.title('Plot of Trade Freq vs cutoff')
# plt.axis('tight')
# plt.legend(bbox_to_anchor=(1, 1), ncol=2)
# plt.show()

# Modeling

In [27]:
def full_modeling_process(my_model, test_thresh, predict_thresh, my_path_stored_data = "C:/Barak/Python/Processed_Data/", fromDate = -1, toDate = -1, list_columns=[], save = False, my_path_fetch_data = "C:/Barak/SignalToLocal/",  max_sc_duration=3000, sc_thresh=3, reaction_thresh = 37000000, test_size = 0.2, my_labels = [-1,0,1]):
    """Run the whole modelling pipeline, from loading to evaluation.

    Steps:
      - fetch the data for the date range through get_modeling_data
      - split it; the validation and test slices are merged into one test set
      - remove outliers from the training set
      - fit my_model on the list_columns features of the training set
      - predict on the test set, turn true and predicted values into labels
        (test_thresh / predict_thresh) and print precisions and the
        confusion matrix

    Returns (my_model, X_train_no_outliers, y_train_no_outliers, X_test,
    y_test, y_pred_labels, y_test_labels).
    """
    # Import data
    import_data = get_modeling_data(
        my_path_stored_data = my_path_stored_data,
        fromDate = fromDate,
        toDate = toDate,
        list_columns = list_columns,
        save = save,
        my_path_fetch_data = my_path_fetch_data,
        max_sc_duration = max_sc_duration,
        sc_thresh = sc_thresh,
        reaction_thresh = reaction_thresh,
    )

    # Split data
    print("Splitting data...")
    # The splitter cuts two slices of the given ratio; halving it makes the two
    # slices together amount to test_size.
    slice_ratio = test_size / 2
    X_train, X_tail, X_middle, y_train, y_middle, y_tail = data_split_regression(import_data, slice_ratio)
    # Same slice order for X and y, so rows and targets stay paired
    X_test = pd.concat([X_tail, X_middle])
    y_test = pd.concat([y_tail, y_middle])

    # Remove outliers
    print("Removing Outliers...")
    X_train_no_outliers, y_train_no_outliers = remove_outliers(X_train, y_train)

    # Fit model
    print("Fitting Model...")
    train_features = X_train_no_outliers[list_columns]
    my_model.fit(train_features, y_train_no_outliers)

    print("Predicting on test...")
    test_features = X_test[list_columns]
    y_pred = my_model.predict(test_features)

    y_test_labels = make_regression_labels(test_thresh, y_test)
    y_pred_labels = make_regression_labels(predict_thresh, y_pred)
    print("----------------------")
    print_precisions(y_test_labels, y_pred_labels)

    print_conf_matrix(y_test_labels, y_pred_labels, my_labels)

    print("Done!")
    return my_model, X_train_no_outliers, y_train_no_outliers, X_test, y_test, y_pred_labels, y_test_labels
        

### Save model for Java

In [28]:
#def get_modeling_data(my_path_stored_data = "C:/Barak/Python/Processed_Data/", fromDate = -1, toDate = -1, list_columns=[], save = False, my_path_fetch_data = "C:/Barak/SignalToLocal/",  max_sc_duration=3000, sc_thresh=3, reaction_thresh = 37000000):
from nyoka import skl_to_pmml
from nyoka import xgboost_to_pmml
from sklearn2pmml import sklearn2pmml
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import make_pmml_pipeline

def pmml_export(my_model, X_train, filename, pmml_path = "C:/Barak/Python/PMML/", downgrade = True, save_pickle = True, use_nyoka = False):
    """Export a fitted model to <pmml_path>/<filename>.pmml.

    my_model    : fitted model (per the original author: trained and in an
                  sklearn Pipeline).
    X_train     : pandas DataFrame; its column names are used as the feature
                  names of the export.
    filename    : file name without extension. With use_nyoka, a name that
                  contains "XGB" selects the xgboost exporter.
    pmml_path   : output directory (created if missing).
    downgrade   : nyoka path only -- rewrite the exported file so that its
                  "4.4" / -4_4 version markers read "4.3" / -4_3.
    save_pickle : also pickle the model through save_model(my_model, filename).
    use_nyoka   : export with nyoka instead of sklearn2pmml.
    """
    os.makedirs(pmml_path, exist_ok=True)

    full_filename = os.path.join(pmml_path, filename + ".pmml")

    target = "prediction"
    print("Saving PMML to " + full_filename)
    # check https://pypi.org/project/nyoka/
    if use_nyoka:
        if "XGB" in filename:
            xgboost_to_pmml(my_model, X_train.columns.values, target, full_filename)
        else:
            skl_to_pmml(my_model, X_train.columns.values, target, full_filename)

        if downgrade:
            print("Downgrading PMML to 4.3")
            # Read everything first, then rewrite. Opening the same path for
            # reading and writing at once truncates it before anything is
            # read, and removing the file afterwards would discard the export.
            with open(full_filename, "rt") as fin:
                pmml_text = fin.read()
            with open(full_filename, "wt") as fout:
                fout.write(pmml_text.replace('"4.4"', '"4.3"').replace('-4_4', '-4_3'))
    else:
        pipeline = make_pmml_pipeline(my_model, X_train.columns.values, target)
        sklearn2pmml(pipeline, full_filename)

    if save_pickle:
        save_model(my_model, filename)

### Save model to pickle

In [29]:
def save_model(my_model, filename, filepath = "C:/Barak/Python/FinalizedModels/"):
    """Pickle my_model to <filepath>/<filename>.sav.

    filepath is created when it does not exist. Note that the ".sav" extension
    is appended here.
    """
    os.makedirs(filepath, exist_ok=True)

    print("Saving " + filename + " to " + filepath)
    # The with-block closes the file even if pickling fails
    with open(os.path.join(filepath, filename + ".sav"), 'wb') as model_file:
        pickle.dump(my_model, model_file)

### Load model from sav file

In [30]:
def load_model(filename, filepath = "C:/Barak/Python/FinalizedModels/"):
    """Load and return the pickled object stored in <filepath>/<filename>.

    filename must include its extension; nothing is appended here.

    SECURITY: unpickling can run arbitrary code, so only load files that come
    from a trusted source.
    """
    print("Loading " + filename + " from " + filepath)
    # The with-block closes the file even if unpickling fails
    with open(os.path.join(filepath, filename), 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    return loaded_model

### Precision functions

In [31]:
def bar_plot(my_labels, x_axis='Profitability'):
    """Horizontal count plot of my_labels with a percentage next to each bar.

    my_labels : values to count; they become a one-column DataFrame.
    x_axis    : name of that column, also used in the title and axis label.
    """
    frame = pd.DataFrame(my_labels, columns=[x_axis])
    ax = sns.countplot(y=x_axis, data=frame)
    plt.title('Distribution of {}'.format(x_axis))
    plt.xlabel('Number of {} Transactions'.format(x_axis))

    n_total = len(frame[x_axis])
    for bar in ax.patches:
        bar_length = bar.get_width()
        share = '{:.1f}%'.format(100 * bar_length / n_total)
        # Just past the end of the bar, vertically centred on it
        anchor = (bar.get_x() + bar_length + 0.02,
                  bar.get_y() + bar.get_height() / 2)
        ax.annotate(share, anchor)

    plt.show()

#### Precision Function - FP : Predict 1 or -1 and get 0 OR predict 1 and get -1

In [32]:
def normal_precision(y_true, y_pred):
    """Share of the non-zero predictions that are correct.

    Labels are expected to be -1 / 0 / 1 arrays. A prediction is correct when
    y_true * y_pred == 1, i.e. both are 1 or both are -1. Returns 0 when there
    is no non-zero prediction.
    """
    n_predictions = (y_pred != 0).sum()
    n_correct = (y_true * y_pred == 1).sum()

    if n_predictions == 0:
        return 0
    return n_correct / n_predictions

#### Precision Function - FP : Predict 1 or -1 and get opposite

In [33]:
def penalty_precision(y_true, y_pred):
    """Share of the non-zero predictions that point the wrong way.

    Labels are expected to be -1 / 0 / 1 arrays. A prediction is wrong when
    y_true * y_pred == -1, i.e. the true label has the opposite sign. Returns
    0 when there is no non-zero prediction.
    """
    n_predictions = (y_pred != 0).sum()
    n_opposite = (y_true * y_pred == -1).sum()

    if n_predictions == 0:
        return 0
    return n_opposite / n_predictions

#### Precision Function - FP : Predict 1 or -1 and get 0

In [34]:
def indiff_precision(y_true, y_pred):
    """Share of the non-zero predictions whose true label is 0.

    Labels are expected to be -1 / 0 / 1 arrays. Returns 0 when there is no
    non-zero prediction.
    """
    has_prediction = y_pred != 0
    n_predictions = has_prediction.sum()
    n_on_flat = (has_prediction & (y_true == 0)).sum()

    if n_predictions == 0:
        return 0
    return n_on_flat / n_predictions

In [35]:
def utilization_precision(y_true, y_pred):
    """Number of non-zero predictions divided by number of non-zero true labels.

    Labels are expected to be -1 / 0 / 1 arrays. Returns 0 when no true label
    is non-zero.
    """
    n_predictions = (y_pred != 0).sum()
    n_opportunities = (y_true != 0).sum()

    if n_opportunities == 0:
        return 0
    return n_predictions / n_opportunities

#### Print all the precisions

In [36]:
def print_precisions(y_test, y_pred):
    """Print utilization plus the three precision figures.

    y_test holds the true labels and y_pred the predicted labels. The four
    values are computed first, then printed with three decimals in the order
    utilization, correct, mistaken, indifferent.
    """
    scores = {
        'normal': normal_precision(y_test, y_pred),
        'penalty': penalty_precision(y_test, y_pred),
        'indiff': indiff_precision(y_test, y_pred),
        'utilization': utilization_precision(y_test, y_pred),
    }

    report = (
        ("Utilization:           {:,.3f}", 'utilization'),
        ("Correct Precision:     {:,.3f}", 'normal'),
        ("Mistaken Precision:    {:,.3f}", 'penalty'),
        ("Indifferent Precision: {:,.3f}", 'indiff'),
    )
    for template, key in report:
        print(template.format(scores[key]))

### Plot the precisions

In [37]:
def plot_precisions(max_depth_list, n_est_list, train_penalty_dict, valid_penalty_dict):
    """Plot penalty precision against max depth, for train and for validation.

    max_depth_list     : depths that were evaluated.
    n_est_list         : estimator counts that were evaluated.
    train_penalty_dict : {n_estimators: [penalty precision per depth]} (train).
    valid_penalty_dict : same structure for the validation set.

    Two figures are shown one after the other, each with one line per entry of
    n_est_list. The first element of every list is left out of the plots.
    """
    depths = max_depth_list[1:]
    panels = (
        ('TRAIN', train_penalty_dict),
        ('VALIDATION', valid_penalty_dict),
    )

    for split_name, penalty_by_n in panels:
        ax = plt.subplot(111)
        for n in n_est_list:
            ax.plot(depths, penalty_by_n[n][1:], label = "n_est: {}".format(n))
        ax.figure.set_size_inches(10, 10)
        plt.xlabel('Max Depth')
        plt.ylabel('Penalty Precision')
        plt.title(split_name + ': Plot of Penalty Precision vs Max depth for several Number of estimators for a Random Tree model')
        plt.axis('tight')
        plt.legend(bbox_to_anchor=(1, 1), ncol=2)
        plt.show()

#### Parameter Optimization

In [38]:
def randomforest_tuning(X_train, y_train, X_valid, y_valid, n_est_list, max_depth_list, list_columns, true_thresh=0, predict_thresh=0.2):
    """Grid-evaluate a random forest regressor over n_estimators x max_depth.

    For every combination a RandomForestRegressor is fitted on
    X_train[list_columns]; predictions on the training and validation sets are
    turned into labels (true_thresh for the true values, predict_thresh for
    the predictions) and scored with penalty_precision, normal_precision and
    indiff_precision.

    Returns six dicts {n_estimators: [score per entry of max_depth_list]} in
    the order train_penalty, valid_penalty, train_normal, valid_normal,
    train_indiff, valid_indiff.
    """
    train_penalty_dict = {}
    valid_penalty_dict = {}

    train_normal_dict = {}
    valid_normal_dict = {}

    train_indiff_dict = {}
    valid_indiff_dict = {}

    # The true labels do not depend on the model, so compute them only once.
    y_train_labels = make_regression_labels(true_thresh, y_train)
    y_valid_labels = make_regression_labels(true_thresh, y_valid)

    for n in tqdm(n_est_list):
        train_penalty_dict[n] = []
        valid_penalty_dict[n] = []

        train_normal_dict[n] = []
        valid_normal_dict[n] = []

        train_indiff_dict[n] = []
        valid_indiff_dict[n] = []

        for my_depth in max_depth_list:

            # No explicit criterion: squared error is the regressor's default,
            # and its spelling ("mse" vs "squared_error") differs between
            # scikit-learn versions.
            clf = RandomForestRegressor(n_estimators=n, max_depth=my_depth)
            clf.fit(X_train[list_columns], y_train)

            y_pred_train = clf.predict(X_train[list_columns])
            y_pred_valid = clf.predict(X_valid[list_columns])

            y_pred_train_labels = make_regression_labels(predict_thresh, y_pred_train)
            y_pred_valid_labels = make_regression_labels(predict_thresh, y_pred_valid)

            train_penalty_dict[n].append(penalty_precision(y_train_labels, y_pred_train_labels))
            valid_penalty_dict[n].append(penalty_precision(y_valid_labels, y_pred_valid_labels))

            train_normal_dict[n].append(normal_precision(y_train_labels, y_pred_train_labels))
            valid_normal_dict[n].append(normal_precision(y_valid_labels, y_pred_valid_labels))

            train_indiff_dict[n].append(indiff_precision(y_train_labels, y_pred_train_labels))
            valid_indiff_dict[n].append(indiff_precision(y_valid_labels, y_pred_valid_labels))

        print("train: n_est={}, score:{}".format(n, train_penalty_dict[n]))
        print("valid: n_est={}, score:{}".format(n, valid_penalty_dict[n]))
        print("----------------------------------------")
    print("----------------------------------------")
    print("----------------------------------------")
    return train_penalty_dict, valid_penalty_dict, train_normal_dict, valid_normal_dict, train_indiff_dict, valid_indiff_dict

In [39]:
def test_training_days(my_model, X_train, y_train, X_test, y_test, my_features, n_step=1):
    """Measure test precision as a function of how many recent days are trained on.

    For n_days = 1, 1 + n_step, ... (below the number of days spanned by
    X_train) the model is refitted on the rows whose sendingTimeFullStr is at
    most n_days before the last training timestamp, then scored on the test
    set with labels made at threshold 0.

    X_train must contain the sendingTimeFullStr column, with rows in
    chronological order (the first and last rows define the time span).
    y_train must have one entry per row of X_train, in the same order.

    Returns (pen_prec, norm_prec, indiff_prec): three lists with one score per
    tested n_days.
    """
    pen_prec = []
    norm_prec = []
    indiff_prec = []

    # to_datetime leaves datetimes untouched and parses text timestamps
    send_times = pd.to_datetime(X_train['sendingTimeFullStr'])

    # Positional access: the frame is indexed by signal id, so label lookups
    # with 0 / -1 would not address the first / last row.
    start_date = send_times.iloc[0]
    end_date = send_times.iloc[-1]

    # end - start, so that the span is positive
    total_days = (end_date - start_date).days

    for n_days in tqdm(range(1, total_days, n_step)):

        date_cutoff = end_date - pd.Timedelta(days=n_days)

        # Plain boolean array: iloc does not accept a boolean Series as mask
        is_recent = (send_times >= date_cutoff).values
        X_train_temp = X_train.iloc[is_recent][my_features]
        y_train_temp = y_train.iloc[is_recent]

        my_model.fit(X_train_temp, y_train_temp)
        y_pred = my_model.predict(X_test[my_features])
        y_test_labels = make_regression_labels(0, y_test)
        y_pred_labels = make_regression_labels(0, y_pred)

        pen_prec.append(penalty_precision(y_test_labels, y_pred_labels))
        norm_prec.append(normal_precision(y_test_labels, y_pred_labels))
        indiff_prec.append(indiff_precision(y_test_labels, y_pred_labels))

    return pen_prec, norm_prec, indiff_prec

### Confusion Matrix

In [40]:
def print_conf_matrix(y_test, y_pred, my_labels):
    """Draw the confusion matrix of y_pred against y_test as a heatmap.

    y_test    : true labels.
    y_pred    : predicted labels.
    my_labels : tick labels for both axes. When every observed label value is
                contained in my_labels, my_labels also fixes the rows and
                columns of the matrix, so a class that happens to be absent
                still gets its (all-zero) row and column and the tick labels
                line up.
    """
    ax = plt.subplot()

    observed = set(np.unique(np.concatenate([np.ravel(y_test), np.ravel(y_pred)])).tolist())
    if observed and observed.issubset(set(my_labels)):
        cm = confusion_matrix(y_test, y_pred, labels=list(my_labels))
    else:
        # my_labels are display names only; let the matrix infer its classes
        cm = confusion_matrix(y_test, y_pred)

    sns.heatmap(cm, annot = True, fmt = "d", ax = ax)
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.xaxis.set_ticklabels(my_labels)
    ax.yaxis.set_ticklabels(my_labels)

    ax.set_title('Confusion Matrix')


In [41]:
def plot_labeled_many(X, Y, predict_labels, continuous_features, my_labels, colors):
    """Scatter Y against each feature of X, points coloured by their label.

    X = DataFrame holding the features
    Y = values for the y axis (labelled "True target")
    predict_labels = numeric label of every point
    continuous_features = column names of X; one figure is made per name
    my_labels = text shown on the colorbar, one entry per colour, in ascending
                label order, e.g. [0, 1]
    colors = one colour per label value, e.g. ["red", "blue"]
    """
    # The colorbar spans [min label, max label] split into len(colors) equal
    # bands; put one tick in the middle of each band. This also works for
    # negative labels and when the largest label is 0.
    lowest = min(predict_labels)
    highest = max(predict_labels)
    n_bands = len(colors)
    band_width = (highest - lowest) / float(n_bands)
    loc = lowest + (np.arange(n_bands) + 0.5) * band_width

    for my_feature in tqdm(continuous_features):
        plt.figure()
        plt.scatter(X[my_feature], Y, c=predict_labels, s=10, cmap=ListedColormap(colors))
        # The on/off flag is passed positionally: its keyword was renamed from
        # `b` to `visible` in Matplotlib, positional use works with both.
        plt.grid(True, which='major', color='#666666', linestyle='-')
        plt.xlabel(my_feature)
        plt.ylabel("True target")
        plt.title("Target vs {}".format(my_feature))
        cb = plt.colorbar()
        cb.set_ticks(loc)
        cb.set_ticklabels(my_labels)

### Miscellaneous code

In [42]:
# deriv_df = pd.DataFrame()
# deriv_df['time'] = firsts.sendingTime[shft:]
# deriv_df['speed'] = speed[shft:]
# time_cutoff = deriv_df[deriv_df.speed == deriv_df.speed.max()].time
# new_train = train[train.sendingTime>=int(time_cutoff)]

In [43]:
# import pyodbc
# def queryFromDB(DB_DATABASE, bunch, date, startTime, endTime):
#     DB_USER = 'sa'
#     DB_PASS = os.environ['DB_PASS']  # credential removed from the notebook; supply it through the environment
#     DB_SERVER = '192.168.50.102'

#     cnx_params = "Driver={SQL Server};" + """Server={0};
#                       UID={1};
#                       PWD={2};
#                       Database={3};""".format(DB_SERVER,
#                                               DB_USER,
#                                               DB_PASS,
#                                               DB_DATABASE)
#     cnx = pyodbc.connect(cnx_params, autocommit=True)

#     query = "spGetMarket @startingTime='" + startTime + "', @endingTime='" + endTime + "', @bunch='" + bunch + "', @date='" + date + "', @columns='gwRequestTime, sendingTime, sendingTimeFullStr, eventId, sequence, event, simpleMid, mid' "
#     target = pd.read_sql(query, cnx)

#     return target