__NOTE__: Apply this oversampling only to the training data, after the train/test split has been made.

If we oversample before the train/test split, we must ensure that every duplicated row ends up in the training set; otherwise copies of the same row could appear in both train and test and leak information. We should strive to keep the test set as close to the actual process dynamics as possible, whereas we can afford to modify the training set a little more.

In [10]:
import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os

import warnings
warnings.filterwarnings('ignore')

In [26]:
def oversample_extremes(data, low_val=42, low_weight=2, high_val=48, high_weight=2,
                        target='Quality'):
    '''
    Duplicate the rows with extreme target values so that they carry more weight in
    machine learning training.

    INPUTS:
    data: a pandas dataframe containing the column named by 'target'
    low_val: value of the target strictly below which rows are duplicated
    low_weight: integer effective weight desired for the low rows
    high_val: value of the target strictly above which rows are duplicated
    high_weight: integer effective weight desired for the high rows
    target: name of the column the cutoffs are compared against (default 'Quality')

    RETURNS:
    newdata: a new pandas dataframe holding the original rows (in their original order),
    followed by the extra copies of the low rows, followed by the extra copies of the
    high rows. 'data' itself is never modified.

    RAISES:
    KeyError: if 'target' is not a column of 'data'
    TypeError: if 'low_weight' or 'high_weight' is not an integer

    NOTES:

    'low_weight' and 'high_weight' are the effective weight of a row relative to the
    rest of the dataframe: a weight of 2 adds 1 duplicate (the row is now in the data
    twice), a weight of 3 adds 2 duplicates, and a weight of 1 (or less) adds none.

    When at least one duplication pass is requested the result gets a fresh 0..n-1
    index; when both weights are 1 or less, an unchanged copy of 'data' (original
    index included) is returned.

    Rows whose target is exactly equal to a cutoff are NOT duplicated.

    For the dataset explored in this notebook the default cutoffs (42 and 48) select
    148 and 120 of 2709 rows, i.e. roughly the lowest and highest 5% -- well outside
    the interquartile range of about 44.35 to 46.31.
    '''
    # identify rows of interest
    low_df = data.loc[data[target] < low_val]
    high_df = data.loc[data[target] > high_val]

    # A weight of w means (w - 1) extra copies. range() is empty for w <= 1 and
    # raises TypeError for non-integer weights.
    extras = [low_df for _ in range(low_weight - 1)]
    extras += [high_df for _ in range(high_weight - 1)]

    if not extras:
        # nothing to add: hand back an independent copy, index untouched
        return data.copy()

    # DataFrame.append was removed in pandas 2.0; a single concat replaces it and
    # avoids re-copying the growing frame on every pass.
    return pd.concat([data, *extras], ignore_index=True)
    

In [27]:
# Load the dataset from CSV (relative path).
data = pd.read_csv("../../../datasets/anonymized_SAP_data.csv")

In [28]:
# Summary statistics of the 'Quality' column before any oversampling.
data['Quality'].describe()

count    2709.000000
mean       45.184728
std         1.886299
min        27.150000
25%        44.352500
50%        45.300000
75%        46.310000
max        50.510000
Name: Quality, dtype: float64

In [29]:
# Oversample using the function's default cutoffs and weights.
oversampled = oversample_extremes(data)

In [30]:
# Summary statistics of 'Quality' after oversampling, for comparison with the original.
oversampled['Quality'].describe()

count    2977.000000
mean       45.088875
std         2.248073
min        27.150000
25%        44.210000
50%        45.290000
75%        46.392500
max        50.510000
Name: Quality, dtype: float64

In [32]:
# Count the rows below the low cutoff of 42.
data.loc[data['Quality'] < 42].shape

(148, 23)

In [34]:
# Count the rows above the high cutoff of 48.
data.loc[data['Quality'] > 48].shape

(120, 23)

In [35]:
# Expected oversampled row count: original rows + low-cutoff rows + high-cutoff rows.
2709 + 148 + 120

2977

The oversampled row count (2977) matches the expected total, so the function is working as intended.

In [36]:
# With high_weight=3 each high row gets two extra copies:
# 2709 + 148 + 2*120 = 3097 rows expected.
oversampled = oversample_extremes(data, high_weight = 3)
oversampled.shape

(3097, 23)