In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

%matplotlib inline

#### Reading-In and Cleaning Up the Data

The first thing to do is to take a hold-out set. This is different from the training and testing set in that it will only be used at the very end of our work in order to select  between the various models in sklearn using the training and testing set.

Using the train/test parlance, the hold-out set might be considered the "exam" data. For this, I will select 10% of the data.

In [2]:
data_path = './office_supply.csv'
df = pd.read_csv(data_path)
holdout_indices = np.random.choice(df.index, replace=False, size=round(len(df.index)*.1))
holdout_indices[:10]

array([25192, 12077, 19707, 17760, 45501, 44784,  6862, 38064, 45819,
        5265], dtype=int64)

With the inidices for the holdout data in hand, I will read in all the data and separate the holdout-data from the training data.

In [3]:
holdout_data = df.loc[holdout_indices,:]
print("The length of the holdout data: ",len(holdout_data))
training_data = df.drop(holdout_indices, axis='rows')
# dropping the indices of the holdout data from the full dataframe leaves the training data
print("The length of the training data: ",len(training_data))

The length of the holdout data:  4641
The length of the training data:  41766


#### Below are the functions which will be used to preprocess the data:

In [9]:
# Function for preprocessing data
def office_preprocess(X,y):
    # Hard-code lists for dropping and to_bool
    # Dropped variables include dates and features with many missing values
    to_drop = ['date_of_last_transaction', 'date_of_first_purchase',
               'customer_number', 'language',
               'last_transaction_channel', 'number_of_employees']
    to_bool = ['desk', 'executive_chair', 'standard_chair',
               'monitor', 'printer','computer', 'insurance',
               'toner', 'office_supplies']
    # Hard-code values for notice, auto, and prem
    notice = "NOTICE"
    auto = "AUTO RENEW"
    prem = "Premier"

    # Function to convert and fill "Y/N" features
    def convert_fill_bool(val):
        if val == 'Y': return True
        else: return False

    # Function to encode the service as "premium" : true or false
    def encode_service(val):
        if val == prem: return True
        else: return False

    # Function to encode the repurchase feature into two columns: "notice" true/false and "auto_renew" true/false
    # "payment" plan implied by "false" in "notice" and "auto_renew" columns
    def encode_repurchase(series):

        def notice_encode(val):
            if val == notice: return True
            else: return False

        def auto_renew_encode(val):
            if val == auto: return True
            else: return False

        ser_notice = series.apply(notice_encode)
        ser_notice.name = "repurchase_notice"
        ser_auto = series.apply(auto_renew_encode)
        ser_auto.name = "repurchase_auto"

        return pd.concat([ser_notice, ser_auto], axis = 'columns')

    # Function to transform campaign_period_sales to a float
    def transform_target(raw):
        # make sure the value is initially cast as a string
        raw = str(raw)

        # determine if negative or not
        if raw.count("(") > 0: sign = -1
        else: sign = 1

        # remove all spaces, commas, dollar signs, and parentheses
        for to_rem in [" ",",","$", "(",")"]:
            raw = raw.replace(to_rem,"")
        return sign *float(raw)

    y_trans = y.apply(transform_target)

    X_trans = X.drop(to_drop, axis = 'columns')

    for col in to_bool:
        X_trans[col] = X_trans[col].apply(convert_fill_bool)

    X_trans['premier_service'] = X_trans['service_level'].apply(encode_service)
    X_trans.drop('service_level', axis = 'columns', inplace = True)

    repurch = encode_repurchase(X_trans['repurchase_method'])
    X_trans = pd.concat([X_trans.drop('repurchase_method', axis = 'columns'), repurch], axis = 'columns')

    return X_trans, y_trans

def rename_columns(df):
    df.columns = [col.strip().replace(' ', '_').lower() for col in df.columns]
    return df

def pull_out_target_pass_to_preprocess(df):
    # Pull out target and explanatory variables
    X = df.drop('campaign_period_sales', axis = 'columns')
    y = df['campaign_period_sales']

    X, y = office_preprocess(X,y)

    return pd.concat([y,X],axis = 'columns')

In [10]:
# Perform pre-processing on both holdout and fitting data
holdout_data = rename_columns(holdout_data)
holdout_data = pull_out_target_pass_to_preprocess(holdout_data)

training_data = rename_columns(training_data)
training_data = pull_out_target_pass_to_preprocess(training_data)

print(holdout_data.head(1))
print(training_data.head(1))


       campaign_period_sales  number_of_transactions  \
25192                 393.94                      18   

       do_not_direct_mail_solicit  do_not_email  do_not_telemarket  \
25192                       False         False              False   

       email_available   desk  executive_chair  standard_chair  monitor  \
25192             True  False            False           False    False   

       printer  computer  insurance  toner  office_supplies  premier_service  \
25192    False     False      False  False             True            False   

       repurchase_notice  repurchase_auto  
25192              False             True  
   campaign_period_sales  number_of_transactions  do_not_direct_mail_solicit  \
0                 107.16                      20                       False   

   do_not_email  do_not_telemarket  email_available   desk  executive_chair  \
0         False              False            False  False            False   

   standard_chair  monitor

Now I have the holdout data, which will be used for the final evaluation of the model -- indicating precisely how well the model will perform.

To select the model, I need to create the train_test_split. At this point I will separate out the target (y) and explanatory data (X):

In [12]:
X = training_data.drop('campaign_period_sales', axis = 'columns')
y = training_data['campaign_period_sales']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .25)


Finally I can proceed with model building. The target is a continuous variable - campaign period sales.

Regressors are used with data that is continuous.

Below, two tree models and a simple linear regression are   fit and used to predict on the test set of data.

In [13]:
# Instantiate Model
dt = DecisionTreeRegressor()
rf = RandomForestRegressor(n_estimators=100)
lr = LinearRegression()
# Fit Models
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
lr.fit(X_train, y_train)
# Find Score on Testing dAta
print("Decision Tree r2 Score:", dt.score(X_test, y_test))
print("Random Forest r2 Score:", rf.score(X_test, y_test) )
print("Linear Regression r2 Score:", lr.score(X_test, y_test))

Decision Tree r2 Score: 0.44270127917863533
Random Forest r2 Score: 0.49324829761428496
Linear Regression r2 Score: 0.4966728806947007


Only the Decision tree r2 score looks slightly worse.

I'm going to use the Random Forest model.

The next step will be to fit the Random Forest on both the training and testing data and <i>then</i> I can see how well it performs on the holdout data.

In [14]:
# Fitting model on ALL the training data
rf.fit(X,y)
# Splitting out target in the holdout data
holdout_target = holdout_data['campaign_period_sales']
holdout_explanitory = holdout_data.drop('campaign_period_sales', axis = 'columns')
# Finding r2 score
rf.score(holdout_explanitory, holdout_target)


0.5122608920418055

Notably, the model did not perform markedly better (or worse) on the holdout data. This is actually a good thing. At this point, I can be pretty confident that the Random Forest Model will account for between 45 and 50% of the variance in the sales data.

If the scores were inconsistent, that would most likely be a sign that the model was getting "lucky" on the data being picked in the holdout-set.


# Fitting Models in sklearn

In the above project, I went through multiple steps to clean the data, instantiate the model, fit the model to the data, and make predicitions or score the model.

### Step 1: Use pandas to clean and normalize the data.

### Step 2: Instantiate the model.

In [15]:
decision_tree = DecisionTreeRegressor()
random_forest = RandomForestRegressor(n_estimators=100)
linear_regression = LinearRegression()

### Step 3: Fit the model.

In [16]:
decision_tree.fit(X_train, y_train)
random_forest.fit(X_train, y_train)
linear_regression.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

### Step 4: Make Predictions and/or Scores.

In [17]:
print(".predict(): ", decision_tree.predict(X_test))
print(".score(): ", linear_regression.score(X_test, y_test))

.predict():  [ 433.58        355.67597826  392.11450292 ...  450.296      3922.53
  384.22448087]
.score():  0.4966728806947007
