In [1]:
import datetime
import os

import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the project data from the CSV snapshot into a DataFrame
df = pd.read_csv('./data/20190401_projects.csv')

In [3]:
# Preview the first rows to check that the file loaded as expected
df.head()

Unnamed: 0,project,case_type,number_pages,3d_modeling,hours,visio
0,1568503_111US8_IIG,design,6,no,2.75,no
1,847US1_DF_IIG,utility,16,no,11.0,no
2,7142_002_SMF_IIG,utility,5,no,3.75,no
3,843US1_CC_IIG,utility,9,no,5.75,no
4,34608_EG_IIG,utility,4,no,7.25,no


In [4]:
# Feature engineering

# Hours spent per page: the ratio that get_difficulty later bins into levels
df['hour_page'] = df['hours'].div(df['number_pages'])

## Create difficulty column

In [5]:
# Create variables to store location of difficulty bins.
# Each value is a cut point on hours-per-page ('hour_page'); get_difficulty
# compares a row's hour_page against them to assign a difficulty of 1-5.
level_one = 0.5  # below this -> difficulty 1; from here up to level_two -> 2
level_two = 1.25  # from here up to level_three -> difficulty 3
level_three = 2.5  # from here up to level_four -> difficulty 4
level_four = 3.75  # at or above this -> difficulty 5

In [6]:
def get_difficulty(row, thresholds=None):
    """
    Map a row's hours-per-page ratio to a difficulty level from 1 to 5.

    Parameters
    ----------
    row : object with a numeric ``hour_page`` attribute
        Typically a DataFrame row handed in by ``df.apply(..., axis=1)``.
    thresholds : sequence of four ascending numbers, optional
        Lower bounds of difficulty levels 2, 3, 4 and 5. When omitted, the
        notebook-level ``level_one`` .. ``level_four`` values are used.

    Returns
    -------
    int
        1-5 according to the bin ``row.hour_page`` falls in (each bin
        includes its lower bound and excludes its upper bound), or 0 when
        the ratio matches no bin, which happens for NaN.
    """
    if thresholds is None:
        thresholds = (level_one, level_two, level_three, level_four)
    first, second, third, fourth = thresholds

    ratio = row.hour_page
    # Each check is only reached once the lower bins have been ruled out, so
    # the lower bound of every bin is implied by the previous comparison.
    if ratio < first:
        return 1
    if ratio < second:
        return 2
    if ratio < third:
        return 3
    if ratio < fourth:
        return 4
    if ratio >= fourth:
        return 5
    # Every comparison above is False only for NaN; keep the 0 sentinel.
    return 0

In [7]:
# Create Difficulty column: get_difficulty is called once per row (axis=1)
# and reads that row's 'hour_page' value
df['difficulty'] = df.apply(get_difficulty, axis=1)

In [8]:
# Confirm the new 'hour_page' and 'difficulty' columns look as expected
df.head()

Unnamed: 0,project,case_type,number_pages,3d_modeling,hours,visio,hour_page,difficulty
0,1568503_111US8_IIG,design,6,no,2.75,no,0.458333,1
1,847US1_DF_IIG,utility,16,no,11.0,no,0.6875,2
2,7142_002_SMF_IIG,utility,5,no,3.75,no,0.75,2
3,843US1_CC_IIG,utility,9,no,5.75,no,0.638889,2
4,34608_EG_IIG,utility,4,no,7.25,no,1.8125,3


## Modeling

In [9]:
# Make a copy of df so that the in-place label encoding applied to df2
# leaves the original string-valued columns in df untouched
df2 = df.copy()

In [10]:
# Define binary categorical variables to be encoded.
# NOTE(review): 'case_type' is encoded here but is not among the model
# features selected in cols_to_use; 'visio' is left unencoded.
binary_categorical_to_encode = ['case_type', '3d_modeling']

In [11]:
def label_encode(dataset, column):
    """
    Integer-encode a categorical column of ``dataset`` in place.

    Used in this notebook for binary (two-valued) columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame holding the column; the column is overwritten with its codes.
    column : str
        Name of the column to encode.

    Returns
    -------
    LabelEncoder
        The fitted encoder. Its ``classes_`` attribute lists the original
        labels in code order, so the mapping can be inspected or re-applied
        to new data with ``transform``. Callers may ignore the return value.
    """
    encoder = LabelEncoder()
    dataset[column] = encoder.fit_transform(dataset[column])
    return encoder

In [12]:

# Label encode binary categorical columns; each call overwrites the named
# column of df2 with its integer codes
for variable in binary_categorical_to_encode:
    label_encode(df2, variable)

In [13]:
# Predictor columns for the regression
cols_to_use = ['number_pages', '3d_modeling', 'difficulty']
X = df2[cols_to_use]

# Target: hours, reshaped into a single-column 2-D array.
# NOTE(review): 'difficulty' is binned from hours / number_pages, so this
# feature is derived from the target itself -- confirm that a difficulty
# value is available independently of 'hours' when the model is used.
y = df2.hours.values.reshape(-1, 1)

#### Train Test Split

In [14]:
# Not doing the train/test split here, as it was already tested in the RStudio notebook. Here we're just
# constructing the model.


# Splitting the dataset into the Training set and Test set
#from sklearn.model_selection import train_test_split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, random_state = 0)

#### Fitting Multiple Linear Regression to the Full Dataset (no train/test split)

In [15]:
# Fit a linear regression on all rows of X and y (no hold-out set); this is
# the estimator that gets written to disk at the end of the notebook
regressor = LinearRegression()
regressor.fit(X, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [16]:
# Show one row of the feature matrix to confirm column order and encoding
X.head(1)

Unnamed: 0,number_pages,3d_modeling,difficulty
0,6,0,1


In [17]:
# Design matrix for statsmodels: a leading column of ones (the intercept)
# followed by the predictors in cols_to_use order, so the summary's
# const / x1 / x2 / x3 are intercept / number_pages / 3d_modeling / difficulty.
# Built from df2[cols_to_use] rather than from X itself, so re-running this
# cell does not stack a second ones column onto an already-augmented X.
X = np.append(arr = np.ones((len(df2), 1)).astype(int), values = df2[cols_to_use], axis = 1)
X_opt = X[:, [0, 1, 2, 3]]
# NOTE(review): `sm` is statsmodels.formula.api in this notebook; newer
# statsmodels releases appear to expose OLS via statsmodels.api instead --
# confirm against the installed version.
regressor_OLS = sm.OLS(endog = y, exog = X_opt).fit()
regressor_OLS.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.76
Model:,OLS,Adj. R-squared:,0.755
Method:,Least Squares,F-statistic:,152.6
Date:,"Tue, 30 Apr 2019",Prob (F-statistic):,1.14e-44
Time:,13:53:13,Log-Likelihood:,-317.01
No. Observations:,149,AIC:,642.0
Df Residuals:,145,BIC:,654.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-5.8074,0.660,-8.799,0.000,-7.112,-4.503
x1,0.5738,0.029,20.040,0.000,0.517,0.630
x2,0.7862,0.419,1.875,0.063,-0.043,1.615
x3,3.4832,0.266,13.100,0.000,2.958,4.009

0,1,2,3
Omnibus:,57.775,Durbin-Watson:,2.054
Prob(Omnibus):,0.0,Jarque-Bera (JB):,209.642
Skew:,1.429,Prob(JB):,3e-46
Kurtosis:,8.06,Cond. No.,43.0


## Store the Model

In [20]:
# NAMING SAVED MODELS
# Prefix identifying the kind of model
model_prefix = 'linear-regression_'
# Today's date (YYYYMMDD), stamped into the saved model's file name
current_date = datetime.date.today().strftime('%Y%m%d')
# Full model file name: ./models/<prefix><date>.joblib
model_name = ''.join(['./models/', model_prefix, current_date, '.joblib'])

In [21]:
# Store the model. Create the target directory first so the dump does not
# fail with FileNotFoundError when ./models does not exist yet.
os.makedirs(os.path.dirname(model_name) or '.', exist_ok=True)
joblib.dump(regressor, model_name)

['./models/linear-regression_20190430.joblib']