In [1]:
# import dependencies
# numerical / dataframe libraries and the scikit-learn models used below
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Load the dataset from the Resources folder (the path is relative to the
# notebook's working directory) and preview the first five rows.
file_path = '../Resources/work_hr.csv'
work_df = pd.read_csv(file_path)
work_df.head()

Unnamed: 0,date_created,time_created,values,ranges,day_sections,day_of_week
0,2022-03-28,18:44:22,96.0,normal,0,0
1,2022-03-28,18:45:04,105.0,normal,0,0
2,2022-03-28,18:45:09,98.0,normal,0,0
3,2022-03-28,18:45:14,100.0,normal,0,0
4,2022-03-28,18:45:19,100.0,normal,0,0


In [41]:
# Build the edges for equal-width bins spanning the observed range of `values`.
N_VALUE_BINS = 4
# The outermost edges are replaced by fixed limits chosen to sit outside the
# observed readings so that every row lands in a bin. Readings outside
# (LOWER_EDGE, UPPER_EDGE] would be labelled NaN by pd.cut.
LOWER_EDGE = 40
UPPER_EDGE = 205

# observed range of the readings
max_values = work_df['values'].max()
min_values = work_df['values'].min()
# width of a single bin
values_spacing = (max_values - min_values) / N_VALUE_BINS
# integer labels, one per bin: 1 is the lowest range, N_VALUE_BINS the highest
values_bins = list(range(1, N_VALUE_BINS + 1))

# np.linspace always returns exactly N_VALUE_BINS + 1 edges. np.arange with a
# float step can gain or lose an edge through rounding, which would leave the
# number of edges out of step with the number of labels.
bins = list(np.linspace(min_values, max_values, N_VALUE_BINS + 1))

# widen the first and last edges to catch every reading
bins[0] = LOWER_EDGE
bins[-1] = UPPER_EDGE

bins

[40, 84.5, 124.0, 163.5, 205]

In [42]:
# Label each row with the number of the bin its `values` entry falls in.
# pd.cut uses right-closed intervals by default, so a value equal to the lowest
# edge, or outside the edges altogether, becomes NaN.
work_df["values_bins"] = pd.cut(work_df["values"], bins, labels=values_bins)
# Show how many rows landed in each bin.
work_df["values_bins"].value_counts()

1    5819
2    5014
3     262
4     199
Name: values_bins, dtype: int64

In [43]:
# Keep only the columns used for modelling: the binned target and the two
# feature columns.
ml_df = work_df[['values_bins', 'day_sections', 'day_of_week']]

In [44]:
# Separate the target (bin label) from the features. The double brackets keep
# each selection a DataFrame rather than a Series.
target_df = ml_df[['values_bins']]
feature_df = ml_df[['day_sections', 'day_of_week']]

In [45]:
# Conventional scikit-learn names: X for the feature matrix, y for the target.
X = feature_df
y = target_df

In [46]:
# Split into training and testing sets. No test_size is given, so
# scikit-learn's default 75% / 25% split applies; random_state fixes the
# shuffle so the split is reproducible.
# NOTE(review): the bin counts are heavily imbalanced (bins 3 and 4 are rare);
# consider passing stratify=y so both splits keep the same class mix - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

## Linear Regression

In [47]:
# Create an ordinary least-squares linear regression with default settings.
model_regression = LinearRegression()

In [48]:
# Fit the regression on the training split. The target is the bin label, which
# the regression treats as a plain number.
model_regression.fit(X_train, y_train)

LinearRegression()

In [49]:
# For regressors, score() returns the coefficient of determination (R^2),
# evaluated here on the held-out test split.
model_regression.score(X_test, y_test)

0.11422502366651788

## Random Forest Classifier

In [50]:
# Create a 100-tree random forest classifier and fit it on the training split.
# random_state pins the forest's randomness so a refit reproduces the same model.
model_rfc = RandomForestClassifier(random_state=1, n_estimators=100)
# y_train is a single-column DataFrame. scikit-learn expects a 1-D target and
# emits a DataConversionWarning when handed a column vector, so flatten it to
# shape (n_samples,) before fitting.
model_rfc.fit(X_train, y_train.to_numpy().ravel())

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(random_state=1)

In [51]:
# For classifiers, score() returns mean accuracy; report it on both splits so
# the training and testing figures can be compared.
print(f'Training Score: {model_rfc.score(X_train, y_train)}')
print(f'Testing Score: {model_rfc.score(X_test, y_test)}')

Training Score: 0.6429752066115703
Testing Score: 0.6441218130311614


In [52]:
# Impurity-based importance of each feature, in the column order of the
# training frame; the values sum to 1.
feature_importances = model_rfc.feature_importances_
feature_importances

array([0.3360035, 0.6639965])

In [53]:
# Predict a bin label for every row of the test features.
y_pred = model_rfc.predict(X_test)
y_pred

array([1, 2, 1, ..., 2, 2, 1], dtype=int64)

In [54]:
# Start the comparison table from an independent copy of the test features.
# Binding X_test directly would only alias it, so any column added to the
# comparison table would also appear in X_test and break later predict()/score()
# calls that expect just the original feature columns.
comparison_pd = X_test.copy()

In [55]:
# Attach the predictions as a new column. assign() returns a new frame instead
# of writing into the existing one, so any frame that shares data with
# comparison_pd (such as the test features it was built from) is left untouched,
# and re-running this cell gives the same result.
comparison_pd = comparison_pd.assign(predicted_values=y_pred)

In [56]:
# Add the actual bin labels, aligned on the row index. join() raises when the
# incoming columns already exist, so drop any label columns left by an earlier
# run of this cell first; that keeps the cell safe to re-run.
comparison_pd = comparison_pd.drop(columns=y_test.columns, errors="ignore").join(y_test)

In [57]:
# Summarise the comparison table: columns, non-null counts and dtypes.
comparison_pd.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2824 entries, 3784 to 8612
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   day_sections      2824 non-null   int64   
 1   day_of_week       2824 non-null   int64   
 2   predicted_values  2824 non-null   int64   
 3   values_bins       2824 non-null   category
dtypes: category(1), int64(3)
memory usage: 155.7 KB


In [58]:
# Show the predicted and actual bin labels side by side.
comparison_pd[['predicted_values', 'values_bins']]

Unnamed: 0,predicted_values,values_bins
3784,1,1
4643,2,2
11253,1,1
1923,1,1
1839,1,1
...,...,...
4170,2,2
9723,1,1
2110,2,2
2375,2,1


In [59]:
# Print how often each bin was predicted, then how often it actually occurs.
for column_name in ('predicted_values', 'values_bins'):
    print(comparison_pd[column_name].value_counts())

2    1447
1    1377
Name: predicted_values, dtype: int64
1    1451
2    1259
3      62
4      52
Name: values_bins, dtype: int64
