In [9]:
!pip install plotly

Collecting plotly
  Downloading plotly-5.10.0-py2.py3-none-any.whl (15.2 MB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.0.1-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.10.0 tenacity-8.0.1


In [11]:
import pandas as pd # for data manipulation
import numpy as np # for data manipulation
from sklearn.linear_model import LinearRegression # to build a LR model for comparison
import plotly.graph_objects as go # for data visualization
import plotly.express as px # for data visualization 
import statsmodels.api as sm # to build a LOWESS model
from scipy.interpolate import interp1d # for interpolation of new data points

In [12]:
pd.set_option("Max_Columns", None)

In [13]:
df = pd.read_csv("[BRI] Regression Training.csv", low_memory = False)
df.head()

Unnamed: 0,Index,Year of Service,Q2 20 SC,Q2_Incentive,Grade,TOTAL REV_Jul20,Q2 20WM Revenue % Ach,Q2 20CASA Growth % Ach,TOTAL NQC/NTB %Ach_Jul20,TOTAL Mob_Banking %Ach_Jul20,Jul20_WM,Jul20_CS ACH
0,1,9,2.46,87863190.0,11,879401400.0,4.832649,-0.346644,0.0,0.485597,750455620.8,0.26
1,2,10,3.255,78986790.0,12,819238900.0,2.355982,-21.088729,0.5,0.712338,427931769.8,0.74
2,3,5,1.7825,21667960.0,11,779310900.0,0.907896,8.469858,0.0,0.723232,469477058.6,1.86
3,4,4,1.7875,139870300.0,12,2084217000.0,4.331416,-9.162848,1.0,0.562074,824765897.9,0.97
4,5,8,1.9825,14407240.0,11,651402200.0,0.393369,11.763723,0.0,0.460099,426004328.0,1.14


In [14]:
df.shape

(144, 12)

In [15]:
df.columns

Index(['Index', 'Year of Service', 'Q2 20 SC ', 'Q2_Incentive', 'Grade',
       'TOTAL REV_Jul20', 'Q2 20WM Revenue % Ach', 'Q2 20CASA Growth % Ach',
       'TOTAL NQC/NTB %Ach_Jul20', 'TOTAL Mob_Banking %Ach_Jul20', 'Jul20_WM',
       'Jul20_CS ACH'],
      dtype='object')

In [16]:
df.isna().sum()

Index                           0
Year of Service                 0
Q2 20 SC                        0
Q2_Incentive                    0
Grade                           0
TOTAL REV_Jul20                 0
Q2 20WM Revenue % Ach           0
Q2 20CASA Growth % Ach          0
TOTAL NQC/NTB %Ach_Jul20        0
TOTAL Mob_Banking %Ach_Jul20    0
Jul20_WM                        0
Jul20_CS ACH                    0
dtype: int64

In [17]:
df.dropna(axis = 0, how = 'any', inplace = True)

In [18]:
df.isna().sum()

Index                           0
Year of Service                 0
Q2 20 SC                        0
Q2_Incentive                    0
Grade                           0
TOTAL REV_Jul20                 0
Q2 20WM Revenue % Ach           0
Q2 20CASA Growth % Ach          0
TOTAL NQC/NTB %Ach_Jul20        0
TOTAL Mob_Banking %Ach_Jul20    0
Jul20_WM                        0
Jul20_CS ACH                    0
dtype: int64

In [19]:
df.columns

Index(['Index', 'Year of Service', 'Q2 20 SC ', 'Q2_Incentive', 'Grade',
       'TOTAL REV_Jul20', 'Q2 20WM Revenue % Ach', 'Q2 20CASA Growth % Ach',
       'TOTAL NQC/NTB %Ach_Jul20', 'TOTAL Mob_Banking %Ach_Jul20', 'Jul20_WM',
       'Jul20_CS ACH'],
      dtype='object')

In [20]:
df.shape

(144, 12)

In [21]:
# Create a scatter plot
fig = px.scatter(df, x=df['Q2_Incentive'], y=df['Q2 20 SC '], 
                 opacity=0.8, color_discrete_sequence=['black'])

# Change chart background color
fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

# Set figure title
fig.update_layout(title=dict(text="Score Card Vs. Incentive Scatter Plot", 
                             font=dict(color='black')))

# Update marker size
fig.update_traces(marker=dict(size=3))

fig.show()

In [22]:
# ------- Select variables -------
# x values for Linear Regression
X=df['Q2_Incentive'].values.reshape(-1,1) # Note, we need X to be a 2D array, hence reshape
# x values for LOWESS
x=df['Q2_Incentive'].values 
# y values for both
y=df['Q2 20 SC '].values


# ------- Linear Regression -------
# Define and fit the model
model1 = LinearRegression()
LR = model1.fit(X, y)

# Predict a few points with Linear Regression model for the grpah
# Create 20 evenly spaced points from smallest X to largest X
x_range = np.linspace(X.min(), X.max(), 20) 
# Predict y values for our set of X values
y_range = model1.predict(x_range.reshape(-1, 1))


# ------- LOWESS -------
# Generate y_hat values using lowess, try a couple values for hyperparameters
lowess = sm.nonparametric.lowess
y_hat1 = lowess(y, x) # note, default frac=2/3
y_hat2 = lowess(y, x, frac=1/5)


invalid value encountered in true_divide



In [23]:
# Create a scatter plot
fig = px.scatter(df, x=df['Q2_Incentive'], y=df['Q2 20 SC '], 
                 opacity=0.8, color_discrete_sequence=['black'])

# Add the prediction line
fig.add_traces(go.Scatter(x=x_range, y=y_range, name='Linear Regression', line=dict(color='limegreen')))
fig.add_traces(go.Scatter(x=y_hat1[:,0], y=y_hat1[:,1], name='LOWESS, frac=2/3', line=dict(color='red')))
fig.add_traces(go.Scatter(x=y_hat2[:,0], y=y_hat2[:,1], name='LOWESS, frac=1/5', line=dict(color='blue')))

# Change chart background color
fig.update_layout(dict(plot_bgcolor = 'white'))

# Update axes lines
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='lightgrey', 
                 zeroline=True, zerolinewidth=1, zerolinecolor='lightgrey', 
                 showline=True, linewidth=1, linecolor='black')

# Set figure title
fig.update_layout(title=dict(text="Score Card Vs. Incentive Scatter Plot", 
                             font=dict(color='black')))

# Update marker size
fig.update_traces(marker=dict(size=3))

fig.show()

In [24]:
from regressors import stats
print("coef_pval:\n", stats.coef_pval(model1, X, y))

# to print summary table:
print("\n=========== SUMMARY ===========")
xlabels = ['Q2_Incentive']
stats.summary(model1, X, y, xlabels)

coef_pval:
 [1.68087766e-13 0.00000000e+00]

Residuals:
    Min      1Q  Median   3Q    Max
-7.2461 -0.1382  0.0828 0.25 3.4316


Coefficients:
              Estimate  Std. Error  t value  p value
_intercept    0.645356    0.079207   8.1478      0.0
Q2_Incentive  0.000000    0.000000  10.8438      0.0
---
R-squared:  0.35499,    Adjusted R-squared:  0.35045
F-statistic: 78.15 on 1 features


In [25]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):

    # Regression metrics
    explained_variance=metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error=metrics.mean_absolute_error(y_true, y_pred) 
    mse=metrics.mean_squared_error(y_true, y_pred)
    median_absolute_error=metrics.median_absolute_error(y_true, y_pred)
    r2=metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [26]:
# R-square of Linear Regression
y_lin_pred = model1.predict(X)
regression_results(y, y_lin_pred)

explained_variance:  0.355
r2:  0.355
MAE:  0.3537
MSE:  0.6089
RMSE:  0.7803


In [28]:
# R-squared of LOWESS frac=2/3
from scipy.interpolate import interp1d
from sklearn.metrics import r2_score

f_nearest = interp1d(y_hat1[:,0], y=y_hat1[:,1], bounds_error=False, kind='nearest', fill_value='extrapolate')
y_pred1 = f_nearest(x)
regression_results(y, y_pred1)

explained_variance:  0.431
r2:  0.4255
MAE:  0.3203
MSE:  0.5424
RMSE:  0.7365


In [29]:
# R-squared of LOWESS frac=1/5
from scipy.interpolate import interp1d
from sklearn.metrics import r2_score

f_nearest = interp1d(y_hat2[:,0], y=y_hat2[:,1], bounds_error=False, kind='nearest', fill_value='extrapolate')
y_pred2 = f_nearest(x)
print(r2_score(y, y_pred2))

ValueError: Input contains NaN.