In [None]:
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from scipy import stats

# Read Data

In [None]:
# Load the raw data set and its "for viewing" variant
df_raw = pd.read_csv('../data/transformed_data_raw.csv')
df_view = pd.read_csv('../data/transformed_data_for_viewing.csv')

# Keep the last 200 observations aside for testing; all earlier rows form the
# working set. reset_index() stores the previous index in an 'index' column.
holdout_rows = 200
df_unseen = df_raw.iloc[-holdout_rows:].reset_index()
df = df_raw.iloc[:-holdout_rows].reset_index()

In [None]:
# Make the column names usable in formulas: spaces become underscores and
# square brackets are removed.
# regex=False treats every pattern as a literal string. The former patterns
# '\[' and '\]' only worked while pandas defaulted to regex=True; with the
# regex=False default of pandas >= 2.0 they no longer match a bare bracket
# (and they are invalid escape sequences in a normal Python string).
df.columns = (
    df.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)


### Specify variables

In [None]:
# Explanatory variables offered to the user, one slider each
variables = [
    'Occupation',
    'Family_Type',
    'Family_Interest',
    'House_Type',
    'Number_Of_Residents',
    'Average_Age',
    'Distance_To_Nearest_Tower_m',
    'Number_Of_Phones',
    'Number_Of_Computers',
    'Number_Of_Tvs',
    'Number_Of_Pets',
    'Customer_Happiness',
    'Time_Spend_On_YouTube_min',
    'Time_Spend_On_TikTok_min',
    'Time_Spend_On_Instagram_min',
    'Time_Spend_On_Spotify_min',
    'Size_of_home_m2',
]

## Make sliders

In [None]:
# Range of the weight sliders and their starting position (the midpoint).
# Integer division keeps the start value an int, which is what the integer
# sliders built from these constants expect.
slider_min = -10
slider_max = 10
slider_value = (slider_min + slider_max) // 2

In [None]:
# Shared widget layout: automatic width, fixed 40px height
layout = widgets.Layout(
    width='auto',
    height='40px',
)


In [None]:
def _weight_slider(variable_name):
    """Build the integer slider for one variable.

    The label is the variable name with underscores shown as spaces, followed
    by ': '. Range and start value come from slider_min / slider_max /
    slider_value.
    """
    return widgets.IntSlider(
        value=slider_value,
        min=slider_min,
        max=slider_max,
        step=1,
        description=variable_name.replace("_", " ") + ': ',
        style={'description_width': '40%'},
        layout=widgets.Layout(width='40%'),
    )


# One slider per explanatory variable, keyed by variable name
sliders = {variable_name: _weight_slider(variable_name) for variable_name in variables}

In [None]:
# Render every slider, in the order the variables were declared
display(*sliders.values())

In [None]:
### Store sliders in Dataframe

# button = widgets.Button(description="Save weights")
# output = widgets.Output()

# display(button, output)

# def on_button_clicked(b):
#   #Get slider values as pandas df
#   df_weights = pd.DataFrame(columns=['variable', 'weight'])
#   counter = 0
#   for i in variables: 
#     new_name =  i.replace(' ','_')
#     df_weights.loc[counter,'variable'] = i
#     df_weights.loc[counter,'weight']= globals()[f'slider_{new_name}'].value
#     counter +=1 

#   with output:
#       print("Weights Saved")

# button.on_click(on_button_clicked)

In [None]:
# Button that stores the current slider positions as the model weights
get_data_button = widgets.Button(description='Save Weights')

# Keep a dedicated reference to this cell's output area. The global name
# `output` is rebound by the evaluation cell further down and the callback
# looks names up at click time, so using `output` directly would make
# "Weights Saved" appear in the evaluation output instead.
weights_output = widgets.Output()
output = weights_output


def _collect_weights():
    """Return the current slider values as a DataFrame.

    Columns: 'variable' (slider key) and 'weight' (the slider's integer
    value), one row per entry of `sliders`, in dictionary order.
    """
    return pd.DataFrame({
        'variable': list(sliders.keys()),
        'weight': [slider.value for slider in sliders.values()],
    })


def get_data(b):
    """Click handler: snapshot the slider values into `get_data.data`.

    Parameters
    ----------
    b : the clicked button (passed by ipywidgets, not used).

    Returns
    -------
    The DataFrame that was stored in `get_data.data`.
    """
    # Save first, report afterwards, so the message is only shown once the
    # weights really are stored.
    get_data.data = _collect_weights()

    # Print inside the output widget so the text shows up below the button.
    with weights_output:
        print("Weights Saved")
        print(get_data.data)

    return get_data.data


# Seed the stored weights with the sliders' current values so that cells
# reading `get_data.data` also work on a fresh "Run All", before the button
# has been clicked.
# NOTE(review): the evaluation cell further down defines another function
# named `get_data`; after that cell has run, `get_data.data` refers to an
# attribute of that later function.
get_data.data = _collect_weights()

In [None]:
# Hook up the click handler, then render the button followed by its output area
get_data_button.on_click(callback=get_data)
for widget in (get_data_button, output):
    display(widget)

In [None]:
# Store weights as dataframe.
# Work on a copy: the weights are rescaled in place further down, and without
# the copy that rescaling would also overwrite the values saved in
# get_data.data.
df_weights = get_data.data.copy()
df_weights

## Create manual linear regression using user input

In [None]:

# Normalize weights between -1 and 1.
# The scale is derived from the slider range instead of a hard-coded 10, and
# the result is computed from the saved (raw) weights into a new frame, so
# re-running this cell does not divide the weights a second time.
weight_scale = max(abs(slider_min), abs(slider_max))
df_weights = get_data.data.assign(weight=get_data.data['weight'] / weight_scale)

In [None]:
# Standardize (z-score) every numeric column, using only rows that have no
# missing value in any numeric column
numeric_complete_rows = df.select_dtypes(include=[np.number]).dropna()
df_z = numeric_complete_rows.apply(stats.zscore)


In [None]:

# Store mean and std of every standardized column to transform back.
# The statistics must match what df_z was built with: the same rows (numeric
# columns, rows with missing values dropped) and the population standard
# deviation (ddof=0, the default of scipy.stats.zscore). pandas' .std()
# defaults to ddof=1, which would make the back-transformation slightly off.
# Restricting to numeric columns also avoids calling .mean() on text columns.
numeric_complete_rows = df.select_dtypes(include=[np.number]).dropna()
mean_std = {
    column: (
        numeric_complete_rows[column].mean(),
        numeric_complete_rows[column].std(ddof=0),
    )
    for column in numeric_complete_rows.columns
}

In [None]:
def reverse_zscore(pandas_series, mean, std):
    """Map standardized values back to the original scale.

    Computes ``pandas_series * std + mean``.

    Parameters
    ----------
    pandas_series : the standardized values.
    mean : mean of the original variable before standardization.
    std : standard deviation of the original variable before standardization.

    Returns
    -------
    The rescaled values.
    """
    rescaled = pandas_series * std
    return rescaled + mean

# Look up the stored statistics of the target and undo its standardization
var = 'Mobile_Traffic'
target_stats = mean_std[var]
original_mean = target_stats[0]
original_std = target_stats[1]
original_var_series = reverse_zscore(
    pandas_series=df_z[var],
    mean=original_mean,
    std=original_std,
)


In [None]:
# Manual linear model on the standardized data (no intercept term):
#   y_hat = sum_i(weight_i * x_i)
Y = df_z['Mobile_Traffic']

# Scale each standardized predictor by the weight stored for it
X = df_z[variables].copy()
for column in variables:
    is_column = df_weights['variable'] == column
    alpha = df_weights.loc[is_column, 'weight'].values[0]
    X[column] = df_z[column] * alpha

# The row-wise sum of the weighted predictors is the prediction in z-score units
Y_pred = X.sum(axis=1)

# Actual target, and the prediction mapped back to the target's original scale
Y_actual = df['Mobile_Traffic']
Y_pred_trans = reverse_zscore(Y_pred, original_mean, original_std)


In [None]:
# # plot 
# import matplotlib.pyplot as plt

# plt.scatter(Y_pred_trans,Y_actual)
# plt.plot([0, max(max(Y_actual), max(Y_pred_trans))], [0,max(max(Y_actual), max(Y_pred_trans))], 'red')
# plt.title("Mobile Traffic - Manual Model")
# plt.ylabel("Actual")
# plt.xlabel("Predicted")
# plt.show()

# Evaluate model

In [None]:
# Use MAE because it's the easiest to understand ?
# NOTE(review): this cell rebinds get_data_button, output and get_data, names
# that the "Save Weights" cell above also defines. Once this cell has run,
# those names refer to the evaluation widgets/handler.
get_data_button = widgets.Button(description='Evaluate your model')
output = widgets.Output()


def get_data(b):
    """Click handler: print MAE and MAPE of the manual model.

    Reads the globals Y_actual and Y_pred_trans at click time and prints the
    result into `output`.

    Parameters
    ----------
    b : the clicked button (passed by ipywidgets, not used).

    Returns
    -------
    Always 0.
    """
    abs_error = (Y_actual - Y_pred_trans).abs()
    # Both metrics are pandas means, so rows without a prediction (NaN after
    # index alignment) are skipped consistently. The former builtin
    # sum(...)/len(...) turned the MAE into NaN as soon as one row was missing.
    mape = round((abs_error / Y_actual.abs()).mean() * 100, 2)
    mae = round(abs_error.mean(), 2)

    with output:
        print("Mean Absolute Error: " + str(mae))
        # The space after "to be" was missing, gluing the number to the text
        print("On average your model predicts the mobile traffic to be " + str(mae) + " GB off from the actual value")
        print("That corresponds to " + str(mape) + "% off the actual value on average" )
        print("")
        if mae < 2.3:
            print("Good job! Can you beat your own record?")
        elif mae >= 5:
            print("Try again! You can do better")

    return 0

In [None]:
# Hook up the click handler, then render the button followed by its output area
get_data_button.on_click(callback=get_data)
for widget in (get_data_button, output):
    display(widget)

In [None]:
%matplotlib inline
# MAGIC 
from matplotlib.pyplot import *
layout = widgets.Layout(width='auto', height='40px') #set width and height
# MAGIC 
button = widgets.Button(description="Plot your model")
out = widgets.Output()
# MAGIC 
def on_button_clicked(b):
    with out:
        Y_actual = df['Mobile_Traffic']
        Y_pred_trans = reverse_zscore(Y_pred, original_mean, original_std)
        plt.scatter(Y_pred_trans,Y_actual)
        plt.plot([0, max(max(Y_actual), max(Y_pred_trans))], [0,max(max(Y_actual), max(Y_pred_trans))], 'red')
        plt.title("Mobile Traffic - Manual Model")
        plt.ylabel("Actual")
        plt.xlabel("Predicted")
        plt.show()
        # show()
button.on_click(on_button_clicked)
# MAGIC 
display(button)
# MAGIC 
with out:
    print("")
    # plot([1,2],[1,2])
    # show()
out

## Make linear regression

In [None]:

# Create the regression formula: target on all explanatory variables.
# "-1" removes the intercept.
# The terms are taken from `variables` rather than from `X.columns`: `X` is
# rebound to a NumPy grid by the 3D-plot cell further down, so relying on it
# made this cell fail when re-run after that cell.
formula = 'Mobile_Traffic ~ -1 + ' + ' + '.join(variables)

In [None]:
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
# Fit ordinary least squares on the standardized data, then predict for the
# same rows (predictions are in z-score units)
ols_model = smf.ols(formula, data=df_z)
res = ols_model.fit()
lm_prediction = res.predict(df_z)
res.summary()

In [None]:
import matplotlib.pyplot as plt

# Bring the fitted values back to the target's original scale
lm_prediction_trans = reverse_zscore(lm_prediction, original_mean, original_std)

# End point of the red y = x reference line
diagonal_end = max(max(Y_actual), max(lm_prediction_trans))

fig, ax = plt.subplots()
ax.scatter(lm_prediction_trans, Y_actual)
ax.plot([0, diagonal_end], [0, diagonal_end], 'red')
ax.set_title("Mobile Traffic - Linear Model")
ax.set_ylabel("Actual")
ax.set_xlabel("Predicted")
plt.show()


In [None]:
# Showcase a non-linear relationship: mobile traffic against the distance to
# the nearest tower. Title and x label now describe this plot; they had been
# left over from the linear-model plot (title) and a "-" placeholder (x label).
plt.scatter(df['Distance_To_Nearest_Tower_m'], Y_actual)
plt.title("Mobile Traffic vs. Distance To Nearest Tower")
plt.ylabel("Mobile Traffic")
plt.xlabel("Distance To Nearest Tower (m)")
plt.show()

In [None]:
# Example 3d plot
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d

# Two-predictor linear model on the unstandardized data, shown as a plane
# together with the actual and predicted points.
model = smf.ols(formula='Mobile_Traffic ~ Average_Age + Number_Of_Phones', data=df)
results = model.fit()

# Skip the first exog name (the intercept term the formula adds) and keep the
# two predictor names
x, y = model.exog_names[1:]

# Evenly spaced values (step 1) across each predictor's observed range
x_range = np.arange(df[x].min(), df[x].max())
print(x_range.shape)
y_range = np.arange(df[y].min(), df[y].max())
print(y_range.shape)

# The grid is deliberately not called X / Y: those names hold the design
# matrix and target of the manual regression above, and overwriting them made
# earlier cells fail when re-run after this one.
grid_x, grid_y = np.meshgrid(x_range, y_range)

# Predict on every grid point to get the height of the regression plane
exog = pd.DataFrame({x: grid_x.ravel(), y: grid_y.ravel()})
grid_z = results.predict(exog=exog).values.reshape(grid_x.shape)
y_pred = results.predict(df[model.exog_names[1:]])

fig = plt.figure(figsize=plt.figaspect(1)*2)
ax = plt.axes(projection='3d')
ax.scatter(df[x].values, df[y].values, df[model.endog_names].values, label="Actual")
ax.scatter(df[x].values, df[y].values, y_pred, label="Pred")

ax.plot_surface(grid_x, grid_y, grid_z, rstride=1, cstride=1, alpha = 0.4)
ax.legend()
plt.show()

In [None]:
# from mpl_toolkits.mplot3d import Axes3D
# from matplotlib import cm
# from matplotlib.ticker import LinearLocator, FormatStrFormatter
# import matplotlib.pyplot as plt
# import numpy as np

# fig = plt.figure()
# ax = fig.gca(projection='3d')
# X = np.arange(-5, 5, 0.25)
# Y = np.arange(-5, 5, 0.25)
# X, Y = np.meshgrid(X, Y)
# R = np.sqrt(X**2 + Y**2)
# Z = np.sin(R)
# surf = ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=cm.coolwarm,
#                        linewidth=0, antialiased=False)
# ax.set_zlim(-1.01, 1.01)

# ax.zaxis.set_major_locator(LinearLocator(10))
# ax.zaxis.set_major_formatter(FormatStrFormatter('%.02f'))

# fig.colorbar(surf, shrink=0.5, aspect=5)

# plt.show()