In [0]:
%sh /home/ubuntu/databricks/python/bin/pip install https://files.pythonhosted.org/packages/2f/2f/45399c0a3b75d22a6ece1a1732a1670836cf284de7c1f91379a8d9b666a1/gmplot-1.4.1-py3-none-any.whl

In [0]:
import pickle as pkl
from pyspark.sql.functions import col, udf, struct
from pyspark.sql.types import DateType, TimestampType, FloatType, StructType
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, VectorIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import PipelineModel
import numpy as np
import pandas as pd

areas = spark.read.load("/dbfs/FileStore/tables/street_centroids.parquet")
# model = PipelineModel.load("/dbfs/FileStore/tables/maxims_model")

In [0]:
def sigmoid(z):
  z = np.array(z, dtype=np.float64)
  return 1/(1+np.exp(-z))

class OurModel:
  def __init__(self, input_size, output_size, hidden_size1 = 50, hidden_size2 = 50, w1 = None, w2 = None, w3= None):
    self.hidden_size1 = hidden_size1
    self.output_size = output_size
    self.w1 = w1
    if w1 is None:
      self.w1 = np.random.normal(size=(hidden_size1,input_size),scale = 0.3)
    self.w2 = w2
    if w2 is None:
      self.w2 = np.random.normal(size=(hidden_size2,hidden_size1),scale = 0.3)  # 10 like number of possible classifications
    self.w3 = w3
    if w3 is None:
      self.w3 = np.random.normal(size=(output_size,hidden_size2),scale = 0.3)  # 10 like number of possible classifications
    self.loss = 0 # at the start of every epoch should be set to 0

  def forward(self, x):
    x = np.array(x, ndmin=2)
    self.z1 = np.dot(self.w1,x.T)
    self.hidden1 = sigmoid(self.z1) 
    self.z2 = np.dot(self.w2,self.hidden1)
    self.hidden2 = sigmoid(self.z2)
    self.z3 = np.dot(self.w3,self.hidden2)
    y_hat = sigmoid(self.z3)
    return y_hat


  def backward(self, x, y, y_hat, lr = 0.0001):
    X = np.array(x, ndmin=2)
    y = np.array(y, ndmin=2).T
    batch_size = y.shape[1]

    
    # looking for dl_dw3
    dl_dy_hat = (2/batch_size)*(y_hat - y)
    dy_hat_dz3 =  y_hat * (1- y_hat) # this is excatly the gradient of the sigmoid as dsig_dx = sig(x)*(1-sig(x))
    dl_dw3 = np.dot(dl_dy_hat * dy_hat_dz3, self.hidden2.T) # hidden.T is dz_dw2
    
    # looking for dl_dw2
    dl_dh2 = np.dot(self.w3.T, dl_dy_hat)
    dh_dz2 = self.hidden2 * (1-self.hidden2) 
    dl_dz2 = dl_dh2 * dh_dz2
    dl_dw2 = np.dot(dl_dh2 * dh_dz2, self.hidden1.T) 
    
    # looking for dl_dw1
    dl_dh1 = np.dot(self.w2.T, dl_dz2)
    dh_dz1 = self.hidden1 * (1-self.hidden1) 
    dl_dz1 = dl_dh1 * dh_dz1

    # updating the weights accordingly
    self.w1 -= lr * np.dot(dl_dz1, x) # x is dz1_dw1
    self.w2 -= lr * dl_dw2
    self.w3 -= lr * dl_dw3

In [0]:
_w1 = pkl.load(open("/dbfs/FileStore/avi_maxim_models/w1_1labels.pkl",'rb'))
_w2 = pkl.load(open("/dbfs/FileStore/avi_maxim_models/w2_1labels.pkl",'rb'))
_w3 = pkl.load(open("/dbfs/FileStore/avi_maxim_models/w3_1labels.pkl",'rb'))

model = OurModel(input_size = 12, output_size = 1, hidden_size1 = 25, hidden_size2 = 25, w1=_w1, w2 = _w2, w3 = _w3)

In [0]:
get_hour_date = udf(lambda x: str(x)[5:13])

def distance_from_point(long, lat):
  return udf(lambda x: (((x[0] - long) ** 2) + ((x[1] - lat) ** 2)) ** 0.5, FloatType())

format_date = udf(lambda x: f"{str(x[1]).zfill(2)}-{str(x[2]).zfill(2)} {str(x[3]).zfill(2)}")

w = spark.read.option("inferSchema", True).option("header", True).csv("/FileStore/tables/relevant_weather_updated.csv")
w = w.withColumn("hourRounded", get_hour_date(col("date")))
temp = w.select("hourRounded", "rain", "temp", "wetb", "dewpt", "vappr", "rhum", "msl", "vis").groupby("hourRounded").agg({"rain": "avg", "temp":"avg", "wetb":"avg", "dewpt":"avg", "vappr":"avg", "rhum":"avg", "msl":"avg", "vis":"avg"}).withColumnRenamed("avg(temp)", "temp").withColumnRenamed("avg(msl)", "msl").withColumnRenamed("avg(vis)", "vis").withColumnRenamed("avg(rain)", "rain").withColumnRenamed("avg(vappr)", "vappr").withColumnRenamed("avg(rhum)", "rhum").withColumnRenamed("avg(dewpt)", "dewpt").withColumnRenamed("avg(wetb)", "wetb")
# display(temp)

In [0]:
widget_long = dbutils.widgets.text("Longitude", "-6.261582")
widget_lat = dbutils.widgets.text("Latitude", "53.352632")
widget_R = dbutils.widgets.text("Radius of Search", "0.015")
widget_year = dbutils.widgets.text("Year", "2022")
widget_month = dbutils.widgets.text("Month", "2")
widget_day = dbutils.widgets.text("Day", "20")
widget_hour = dbutils.widgets.text("Hour", "17")
widget_minute = dbutils.widgets.text("Minute", "30")
widget_duration = dbutils.widgets.text("Duration", "3")

In [0]:
lat = float(dbutils.widgets.get("Latitude"))
long = float(dbutils.widgets.get("Longitude"))
R = float(dbutils.widgets.get("Radius of Search"))

year = int(dbutils.widgets.get("Year"))
month = int(dbutils.widgets.get("Month"))
day = int(dbutils.widgets.get("Day"))

hour = int(dbutils.widgets.get("Hour"))
minute = int(dbutils.widgets.get("Minute"))
duration = int(dbutils.widgets.get("Duration"))

date_format = f"{year}-{str(month).zfill(2)}-{str(day).zfill(2)} {str(hour).zfill(2)}"

base_df = areas.withColumn("dist", distance_from_point(long, lat)(struct('longitude', 'latitude'))).filter(col("dist") <= R)
# display(base_df)

In [0]:
from pyspark.sql.functions import lit

predict_on = base_df.withColumn("currentHour", lit(hour)).withColumn("currentMinute", lit(minute))
for i in range(1, duration):
  temp_df = base_df.withColumn("currentHour", lit(hour + i)).withColumn("currentMinute", lit(minute))
  predict_on = predict_on.union(temp_df)
predict_on = predict_on.withColumn("currentYear", lit(year)).withColumn("currentMonth", lit(month)).withColumn("currentDay", lit(day)).withColumn("dateType", lit(0))
# display(predict_on)

In [0]:
test = predict_on.withColumn("hourRounded", format_date(struct("currentYear", "currentMonth", "currentDay", "currentHour"))).select("areaId", "currentHour","longitude", "latitude", "hourRounded")
#with_w = test.join(temp, ["hourRounded"]).drop(col("hourRounded")).withColumn("foo", lit(0))
#display(with_w)

X = test.toPandas()
temp_pd = temp.toPandas()
X = X.merge(right=temp_pd, on='hourRounded', how='left').reset_index().drop(['hourRounded','index'],axis=1)
# X.head(5)

In [0]:
X = np.array(X)
# print(X)

In [0]:
y_hat = np.array([1-p for p in model.forward(X)])

In [0]:
import pandas as pd
from pyspark.sql.functions import monotonically_increasing_id

y_hat_df = pd.DataFrame(y_hat.T, columns=["prob"])
df_prob = spark.createDataFrame(y_hat_df, ['prob']).withColumn("id", monotonically_increasing_id())
# display(df_prob)

In [0]:
predict_on = predict_on.withColumn("id", monotonically_increasing_id())
predictions = predict_on.join(df_prob, ["id"])
# display(predictions)

In [0]:
sum_distances = base_df.select(col("dist")).groupby().sum().collect()[0][0]
predictions = predictions.withColumn("dist_norm", col("dist") / lit(sum_distances))
# display(predictions)

In [0]:
# get_proba = udf(lambda x: x.toArray().tolist()[0])

# test_df = predictions.withColumn("prob", get_proba(col('probability'))).withColumn("score", col('dist_norm') * col('prob'))
# display(test_df)

In [0]:
from pyspark.sql.functions import desc

test_df = predictions.withColumn("score", col("dist_norm") * col("prob"))
results = test_df.groupby("areaId").agg({"longitude": "avg", "latitude": "avg", "score": "avg"}).withColumnRenamed("avg(score)", "score").withColumnRenamed("avg(longitude)", "longitude").withColumnRenamed("avg(latitude)", "latitude").orderBy(desc('score'))
display(results)

areaId,score,latitude,longitude
71652,0.0106766796379402,53.34540097888724,-6.274651262514123
71738,0.0105503601652104,53.362470206132535,-6.272584510220925
18272,0.010503862048701,53.34412374152727,-6.249560573881511
73077,0.0104978639684568,53.33926927333578,-6.255489336061201
71751,0.010418707934467,53.364300077914535,-6.270371679863197
71666,0.0102312242175552,53.353307104464335,-6.275879200595242
71665,0.0102117804666856,53.35307558529412,-6.275861040808822
71673,0.0101831324427269,53.34309926905112,-6.272211593720908
73080,0.0101478531525224,53.34182925790935,-6.25237106582915
17915,0.009749834439501,53.34889738817341,-6.274700446114413


In [0]:
import gmplot
import numpy as np

longs = np.array(results.select("longitude").collect())
lats = np.array(results.select("latitude").collect())
w = np.array(results.select("score").collect())
print(len(lats))
print(len(longs))
print(len(w))

In [0]:
gmp = gmplot.GoogleMapPlotter(lat, long, 13, apikey="AIzaSyCEpQ3CCm5b1W_nlBhduL551KefdD9z1fI")
gmp.heatmap(lats, longs, weights=w * 250)
gmp.draw("/dbfs/FileStore/tables/output_map.html")

In [0]:
html_string = gmp.get()
width = 8000
height = 4500
h = f"<iframe srcdoc='{html_string}' width={width} height={height} ></iframe>"
displayHTML(h)