# RISE Camp Code Snippet

In [1]:
import modin.pandas as pd
import nums
from nums import numpy as nps

from nums.experimental.nums_modin import from_modin

import warnings
# Suppress every Python warning for the rest of the session to keep cell output compact.
warnings.filterwarnings("ignore")

# Initialize NumS; the recorded output below shows it starting Ray and
# reporting the head node, CPU count and device grid.
nums.init()

2021-10-29 20:09:28,754	INFO services.py:1265 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Using driver node ip as head node.
head node 10.0.0.4
total cpus 30.0
device_grid (0, 0) 0=node:10.0.0.4/cpu:1


In [2]:
def rmse(actual, expected):
    """Return the root-mean-square error between predictions and targets.

    Evaluates ``sqrt(mean((expected - actual) ** 2))`` with the backend that
    matches ``actual``:

    * NumPy ``ndarray`` (including subclasses): computed with NumPy.
    * NumS array (an instance of the type returned by ``nps.array([])``):
      computed with NumS, then fetched with ``.get()``.

    Args:
        actual: Predicted values, as a NumPy or NumS array.
        expected: Ground-truth values; must support ``expected - actual``.

    Returns:
        The RMSE: the result of ``np.sqrt`` for NumPy input, or the value
        returned by ``.get()`` for NumS input.

    Raises:
        TypeError: If ``actual`` is neither a NumPy nor a NumS array. The
            previous implementation silently returned ``None`` here, which
            printed a misleading "RMSE: None".
    """
    import numpy as np

    # NumPy is checked first so this path never needs to build a NumS array.
    if isinstance(actual, np.ndarray):
        return np.sqrt(np.mean((expected - actual) ** 2))

    # Derive the NumS array type the same way the original code did.
    if isinstance(actual, type(nps.array([]))):
        return nps.sqrt(nps.mean((expected - actual) ** 2)).get()

    raise TypeError(
        "rmse() expects a NumPy or NumS array for 'actual', got "
        + type(actual).__name__
    )

The data used here is a subset of the NOAA GHCN-D dataset, which contains worldwide logs of daily climatology data, pre-cleaned using Modin. To manually pre-clean the dataset from scratch, see `datacleaning.ipynb`. This notebook uses only data collected in 2020.

The features of the dataset are:
* day of the year
* temperature maximum
* temperature minimum
* latitude
* longitude
* elevation
* temperature average
* temperature range

The value we are predicting is precipitation, measured in tenths of a millimeter. Note that precipitation is not limited to rain; it also includes snow, hail, and sleet.

In [3]:
# Read the 2020 feature and target tables (gzip-compressed CSVs) through the
# Modin pandas API, features first and targets second.
X_data, y_data = (
    pd.read_csv(csv_path, compression='gzip')
    for csv_path in ("data/X_2020.csv.gz", "data/y_2020.csv.gz")
)
# Last expression of the cell: display features and target side by side.
pd.concat([X_data, y_data], axis=1)

Unnamed: 0,YEAR/MONTH/DAY,TMAX,TMIN,LATITUDE,LONGITUDE,ELEVATION,TAVG,TRANGE,PRCP
63,71,26.9,12.0,25.3330,55.517,34.0,19.45,14.9,0.0
64,72,27.3,12.6,25.3330,55.517,34.0,19.95,14.7,0.0
79,87,31.5,17.5,25.3330,55.517,34.0,24.50,14.0,0.0
80,88,34.5,20.1,25.3330,55.517,34.0,27.30,14.4,0.0
83,91,28.0,18.5,25.3330,55.517,34.0,23.25,9.5,0.0
...,...,...,...,...,...,...,...,...,...
11646292,75,29.8,23.9,19.2833,166.650,4.3,26.85,5.9,0.0
11646408,100,25.8,15.0,-17.9170,31.133,1480.0,20.40,10.8,0.0
11646480,16,24.7,19.0,-20.0670,30.867,1095.0,21.85,5.7,10.0
11646489,67,22.8,14.5,-20.2000,32.616,1132.0,18.65,8.3,10.0


In [4]:
# Convert both Modin frames into float-typed NumS arrays.
X = from_modin(X_data).astype(float)
y = from_modin(y_data).astype(float)
# Collapse the target to one dimension.
y = y.reshape(-1)

## Shuffling and Splitting Dataset

In [5]:
# Draw one random row order and apply it to features and targets alike so
# each row stays paired with its label.
# NOTE(review): no seed is set, so the split differs from run to run.
shuffle = nps.random.permutation(X.shape[0])
X, y = X[shuffle], y[shuffle]

# First 80% of the shuffled rows train the model; the rest are held out.
split = int(X.shape[0] * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

## Linear Regression with NumS (Newton's Method)

In [17]:
from nums.models.glms import LinearRegression

# NumS linear regression solved with Newton's method.
# NOTE(review): max_iter=1 presumably relies on Newton's method solving the
# quadratic least-squares objective in a single step; confirm against the
# NumS solver.
model = LinearRegression(
    solver="newton",
    lr=0.001,
    max_iter=1,
    tol=0.001,
)

In [18]:
%%time
# Fit the NumS linear model on the training split (timed by the %%time magic above).
model.fit(X_train, y_train)

CPU times: user 389 ms, sys: 131 ms, total: 520 ms
Wall time: 212 ms


In [19]:
%%time
# Predict on the training rows so in-sample error can be measured.
training_results = model.predict(X_train)

CPU times: user 24.9 ms, sys: 17.4 ms, total: 42.3 ms
Wall time: 15.3 ms


In [20]:
%%time
# Predict on the held-out rows (the last 20% of the shuffled data).
test_results = model.predict(X_test)

CPU times: user 7.44 ms, sys: 2.09 ms, total: 9.53 ms
Wall time: 5.24 ms


In [21]:
# Report the error of the NumS model; rmse() picks its backend from the
# type of its first argument (NumS arrays here).
print("Training RMSE:", rmse(training_results, y_train))
print("Testing RMSE", rmse(test_results, y_test))

Training RMSE: 76.22001482376453
Testing RMSE 75.55351760757677


## Sklearn Linear Regression

In [22]:
# Fetch the NumS splits with .get() so scikit-learn is given the same rows
# the NumS model used (features first, then targets).
X_train_np, X_test_np = X_train.get(), X_test.get()
y_train_np, y_test_np = y_train.get(), y_test.get()

In [23]:
# NOTE: this import rebinds the name LinearRegression, which an earlier cell
# imported from nums.models.glms. From here on the name refers to
# scikit-learn's class, and `model` is replaced by a scikit-learn estimator.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [24]:
%%time
# Fit scikit-learn's linear regression on the fetched copies of the training split.
model.fit(X_train_np, y_train_np)

CPU times: user 766 ms, sys: 71.3 ms, total: 837 ms
Wall time: 816 ms


LinearRegression()

In [25]:
%%time
# In-sample predictions; this rebinds training_results, replacing the NumS predictions.
training_results = model.predict(X_train_np)

CPU times: user 526 ms, sys: 377 ms, total: 902 ms
Wall time: 44.2 ms


In [26]:
%%time
# Held-out predictions; this rebinds test_results, replacing the NumS predictions.
test_results = model.predict(X_test_np)

CPU times: user 386 ms, sys: 679 ms, total: 1.07 s
Wall time: 18.7 ms


In [27]:
# Same metric as for the NumS model, computed on the scikit-learn
# predictions and the fetched targets.
print("Training RMSE:", rmse(training_results, y_train_np))
print("Testing RMSE", rmse(test_results, y_test_np))

Training RMSE: 76.22001482376456
Testing RMSE 75.55351760757677


## Logistic Regression with NumS

This time, we fit and predict on the same dataset with logistic regression, with `y` now being binary classification labels (0 or 1) indicating whether or not there was any precipitation.

In [28]:
# Reload the 2020 features and convert them to a float NumS array.
X = pd.read_csv("data/X_2020.csv.gz", compression='gzip')
X = from_modin(X).astype(float)

# Load the binary-label file and flatten it to one dimension.
y = pd.read_csv("data/y_bin_2020.csv.gz", compression='gzip')
y = from_modin(y).astype(float).reshape(-1)

In [29]:
# Shuffle features and labels with one shared permutation so rows stay aligned.
# NOTE(review): no seed is set, so the split differs from run to run.
shuffle = nps.random.permutation(X.shape[0])
X, y = X[shuffle], y[shuffle]

# 80/20 train/test split over the shuffled rows.
split = int(X.shape[0] * 0.8)
X_train, X_test = X[:split], X[split:]
y_train, y_test = y[:split], y[split:]

In [30]:
from nums.models.glms import LogisticRegression

# NumS logistic regression: Newton solver with an L2 penalty.
model = LogisticRegression(
    solver="newton",
    penalty="l2",
    tol=1e-4,
)

In [31]:
%%time
# Fit the NumS logistic model on the training split (timed by the %%time magic above).
model.fit(X_train, y_train)

CPU times: user 5.23 s, sys: 1.95 s, total: 7.17 s
Wall time: 4.86 s


In [32]:
%%time
# Predictions for the training rows; compared with y_train in the accuracy cell.
training_results = model.predict(X_train)

CPU times: user 149 ms, sys: 56.4 ms, total: 206 ms
Wall time: 72.4 ms


In [33]:
%%time
# Predictions for the held-out rows; compared with y_test in the accuracy cell.
test_results = model.predict(X_test)

CPU times: user 38.3 ms, sys: 9.49 ms, total: 47.8 ms
Wall time: 17.8 ms


In [34]:
# Accuracy = number of predictions equal to the label, divided by the number
# of samples. The count is computed with NumS and fetched with .get().
print("Training accuracy: ", nps.sum(training_results == y_train).get() / y_train.shape[0])
print("Testing accuracy:  ", nps.sum(test_results == y_test).get() / y_test.shape[0])

Training accuracy:  0.7214000876315381
Testing accuracy:   0.7214372483554767


In [35]:
# Fetch the logistic-regression splits with .get() for the scikit-learn
# comparison; this rebinds the *_np names from the linear-regression section.
X_train_np, X_test_np = X_train.get(), X_test.get()
y_train_np, y_test_np = y_train.get(), y_test.get()

## Comparison with Sklearn's Logistic Regression

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
# LogisticRegression resolves to whichever import ran last; in top-to-bottom
# order that is the sklearn.linear_model import in the preceding cell, not
# the NumS class imported earlier under the same name.
model = LogisticRegression()

In [38]:
%%time
# Fit the default-configured scikit-learn model on the fetched training split.
model.fit(X_train_np, y_train_np)

CPU times: user 8min 9s, sys: 8min 17s, total: 16min 27s
Wall time: 1min 44s


LogisticRegression()

In [39]:
%%time
# Timed in-sample prediction; rebinds training_results (the accuracy cell below uses model.score instead).
training_results = model.predict(X_train_np)

CPU times: user 3.18 s, sys: 4.03 s, total: 7.22 s
Wall time: 155 ms


In [40]:
%%time
# Timed held-out prediction; rebinds test_results (the accuracy cell below uses model.score instead).
test_results = model.predict(X_test_np)

CPU times: user 928 ms, sys: 1.47 s, total: 2.4 s
Wall time: 43.2 ms


In [41]:
# Uses the estimator's own score() rather than the manual match count used
# for the NumS model; score() re-predicts internally from the inputs.
print("Training accuracy: ", model.score(X_train_np, y_train_np))
print("Testing accuracy:  ", model.score(X_test_np, y_test_np))

Training accuracy:  0.7125807793553237
Testing accuracy:   0.7124066957498338
