# RISE Camp Code Snippet

In [1]:
import modin.pandas as pd
import nums
from nums import numpy as nps

from nums.experimental.nums_modin import from_modin

import warnings
warnings.filterwarnings("ignore");

nums.init()

Automatically increasing RLIMIT_NOFILE to max value of 9223372036854775807


2021-10-29 09:21:24,563	INFO services.py:1265 -- View the Ray dashboard at [1m[32mhttp://127.0.0.1:8265[39m[22m


Using driver node ip as head node.
head node 192.168.17.182
total cpus 6.0
device_grid (0, 0) 0=node:192.168.17.182/cpu:1


In [2]:
def rmse(actual, expected):
    import numpy as np
    if type(actual) == type(nps.array([])):
        return nps.sqrt(nps.mean((expected - actual) ** 2)).get()
    elif type(actual) == type(np.array([])):
        return np.sqrt(np.mean((expected - actual) ** 2))    

Data being used is dataset precleaned from NOAA GHCN-D dataset, where it contains worldwide logs of daily climatology data. The dataset has been precleaned with modin and the one below contains data from the year 2020.

The features of the data set are:

Day of the year, temperature maximum, temperature minimum, latitude, longitude, elevation, temperature average, temperature range. The value we are predicting is precipitation, which is measured in tenths of mm. Precipitation is not only limited to rain, but also snow, hail, and sleet. 

In [3]:
X_data = pd.read_csv("data/X_2020.csv.gz", compression='gzip')
y_data = pd.read_csv("data/y_2020.csv.gz", compression='gzip')
pd.concat([X_data, y_data], axis=1)

Unnamed: 0,YEAR/MONTH/DAY,TMAX,TMIN,LATITUDE,LONGITUDE,ELEVATION,TAVG,TRANGE,PRCP
63,71,26.9,12.0,25.3330,55.517,34.0,19.45,14.9,0.0
64,72,27.3,12.6,25.3330,55.517,34.0,19.95,14.7,0.0
79,87,31.5,17.5,25.3330,55.517,34.0,24.50,14.0,0.0
80,88,34.5,20.1,25.3330,55.517,34.0,27.30,14.4,0.0
83,91,28.0,18.5,25.3330,55.517,34.0,23.25,9.5,0.0
...,...,...,...,...,...,...,...,...,...
11646292,75,29.8,23.9,19.2833,166.650,4.3,26.85,5.9,0.0
11646408,100,25.8,15.0,-17.9170,31.133,1480.0,20.40,10.8,0.0
11646480,16,24.7,19.0,-20.0670,30.867,1095.0,21.85,5.7,10.0
11646489,67,22.8,14.5,-20.2000,32.616,1132.0,18.65,8.3,10.0


In [4]:
X = from_modin(X_data).astype(float)
y = from_modin(y_data).astype(float).reshape(-1)

## Shuffling and Splitting Dataset

In [5]:
shuffle = nps.random.permutation(X.shape[0])
X = X[shuffle]
y = y[shuffle]
split = int(X.shape[0] * 0.8)
X_train = X[:split]
X_test = X[split:]
y_train = y[:split]
y_test = y[split:]

## Linear Regression with NumS (Newtons' Method)

In [6]:
from nums.models.glms import LinearRegression

model = LinearRegression(solver='newton', lr=0.001, max_iter=100000, tol=0.001)

In [7]:
%%time
model.fit(X_train, y_train)

CPU times: user 806 ms, sys: 399 ms, total: 1.2 s
Wall time: 2.56 s


In [8]:
%%time
training_results = model.predict(X_train)

CPU times: user 9.11 ms, sys: 1.89 ms, total: 11 ms
Wall time: 8.59 ms


In [9]:
%%time
test_results = model.predict(X_test)

CPU times: user 7.97 ms, sys: 1.91 ms, total: 9.88 ms
Wall time: 6.25 ms


In [10]:
print("Training RMSE:", rmse(training_results, y_train))
print("Testing RMSE", rmse(test_results, y_test))

Training RMSE: 76.30091285924962
Testing RMSE 75.22628540243723


## Sklearn Linear Regression

In [23]:
X_train_np = X_train.get()
X_test_np = X_test.get()
y_train_np = y_train.get()
y_test_np = y_test.get()

In [24]:
from sklearn.linear_model import Ridge
model = Ridge(solver='lbfgs')

In [25]:
from sklearn.linear_model import SGDRegressor

model = SGDRegressor(penalty='l2', max_iter=100000, tol=0.001)

#from sklearn.linear_model import LinearRegression
#model = LinearRegression()

In [26]:
%%time
model.fit(X_train_np, y_train_np)

CPU times: user 6min 8s, sys: 3.75 s, total: 6min 12s
Wall time: 6min 5s


SGDRegressor(max_iter=100000)

In [27]:
%%time
training_results = model.predict(X_train_np)

CPU times: user 144 ms, sys: 28.2 ms, total: 172 ms
Wall time: 37.5 ms


In [28]:
%%time
test_results = model.predict(X_test_np)

CPU times: user 91.4 ms, sys: 41.7 ms, total: 133 ms
Wall time: 17.1 ms


In [29]:
print("Training RMSE:", rmse(training_results, y_train_np))
print("Testing RMSE", rmse(test_results, y_test_np))

Training RMSE: 16890144442524.832
Testing RMSE 16902401665512.902


## Logistic Regression with NumS

This time, we will fit and predict the same dataset with logistic regression, but with `y` being binary classification labels (0, 1) to wheter it has precipitated or not.

In [30]:
X = from_modin(pd.read_csv("data/X_2020.csv.gz", compression='gzip')).astype(float)
y = from_modin(pd.read_csv("data/y_bin_2020.csv.gz", compression='gzip')).astype(float).reshape(-1)

In [31]:
shuffle = nps.random.permutation(X.shape[0])
X = X[shuffle]
y = y[shuffle]
split = int(X.shape[0] * 0.8)
X_train = X[:split]
X_test = X[split:]
y_train = y[:split]
y_test = y[split:]

In [32]:
from nums.models.glms import LogisticRegression

model = LogisticRegression(solver="newton", penalty="l2", tol=1e-4)

In [33]:
%%time
model.fit(X_train, y_train)

CPU times: user 1.87 s, sys: 935 ms, total: 2.8 s
Wall time: 3.23 s


In [34]:
%%time
training_results = model.predict(X_train)

CPU times: user 27.3 ms, sys: 11.8 ms, total: 39.1 ms
Wall time: 24.6 ms


In [35]:
%%time
test_results = model.predict(X_test)

CPU times: user 15.2 ms, sys: 8.02 ms, total: 23.2 ms
Wall time: 12.7 ms


In [36]:
print("Training accuracy: ", nps.sum(training_results == y_train).get() / y_train.shape[0])
print("Testing accuracy:  ", nps.sum(test_results == y_test).get() / y_test.shape[0])

Training accuracy:  0.7213672353548561
Testing accuracy:   0.7216863143579675


In [37]:
split = int(X.shape[0] * 0.8)
X_train_np = X_train.get()
X_test_np = X_test.get()
y_train_np = y_train.get()
y_test_np = y_test.get()

## Comparison with Sklearn's Logistic Regression

In [38]:
from sklearn.linear_model import LogisticRegression

In [39]:
model = LogisticRegression()

In [40]:
%%time
model.fit(X_train_np, y_train_np)

CPU times: user 3min 16s, sys: 37.8 s, total: 3min 54s
Wall time: 1min 11s


LogisticRegression()

In [41]:
%%time
training_results = model.predict(X_train_np)

CPU times: user 833 ms, sys: 172 ms, total: 1.01 s
Wall time: 135 ms


In [42]:
%%time
test_results = model.predict(X_test_np)

CPU times: user 232 ms, sys: 57.8 ms, total: 289 ms
Wall time: 36.8 ms


In [43]:
print("Training accuracy: ", model.score(X_train_np, y_train_np))
print("Testing accuracy:  ", model.score(X_test_np, y_test_np))

Training accuracy:  0.7123336232737753
Testing accuracy:   0.7125121285975139
