# This notebook contains a demonstration of regression models using tensorflow and dataset
The models are implemented as methods of the class MyBatch in linear.py

In [1]:
import sys
import seaborn

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

sys.path.append("..")
from dataset import Dataset
import linear as lr

In [2]:
%env CUDA_VISIBLE_DEVICES=[]

env: CUDA_VISIBLE_DEVICES=[]


# Linear Regression

![](http://66.147.244.197/~globerov/introspectivemode/wp-content/uploads/2012/08/regression-265x300.jpeg)

Let's consiider a linear regression problem where $y \in \mathbb{R}$ and the relationship can be modeled as
$$y(x) = \langle w, x\rangle$$ where $x \in \mathbb{R}^{d+1}$ - vector consisting of d independent variables concatenated to a vector of ones. To find the solution  $w \in \mathbb{R} ^{d+1}$ we minimize the average sum of squared residuals. In case of $l_2$ regularization the minimized functional looks like:

$$ \frac{1}{N}\sum_{i=1}^N (\langle w, x_i \rangle - y_i) ^ 2 + \dfrac{C}{2}\lVert w \rVert^2  \to \min_w$$
We find the solution using stochastic gradient descent.

In [3]:
BATCH_SIZE = 100
TRAINING_EPOCHS = 500
DATA_SIZE = 500

In [39]:
# lr.load_linear_data returns np.array of feautures and linearly dependent target with error from normal distribution
data = lr.load_linear_data(DATA_SIZE)

# create dataset object
my_dataset = Dataset(index=np.arange(data[0].shape[0]), batch_class=lr.MyBatch)

# create train and test indices
my_dataset.cv_split()

In [40]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
cost_history = []

In [41]:
# create and run train pipeline which loads, preprocesses and trains linear regression model
my_dataset.train.pipeline() \
    .load(data) \
    .preprocess_linear_data() \
    .train_linear(sess, cost_history) \
    .run(BATCH_SIZE, shuffle=True, n_epochs=TRAINING_EPOCHS)

<dataset.dataset.pipeline.Pipeline at 0x1a7002bb518>

In [42]:
# create empty lists to store labels, features, predictions and error
y_true, y_pred, mse, x_features = [], [], [], []

In [43]:
# create and run test pipeline which loads, preprocesses and tests the trained model
test_batch = my_dataset.test.pipeline() \
    .load(data) \
    .preprocess_linear_data() \
    .test_linear(sess, y_true, y_pred, mse, x_features) \
    .run(len(my_dataset.test.indices))

In [44]:
variance = np.var(y_pred, ddof=1) / np.var(y_true, ddof=1)
print('Variance ratio: %.2f' % variance)

Variance ratio: 0.78


In [45]:
mean = np.mean(np.abs(np.array(y_pred) - np.array(y_true)))
interval = 3*np.std(np.abs(np.array(y_pred) - np.array(y_true)))
print('MSE is distibuted in the interval: {0:.2f} $ ± {1:.2f} $'\
        .format(mean, interval))

MSE is distibuted in the interval: 0.11 $ ± 0.28 $


In [46]:
absolute_error_ratio = np.mean(np.abs(np.array(y_pred) - np.array(y_true))/np.array(y_true))*100
print('MAE with respect to data\'s mean is: {0:.2f}%'\
        .format(absolute_error_ratio))

MAE with respect to data's mean is: 2.92%


# Logistic Regression

Logistic regression is used for binary classification problem. In case of $y \in \{-1, 1\}$ the model looks like $y = sign \langle w, x\rangle$ and the minimized functional is:
$$ \dfrac{1}{N}\sum_{i=1}^N \log(1 + \exp(-\langle w, x_i \rangle y_i)) + \dfrac{C}{2}\lVert w \rVert^2  \to \min_w$$


In [20]:
# create random data for classification
data = lr.load_random_data(blobs=False)

# create dataset object
my_dataset = Dataset(index=np.arange(data[0].shape[0]), batch_class=lr.MyBatch)

# create train and test indices
my_dataset.cv_split()

In [21]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
cost_history = []
acc_history = []

In [22]:
# create and run train pipeline which loads, preprocesses and trains logistic regression model
my_dataset.train.pipeline() \
    .load(data) \
    .preprocess_binary_data() \
    .train_logistic(sess, cost_history, acc_history) \
    .run(BATCH_SIZE, shuffle=True, n_epochs=TRAINING_EPOCHS)

<dataset.dataset.pipeline.Pipeline at 0x1a7002bb828>

In [25]:
accuracy = []

In [26]:
# create and run test pipeline which loads, preprocesses and tests the trained model
test_batch = my_dataset.test.pipeline() \
    .load(data) \
    .preprocess_binary_data() \
    .test_logistic(sess, accuracy) \
    .run(len(my_dataset.test.indices))

In [27]:
print("ACCURACY: %.0f%%" % (100.0 * accuracy[0]))

ACCURACY: 95%


# Poisson Regression

Poisson regression is used to model count data. It assumes the target variable Y has a Poisson distribution. The model takes the form: $$\log \operatorname {E} (\mathrm{Y}\mid x )=\langle w, x \rangle \,$$ 
and the minimized functional looks like:
$$ \sum_{i=1}^N y_i \langle w, x_i \rangle - \exp{\langle w, x_i \rangle} + \dfrac{C}{2}\lVert w \rVert^2 \to \min_w$$

In [47]:
# lr.load_poisson_data generates sample from poisson distribution and returns a tuple of weights, features and labels
data = lr.load_poisson_data()

# create dataset object
my_dataset = Dataset(index=np.arange(data[1].shape[0]), batch_class=lr.MyBatch)

# create train and test indices
my_dataset.cv_split()

In [48]:
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
cost_history = []

In [49]:
# create and run train pipeline which loads, preprocesses and trains poisson regression model
my_dataset.train.pipeline() \
    .load(data[1:]) \
    .train_poisson(sess, cost_history) \
    .run(BATCH_SIZE, shuffle=True, n_epochs=TRAINING_EPOCHS)

<dataset.dataset.pipeline.Pipeline at 0x1a700371470>

In [50]:
logit, y_true, weights = [], [], []

In [51]:
test_batch = my_dataset.test.pipeline() \
    .load(data[1:]) \
    .test_poisson(sess, y_true, logit, weights) \
    .run(len(my_dataset.test.indices))

In [52]:
variance_ratio = np.var(logit, ddof=1) / np.var(np.dot(data[1], data[0]), ddof=1)
print('Variance ratio: %.2f' % (variance_ratio))

Variance ratio: 0.86


In [53]:
abs_error_ratio = np.mean(np.abs(np.exp(logit) - y_true))/np.mean(y_true)*100 
print('MAE of the model with respect to data\'s mean is: {0:.2f}%'\
        .format(abs_error_ratio))

MAE of the model with respect to data's mean is: 15.54%
