In [54]:
from __future__ import print_function
from builtins import range

import numpy as np
from sklearn.base import BaseEstimator, clone
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
# from scipy.spatial.distance import cdist
from sklearn.metrics.pairwise import pairwise_distances as cdist
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from clr import best_clr


class CLRcRegressor(BaseEstimator):
  """Regressor that routes samples to per-cluster models via a constraint column.

  ``fit`` converts the values found in column ``constr_id`` of ``X`` into
  integer group ids, hands them to ``best_clr`` as ``constr`` and records
  which label every constraint value ended up with. ``predict`` looks that
  label up for each test sample and applies the matching entry of ``models_``.

  Parameters
  ----------
  num_planes : int
    Forwarded to ``best_clr`` as ``k``; also the number of models that
    ``predict`` iterates over.
  kmeans_coef : float
    Forwarded to ``best_clr`` as ``kmeans_X``.
  constr_id : int
    Index of the column of ``X`` that holds the constraint value.
  num_tries : int, default=1
    Forwarded to ``best_clr`` as ``num_tries``.
  clr_lr : estimator or None, default=None
    Forwarded to ``best_clr`` as ``lr``.
  max_iter : int, default=5
    Forwarded to ``best_clr`` as ``max_iter``.
  """

  def __init__(self, num_planes, kmeans_coef, constr_id,
               num_tries=1, clr_lr=None, max_iter=5):
    self.num_planes = num_planes
    self.kmeans_coef = kmeans_coef
    self.num_tries = num_tries
    self.constr_id = constr_id
    self.clr_lr = clr_lr
    self.max_iter = max_iter

  def fit(self, X, y, init_labels=None,
          seed=None, verbose=False):
    """Fit the cluster models and remember the label of every constraint value.

    ``init_labels`` and ``verbose`` are accepted for signature parity with the
    other regressors in this file but are not used here. When ``seed`` is
    given, NumPy's global RNG is re-seeded before fitting.

    Returns
    -------
    self
    """
    if seed is not None:
      np.random.seed(seed)

    constr_values = X[:, self.constr_id]
    # Map every distinct constraint value to its rank among the sorted unique
    # values; the builtin ``int`` replaces the ``np.int`` alias that was
    # removed from NumPy.
    _, inverse = np.unique(constr_values, return_inverse=True)
    constr = np.asarray(inverse, dtype=int).reshape(-1)

    self.labels_, self.models_, _, _ = best_clr(
      X, y, k=self.num_planes, kmeans_X=self.kmeans_coef,
      constr=constr, max_iter=self.max_iter, num_tries=self.num_tries,
      lr=self.clr_lr,
    )
    # Later rows overwrite earlier ones, exactly like a row-by-row loop would.
    self.constr_to_label = dict(zip(constr_values, self.labels_))
    return self

  def init_fit(self, labels, models, constr_to_label):
    """Install an externally computed fit instead of calling ``fit``.

    Returns
    -------
    self
    """
    self.labels_ = labels
    self.models_ = models
    self.constr_to_label = constr_to_label
    return self

  def predict(self, X, test_constr=None):
    """Predict with the model assigned to each sample's constraint value.

    Parameters
    ----------
    X : array of shape (n_samples, n_features)
    test_constr : sequence or None
      Constraint value per row of ``X``; defaults to column ``constr_id``.

    Raises
    ------
    KeyError
      If a constraint value is not a key of ``constr_to_label``.
    """
    check_is_fitted(self, ['labels_', 'models_'])

    if test_constr is None:
      test_constr = X[:, self.constr_id]

    test_labels = np.array(
      [self.constr_to_label[value] for value in test_constr], dtype=int)

    preds = np.empty(X.shape[0])
    for cl_idx in range(self.num_planes):
      members = test_labels == cl_idx
      if not members.any():
        continue
      preds[members] = self.models_[cl_idx].predict(X[members])
    return preds


class FuzzyCLRRegressor(BaseEstimator):
  """Regressor that blends all cluster models using weights from ``best_clr``.

  ``fit`` calls ``best_clr`` with ``fuzzy=True`` and keeps the returned labels,
  models and weights. ``predict`` evaluates every model on every sample and
  returns the sum of the predictions multiplied by ``weights_``.

  Parameters
  ----------
  num_planes : int
    Forwarded to ``best_clr`` as ``k``; number of models used in ``predict``.
  kmeans_coef : float
    Forwarded to ``best_clr`` as ``kmeans_X``.
  clr_lr : estimator or None, default=None
    Forwarded to ``best_clr`` as ``lr``.
  num_tries : int, default=1
    Forwarded to ``best_clr`` as ``num_tries``.
  """

  def __init__(self, num_planes, kmeans_coef,
               clr_lr=None, num_tries=1):
    self.num_planes = num_planes
    self.kmeans_coef = kmeans_coef
    self.num_tries = num_tries
    self.clr_lr = clr_lr

  def fit(self, X, y, init_labels=None, max_iter=20,
          seed=None, verbose=False):
    """Fit via ``best_clr`` and store ``labels_``, ``models_`` and ``weights_``.

    ``init_labels`` and ``verbose`` are accepted but not used here. When
    ``seed`` is given, NumPy's global RNG is re-seeded before fitting.

    Returns
    -------
    self
    """
    if seed is not None:
      np.random.seed(seed)
    self.labels_, self.models_, self.weights_, _ = best_clr(
      X, y, k=self.num_planes, kmeans_X=self.kmeans_coef,
      max_iter=max_iter, num_tries=self.num_tries,
      lr=self.clr_lr, fuzzy=True
    )
    self.X_ = X
    return self

  def predict(self, X):
    """Return the ``weights_``-weighted sum of every model's prediction."""
    check_is_fitted(self, ['labels_', 'models_', 'weights_'])

    # One column of predictions per model.
    per_model = np.empty((X.shape[0], self.num_planes))
    for cl_idx in range(self.num_planes):
      per_model[:, cl_idx] = self.models_[cl_idx].predict(X)
    # ``weights_`` is broadcast against the (n_samples, num_planes) matrix.
    return np.sum(per_model * self.weights_, axis=1)


class CLRpRegressor(BaseEstimator):
  """Cluster models from ``best_clr`` plus a classifier that predicts the cluster.

  ``fit`` obtains labels and models from ``best_clr`` and then trains ``clf``
  to predict the label from ``X``. ``predict`` either applies the model of the
  predicted label (``weighted`` falsy) or mixes all models with the
  classifier's ``predict_proba`` output (``weighted`` truthy).

  Parameters
  ----------
  num_planes : int
    Forwarded to ``best_clr`` as ``k``; number of models used in ``predict``.
  kmeans_coef : float
    Forwarded to ``best_clr`` as ``kmeans_X``.
  clr_lr : estimator or None, default=None
    Forwarded to ``best_clr`` as ``lr``.
  max_iter : int, default=5
    Forwarded to ``best_clr`` as ``max_iter``.
  num_tries : int, default=1
    Forwarded to ``best_clr`` as ``num_tries``.
  clf : classifier or None, default=None
    Label classifier; ``RandomForestClassifier(n_estimators=20)`` when None.
  weighted : bool, default=False
    Only its truthiness is inspected by ``predict``.
  fuzzy : bool, default=False
    Forwarded to ``best_clr`` as ``fuzzy``.
  """

  def __init__(self, num_planes, kmeans_coef, clr_lr=None, max_iter=5,
               num_tries=1, clf=None, weighted=False, fuzzy=False):
    self.num_planes = num_planes
    self.kmeans_coef = kmeans_coef
    self.num_tries = num_tries
    self.weighted = weighted
    self.clr_lr = clr_lr
    self.fuzzy = fuzzy
    self.max_iter = max_iter

    if clf is None:
      self.clf = RandomForestClassifier(n_estimators=20)
    else:
      self.clf = clf

  def fit(self, X, y, init_labels=None,
          seed=None, verbose=False):
    """Fit the cluster models, then train ``clf`` on the resulting labels.

    ``init_labels`` is accepted but not used here. When ``seed`` is given,
    NumPy's global RNG is re-seeded before fitting. With ``verbose`` the
    3-fold cross-validated score of ``clf`` on the labels is printed.

    Returns
    -------
    self
    """
    if seed is not None:
      np.random.seed(seed)
    self.labels_, self.models_, _, _ = best_clr(
      X, y, k=self.num_planes, kmeans_X=self.kmeans_coef,
      max_iter=self.max_iter, num_tries=self.num_tries,
      lr=self.clr_lr, fuzzy=self.fuzzy
    )
    self.X_ = X
    if verbose:
      # Use the per-fold scores so the reported spread is meaningful;
      # ``get_label_score_`` already averages them, which made std always 0.
      fold_scores = cross_val_score(self.clf, self.X_, self.labels_, cv=3)
      print("Label prediction: {:.6f} +- {:.6f}".format(
        fold_scores.mean(), fold_scores.std()))
    if np.unique(self.labels_).shape[0] == 1:
      # Only one label present: flip the first sample's label so the
      # classifier is trained on two classes.
      self.labels_[0] = 1 if self.labels_[0] == 0 else 0
    self.clf.fit(X, self.labels_)
    return self

  def init_fit(self, X, labels, models):
    """Install externally computed labels/models and train ``clf`` on them.

    Returns
    -------
    self
    """
    self.labels_ = labels
    self.models_ = models
    self.X_ = X
    self.clf.fit(X, self.labels_)
    return self

  def get_label_score_(self):
    """Mean 3-fold cross-validation score of ``clf`` on ``(X_, labels_)``."""
    return cross_val_score(self.clf, self.X_, self.labels_, cv=3).mean()

  def predict(self, X):
    """Predict with the classified cluster's model, or a probability-weighted mix."""
    check_is_fitted(self, ['labels_', 'models_'])

    if self.weighted:
      if getattr(self.clf, 'n_classes_', None) == self.num_planes:
        planes_probs = self.clf.predict_proba(X)
      else:
        # The classifier saw fewer labels than there are planes: scatter its
        # probability columns into the positions given by ``classes_`` and
        # leave the remaining planes at zero.
        planes_probs = np.zeros((X.shape[0], self.num_planes))
        planes_probs[:, self.clf.classes_] = self.clf.predict_proba(X)
      per_model = np.empty((X.shape[0], self.num_planes))
      for cl_idx in range(self.num_planes):
        per_model[:, cl_idx] = self.models_[cl_idx].predict(X)
      preds = np.sum(per_model * planes_probs, axis=1)
    else:
      test_labels = self.clf.predict(X)
      preds = np.empty(X.shape[0])
      for cl_idx in range(self.num_planes):
        members = test_labels == cl_idx
        if np.sum(members) == 0:
          continue
        preds[members] = self.models_[cl_idx].predict(X[members])
    return preds


class KPlaneLabelPredictor(BaseEstimator):
  """Label predictor based on cluster centres or on cluster sizes.

  With ``weight_mode == 'size'`` the fitted state is the fraction of training
  samples per label. With any other value the fitted state is the mean of the
  training samples of each label, and prediction picks the nearest centre.

  Parameters
  ----------
  num_planes : int
    Number of labels; also stored as ``n_classes_`` at construction time.
  weight_mode : object, default='kplane'
    Only compared against the string ``'size'``.
  """

  def __init__(self, num_planes, weight_mode='kplane'):
    self.num_planes = num_planes
    self.n_classes_ = num_planes
    self.weight_mode = weight_mode

  def fit(self, X, y):
    """Compute label fractions ('size' mode) or per-label centres (otherwise).

    Returns
    -------
    self
    """
    if self.weight_mode == 'size':
      counts = np.empty(self.num_planes)
      for cl in range(self.num_planes):
        counts[cl] = np.sum(y == cl)
      self.weights = counts / np.sum(counts)
    else:
      self.centers_ = np.empty((self.num_planes, X.shape[1]))
      for cl in range(self.num_planes):
        members = y == cl
        if np.sum(members) == 0:
          # Empty cluster: park its centre far away (1e5 in every coordinate)
          # so it is not chosen as the nearest one.
          self.centers_[cl] = np.ones(X.shape[1]) * 1e5
          continue
        self.centers_[cl] = np.mean(X[members], axis=0)
    return self

  def predict(self, X):
    """Return one label per row of ``X``.

    In 'size' mode every row gets the label with the largest training
    fraction; otherwise each row gets the label of its nearest centre.
    """
    if self.weight_mode == 'size':
      # Previously the bound method itself was passed to argmax (it was never
      # called) and a scalar came back instead of one label per sample.
      most_common = int(np.argmax(self.weights))
      return np.full(X.shape[0], most_common)
    dst = cdist(self.centers_, X)
    return np.argmin(dst, axis=0)

  def predict_proba(self, X):
    """Return the weights used to mix the per-plane predictions.

    In 'size' mode this is the 1-D vector of label fractions (identical for
    all rows, so it is returned unexpanded and relies on broadcasting).
    Otherwise it is each row's distances to the centres divided by their sum.
    """
    if self.weight_mode == 'size':
      return self.weights
    dst = cdist(self.centers_, X)
    # NOTE(review): these values grow with distance, so the farthest centre
    # receives the largest weight — confirm this is intended.
    return dst.T / np.sum(dst.T, axis=1, keepdims=True)

  def score(self, X, y):
    """Fraction of rows whose predicted label equals ``y``."""
    return np.mean(self.predict(X) == y)


class KPlaneRegressor(CLRpRegressor):
  """``CLRpRegressor`` whose label classifier is a ``KPlaneLabelPredictor``.

  Parameters
  ----------
  num_planes, kmeans_coef, fuzzy, max_iter, num_tries, clr_lr
    Forwarded unchanged to ``CLRpRegressor``.
  weighted : bool or 'size', default=False
    Forwarded to ``CLRpRegressor`` (which only checks its truthiness) and
    also used as the ``weight_mode`` of the ``KPlaneLabelPredictor``.
  """

  def __init__(self, num_planes, kmeans_coef, fuzzy=False, max_iter=5,
               num_tries=1, weighted=False, clr_lr=None):
    # Store ``weighted`` exactly as given. Collapsing 'size' to True made
    # get_params()/clone() rebuild the estimator with weighted=True, which
    # silently dropped the 'size' weight mode; 'size' is truthy, so the
    # parent's ``if self.weighted`` check behaves the same.
    super(KPlaneRegressor, self).__init__(
      num_planes, kmeans_coef,
      num_tries=num_tries, fuzzy=fuzzy, max_iter=max_iter,
      clf=KPlaneLabelPredictor(num_planes, weight_mode=weighted),
      weighted=weighted, clr_lr=clr_lr,
    )


class RegressorEnsemble(BaseEstimator):
  """Average the predictions of several independently fitted copies of ``rgr``.

  Parameters
  ----------
  rgr : estimator
    Prototype regressor; it is cloned, never fitted itself.
  n_estimators : int, default=10
    Number of clones kept in ``rgrs``.
  """

  def __init__(self, rgr, n_estimators=10):
    self.rgr = rgr
    self.n_estimators = n_estimators
    self.rgrs = [clone(self.rgr) for _ in range(self.n_estimators)]

  def fit(self, X, y, init_labels=None,
          seed=None, verbose=False):
    """Fit every member on the same ``(X, y)``.

    When ``seed`` is given, NumPy's global RNG is re-seeded once before the
    first member is fitted. ``init_labels`` and ``verbose`` are passed on to
    each member's ``fit``.

    Returns
    -------
    self
    """
    if seed is not None:
      np.random.seed(seed)
    # ``n_estimators`` may have been changed after construction (e.g. through
    # set_params); rebuild the member list so that fit and predict agree on
    # how many members exist.
    if len(self.rgrs) != self.n_estimators:
      self.rgrs = [clone(self.rgr) for _ in range(self.n_estimators)]
    for member in self.rgrs:
      member.fit(X, y, init_labels, verbose=verbose)
    return self

  def predict(self, X):
    """Return the mean of the members' predictions."""
    total = np.zeros(X.shape[0])
    # Sum over exactly the members that the divisor counts.
    for member in self.rgrs:
      total += member.predict(X)
    return total / len(self.rgrs)



In [295]:
import pandas as pd

# Load the sample spreadsheet; the path is relative to the notebook's working directory.
df = pd.read_excel('data/cluster_test.xlsx') #you could add index_col=0 if there's an index
# x=[]
# 'yi' is reshaped into the single-feature design matrix and 'ai1' is used as the
# regression target by the configuration cell further down.
X=df['yi']
Y=df['ai1']
# x
# Earlier experiment, kept for reference: run the library routine directly.
# best_clr(X.values.reshape(-1, 1),Y,3)

In [296]:
# Show the raw 'yi' column; at this point X is still a 1-D pandas Series.
X

0      960
1      830
2     1260
3      610
4      590
5      900
6      820
7      880
8      860
9      760
10    1020
11    1080
12     960
13     700
14     800
15     113
16     760
17     740
18     980
19     800
Name: yi, dtype: int64

In [None]:
from sklearn.linear_model import Ridge, LinearRegression

# Settings for the manual, cell-by-cell walk-through of the
# "fit one model per cluster, then reassign labels" procedure below.
k = 3                          # number of clusters / regression models
y = Y                          # regression target (column 'ai1')
x = df['yi']
X = x.values.reshape(-1, 1)    # single feature as an (n_samples, 1) matrix
max_iter = 5
labels = None                  # None -> random initial labels are drawn below
# The iteration cell passes ``constr`` to reassign_labels(); only ``const`` was
# defined here, which raised NameError on a fresh kernel. ``const`` is kept as
# an alias in case other cells still read it.
constr = None
const = constr
verbose = None
kmeans_X = 0.0

def reassign_labels(scores, constr):
  """Pick the lowest-score cluster for every sample or constraint group.

  Parameters
  ----------
  scores : array of shape (n_samples, n_clusters)
    Cost of assigning each sample to each cluster (lower is better).
  constr : array of shape (n_samples,) or None
    Integer group id per sample. Samples sharing an id receive the same
    label, namely the cluster with the lowest mean score over the group.
    With None every sample is labelled independently.

  Returns
  -------
  labels : integer array of shape (n_samples,)
  """
  if constr is None:
    return np.argmin(scores, axis=1)
  constr = np.asarray(constr)
  # Builtin ``int`` replaces the ``np.int`` alias that was removed from NumPy.
  labels = np.empty(scores.shape[0], dtype=int)
  # Visit only the ids that actually occur: this avoids taking the mean of an
  # empty slice for unused ids and also covers non-contiguous ids.
  for c_id in np.unique(constr):
    members = constr == c_id
    labels[members] = np.argmin(np.mean(scores[members], axis=0))
  return labels

# Start from a random partition unless a labelling was supplied above.
if labels is None:
  labels = np.random.choice(k, size=X.shape[0])

# One independent, unfitted copy of the base regressor per cluster.
lr = Ridge(alpha=1e-5)
models = [clone(lr) for _ in range(k)]

# Scratch buffers with one column per cluster; both are filled in by the
# iteration cell before being read.
preds = np.empty((X.shape[0], k))
scores = np.empty_like(preds)


In [305]:

# One unrolled pass of the ``for it in range(max_iter)`` loop; re-run this cell
# to perform further passes by hand. The optional k-means penalty
# (kmeans_X > 0), the verbose logging and the ``np.allclose(labels,
# labels_prev)`` early stop of the full loop are intentionally left out here.

# Rebuild models: refit each cluster's regressor on its current members.
for cl_idx in range(k):
  members = labels == cl_idx
  if not members.any():
    # Nothing to fit on; the model keeps whatever state it already had.
    continue
  models[cl_idx].fit(X[members], y[members])

# Score every sample against every cluster's model (squared residual).
# NOTE(review): a cluster that has never had members still holds an unfitted
# model, so predict() below is expected to fail for it — confirm/handle.
for cl_idx in range(k):
  preds[:, cl_idx] = models[cl_idx].predict(X)
  scores[:, cl_idx] = (y - preds[:, cl_idx]) ** 2

# Reassign points to their best cluster, keeping the old labels for comparison.
labels_prev = labels.copy()
labels = reassign_labels(scores, constr)

# Objective: mean score of each sample under its newly assigned cluster.
obj = np.mean(scores[np.arange(preds.shape[0]), labels])

# Fraction of samples per cluster. Builtin ``float`` replaces the ``np.float``
# alias that was removed from NumPy.
weights = (labels == np.arange(k)[:, np.newaxis]).sum(axis=1).astype(float)
weights /= np.sum(weights)
weights, labels, models, obj


(array([0.5 , 0.25, 0.25]),
 array([0, 1, 2, 0, 0, 0, 2, 0, 2, 1, 0, 1, 2, 2, 0, 0, 1, 0, 1, 0],
       dtype=int64),
 [Ridge(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='auto', tol=0.001),
  Ridge(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='auto', tol=0.001),
  Ridge(alpha=1e-05, copy_X=True, fit_intercept=True, max_iter=None,
        normalize=False, random_state=None, solver='auto', tol=0.001)],
 348.97956753264197)

In [274]:
# Inspect the squared residual of every sample under each cluster's model.
# NOTE(review): the saved output has 5 rows and 2 columns although k=3 and 20
# samples are used elsewhere — it appears to come from an older run; re-execute.
scores

array([[ 7593.13212142,   400.        ],
       [ 6149.96741819, 19600.        ],
       [  401.19850123, 10000.        ],
       [ 2721.55448976,     0.        ],
       [  127.98596259,  1600.        ]])

In [276]:
# Inspect each cluster model's prediction for every sample (one column per model).
# NOTE(review): the saved output has 2 columns although k=3 is configured — it
# appears to come from an older run; re-execute to refresh.
preds

array([[147.13857998,  80.        ],
       [141.57827203,  80.        ],
       [159.97005988,  80.        ],
       [132.1685201 ,  80.        ],
       [131.31308811,  80.        ]])

In [261]:
# Same inspection of the squared-residual matrix from a different run.
# NOTE(review): this cell duplicates the earlier ``scores`` display and its
# saved 5x2 output does not match k=3 — consider removing it.
scores

array([[ 3600.        ,  6306.35502447],
       [10000.        ,  8711.39228015],
       [ 3600.        ,   124.77687513],
       [ 1600.        ,   629.64030285],
       [    0.        ,   284.54643703]])

In [249]:
# Squared-residual matrix with one column per cluster (3 columns in the saved output).
# NOTE(review): the saved output is cut off mid-row; re-execute to regenerate it.
scores

array([[2.37861344e+04, 5.93190217e+02, 7.57029374e+03],
       [2.39007416e+01, 1.70318223e+04, 3.64418681e+03],
       [9.26318911e+01, 1.15567945e+04, 3.85991979e+03],
       [2.65465220e+04, 3.30871158e+02, 1.02008123e+04],
       [1.55179924e+04, 4.41823608e+02, 3.96162610e+03],
       [1.41962792e+04, 1.76169808e+02, 2.79149205e+03],
       [3.10348581e+03, 6.41773328e+03, 8.82831394e+01],
       [1.22740561e+04, 5.05456611e+02, 2.00497540e+03],
       [3.89732248e+03, 5.13971805e+03, 1.07627900e+01],
       [3.96317092e-01, 1.89720736e+04, 4.04098893e+03],
       [1.94064241e+04, 1.43613638e+02, 5.06663474e+03],
       [7.12105489e+03, 1.63116753e+03, 2.35719517e+02],
       [6.64219651e+02, 2.42252114e+04, 8.64761716e+03],
       [5.70782307e+03, 4.27291589e+03, 1.50267496e+02],
       [1.88647857e+04, 4.62099401e-01, 5.26299008e+03],
       [3.01682145e+04, 6.13819915e+01, 1.42246634e+04],
       [1.12987061e+02, 1.63172906e+04, 2.86961347e+03],
       [1.49498885e+04, 2.87255

In [207]:
# Count the samples carrying each label 0..k-1: row c of the broadcast
# comparison is ``labels == c``. Builtin ``float`` replaces the ``np.float``
# alias that was removed from NumPy.
(labels == np.arange(k)[:, np.newaxis]).sum(axis=1).astype(float)

array([6., 5., 9.])

In [221]:
# Scratch check: in-place division of an int yields a float (the saved output is 4.0).
# NOTE(review): this rebinds ``x``, which the configuration cell uses for the
# df['yi'] column — use a throwaway name or delete this cell.
x = 12
x /= 3
x

4.0

In [194]:
# Scratch array for the shape experiments: random integers below 10, shape (4, 5).
# No seed is set, so the values differ on every run.
A = np.random.randint(10, size=(4, 5))
A

array([[2, 4, 7, 2, 8],
       [3, 1, 8, 3, 1],
       [2, 9, 1, 1, 3],
       [7, 5, 9, 5, 5]])

In [197]:
# Confirm the shape of the scratch array.
np.shape(A)

(4, 5)

In [199]:
# np.newaxis in the second index position inserts a length-1 axis there:
# (4, 5) -> (4, 1, 5), as the saved output shows.
B=A[:,np.newaxis]
np.shape(B)

(4, 1, 5)

In [98]:
# squeeze() drops every length-1 axis: (1, 1, 3, 1) -> (3,).
# The pasted ``>>>`` doctest prompts were removed; they are only tolerated by
# IPython and are a syntax error in plain Python.
# NOTE(review): this rebinds ``x``, which the configuration cell also uses.
x = np.array([[[[0], [1], [2]]]])
xy = x.squeeze()
xy

array([0, 1, 2])

In [99]:
# Squeezing an array that has no length-1 axes leaves it unchanged.
# (Pasted ``>>>`` doctest prompt removed so the cell is valid plain Python.)
xy.squeeze()


array([0, 1, 2])