In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
def Normalize(data):
    """Min-max scale every column of ``data`` into the range [0, 1].

    Parameters
    ----------
    data: array-like exposing ``min(0)`` / ``max(0)`` (e.g. a numpy array)
        Values to rescale; the minimum and maximum are taken along axis 0.

    Returns
    -------
    ``(data - min) / (max - min)`` computed column by column. A column whose
    values are all identical has a zero range; it is mapped to 0 instead of
    producing NaN from a 0/0 division.
    """
    col_min = data.min(0)
    col_range = data.max(0) - col_min
    # Guard against division by zero for constant columns: the numerator is
    # already 0 there, so dividing by 1 yields a clean 0.
    safe_range = np.where(col_range == 0, 1, col_range)
    return (data - col_min) / safe_range

class Data_split(object):
    def __init__(self, data, cl = None, trainsize = 0.7, shuffle = True, randomstate = None, scaling = True):
        """Prepare a data set for a sequential train/test split.

        Parameters
        ----------
        data: 2-D data set input (anything exposing ``.shape``), one record
            per row.

        cl: the index of the column including the classes of every record
            (default = the last column of data)

        trainsize: float, int or None, optional (default=0.7)
            If float, should be between 0.0 and 1.0 and represent the
            proportion of the records to include in the training split.
            If int, represents the absolute number of training records.
            If None, the default proportion 0.7 is used.

        shuffle:
            Whether or not to shuffle the rows before splitting.

        randomstate: int or None
            Seed passed to ``np.random.seed`` before shuffling. Note that
            this seeds NumPy's global generator. Ignored when ``shuffle``
            is false or when it is None.

        scaling:
            If True, the feature columns are min-max scaled with
            ``Normalize``. The scaling uses all rows, i.e. it is applied
            before the data is divided into train and test parts.
        """
        np0 = data.shape
        self.n = np0[0]
        self.p = np0[1]
        self.shuffle = shuffle
        self.rs = randomstate

        if self.shuffle:
            if self.rs is not None:
                np.random.seed(self.rs)
            self.data = np.random.permutation(data)
        else:
            # Without shuffling the rows are kept in their original order.
            # (Previously self.data was never assigned on this path.)
            self.data = np.asarray(data)
        self.cl = self.p-1 if cl is None else cl
        self.X = np.delete(self.data, self.cl, axis=1)

        if scaling is True:
            self.X = Normalize(self.X)
        self.Y = self.data[:, self.cl]
        self.trainsize = trainsize

    def data_split(self):
        """Cut the prepared data into a leading train part and a trailing test part.

        After the call ``self.trainsize`` holds the absolute number of
        training records, so calling the method again returns the same split.

        Returns: trainx, trainy, testx, testy

        Raises: ValueError if a float ``trainsize`` lies outside [0.0, 1.0].
        """
        size = self.trainsize
        if size is None:
            # Fall back to the documented default proportion.
            size = 0.7
        if isinstance(size, float):
            if size < 0 or size > 1:
                raise ValueError("The proportion of training data should between 0.0 and 1.0")
            size = int(size * self.n)
        # An int is already an absolute count and is used as given.
        self.trainsize = size

        self.trainX = self.X[0:size, :]
        self.trainY = self.Y[0:size]
        self.testX = self.X[size:, :]
        self.testY = self.Y[size:]
        return self.trainX, self.trainY, self.testX, self.testY

In [3]:
 def rbf(x, c, s):
        """Gaussian radial basis function.

        Evaluates exp(-(x - c)**2 / (2 * s**2)) element-wise, where ``c`` is
        the centre and ``s`` the width (standard deviation) of the bump.
        """
        scale = -1 / (2 * s**2)
        offset = x - c
        return np.exp(scale * offset**2)

In [4]:
def kmeans(X, k):
    """Performs k-means clustering for 1D input

    WE NEED TO UPDATE FOR MULTIDIMENSIONAL INPUT!

    Arguments:
        X {ndarray} -- An array of M inputs, shaped (M,) or (M, 1)
        k {int} -- Number of clusters

    Returns:
        (clusters, stds) -- two float arrays of length k: the final cluster
        centers and the standard deviation of the points assigned to each
        center. A cluster holding fewer than two points gets the standard
        deviation of the pooled points of all other (populated) clusters;
        if no cluster holds two points, the standard deviation of all
        inputs is used.

    Raises:
        ValueError -- if X is not 1D (or a single column).

    Initial centers are drawn with the global ``np.random`` generator, so
    seed it beforehand for reproducible results.
    """
    # Work on a flat float copy: with integer input the centers array would
    # be integer too and every assigned mean would be silently truncated.
    points = np.asarray(X, dtype=float)
    if points.ndim == 2 and points.shape[1] == 1:
        points = points[:, 0]
    if points.ndim != 1:
        raise ValueError("kmeans only supports 1D input of shape (M,) or (M, 1)")

    # randomly select initial clusters from input data (with replacement)
    clusters = np.random.choice(points, size=k)
    prevClusters = clusters.copy()
    stds = np.zeros(k)
    converged = False

    while not converged:
        # distances[i, j] is the distance between the ith point and the jth
        # cluster; always 2-D (M, k), so k == 1 works as well
        distances = np.abs(points[:, np.newaxis] - clusters[np.newaxis, :])

        # find the cluster that's closest to each point
        closestCluster = np.argmin(distances, axis=1)

        # update clusters by taking the mean of all of the points assigned to that cluster
        for i in range(k):
            pointsForCluster = points[closestCluster == i]
            if len(pointsForCluster) > 0:
                clusters[i] = np.mean(pointsForCluster)

        # converge if clusters haven't moved
        converged = np.linalg.norm(clusters - prevClusters) < 1e-6
        prevClusters = clusters.copy()

    distances = np.abs(points[:, np.newaxis] - clusters[np.newaxis, :])
    closestCluster = np.argmin(distances, axis=1)

    clustersWithNoPoints = []
    for i in range(k):
        pointsForCluster = points[closestCluster == i]
        if len(pointsForCluster) < 2:
            # keep track of clusters with no points or 1 point
            clustersWithNoPoints.append(i)
        else:
            stds[i] = np.std(pointsForCluster)

    # clusters with 0 or 1 points borrow the std of the pooled points of the
    # populated clusters
    if clustersWithNoPoints:
        populated = [points[closestCluster == i]
                     for i in range(k) if i not in clustersWithNoPoints]
        # np.concatenate([]) raises, so fall back to all points when no
        # cluster has at least two members
        pooled = np.concatenate(populated) if populated else points
        stds[clustersWithNoPoints] = np.std(pooled)

    return clusters, stds

In [5]:
class RBFNet(object):
    """Implementation of a Radial Basis Function Network

    Parameters
    ----------
    k: number of radial basis functions (hidden units / cluster centers)
    lr: learning rate of the online gradient updates
    epochs: number of passes over the training data
    rbf: callable ``rbf(x, c, s)`` used as the basis function
    inferStds: if True the widths are estimated per cluster from the data,
        otherwise one fixed width is used for every basis function

    The output weights ``w`` (length k) and the bias ``b`` (length 1) are
    initialised from the global ``np.random`` generator.
    """
    def __init__(self, k=2, lr=0.01, epochs=100, rbf=rbf, inferStds=True):
        self.k = k
        self.lr = lr
        self.epochs = epochs
        self.rbf = rbf
        self.inferStds = inferStds

        self.w = np.random.randn(k)
        self.b = np.random.randn(1)

    # The training and prediction routines live in module-level functions
    # named `fit` and `predict` that take the network as first argument.
    # These thin methods make `net.fit(X, y)` / `net.predict(X)` work; the
    # global names are looked up at call time, so those functions only need
    # to be defined before the first call.
    def fit(self, X, y, **kwargs):
        """Train the network on (X, y) via the module-level ``fit`` function."""
        return fit(self, X, y, **kwargs)

    def predict(self, X):
        """Return predictions for X via the module-level ``predict`` function."""
        return predict(self, X)

In [6]:
def fit(self, X, y, verbose=True):
    """Train an RBF network with online (per-sample) gradient descent.

    Written as a method but defined at module level: call it as
    ``fit(net, X, y)`` with ``net`` providing ``k``, ``lr``, ``epochs``,
    ``rbf``, ``inferStds``, ``w`` and ``b``.

    Arguments:
        X {ndarray} -- 1D inputs, shaped (M,) or (M, 1)
        y {ndarray} -- targets, indexable by sample number
        verbose {bool} -- print the squared error of every sample
            (default True)

    Sets ``self.centers`` and ``self.stds`` from ``kmeans`` and updates
    ``self.w`` / ``self.b`` in place of the previous values. Returns None.
    """
    self.centers, inferredStds = kmeans(X, self.k)
    if self.inferStds:
        # compute stds from data
        self.stds = inferredStds
    else:
        # use a fixed std derived from the largest distance between two
        # centers (for 1D centers that is simply max - min)
        dMax = np.max(self.centers) - np.min(self.centers)
        self.stds = np.repeat(dMax / np.sqrt(2*self.k), self.k)

    # training
    for epoch in range(self.epochs):
        for i in range(X.shape[0]):
            # forward pass; flatten the activations to shape (k,) so that a
            # column-vector X (samples of shape (1,)) does not broadcast the
            # weight update below into a (k, k) matrix
            a = np.array([self.rbf(X[i], c, s) for c, s in zip(self.centers, self.stds)]).reshape(-1)
            F = a.dot(self.w) + self.b

            if verbose:
                loss = (y[i] - F).flatten() ** 2
                print('Loss: {0:.2f}'.format(loss[0]))

            # backward pass
            error = -(y[i] - F).flatten()

            # online update
            self.w = self.w - self.lr * a * error
            self.b = self.b - self.lr * error

In [7]:
def predict(self, X):
    """Evaluate the trained network on every row of X.

    Written as a method but defined at module level; ``self`` must provide
    ``rbf``, ``centers``, ``stds``, ``w`` and ``b``.

    Returns an array holding one network output (activations dotted with
    the weights, plus the bias) per sample, in input order.
    """
    def forward(sample):
        # activation of every basis function for this one sample
        activations = np.array(
            [self.rbf(sample, center, width)
             for center, width in zip(self.centers, self.stds)]
        )
        return activations.T.dot(self.w) + self.b

    return np.array([forward(X[row]) for row in range(X.shape[0])])

In [23]:
# Load the headerless file with explicit column names. '?' entries
# (presumably missing values -- confirm against the data description) are
# recoded as -1 so that every column can be cast to int.
mamo = (
    pd.read_csv(
        'mammographic_masses.data',
        header=None,
        names=['BI', 'Age', 'Shape', 'Margin', 'Density', 'Class'],
    )
    .replace('?', -1)
    .astype(int)
)
mamo.head()

Unnamed: 0,BI,Age,Shape,Margin,Density,Class
0,5,67,3,5,3,1
1,4,43,1,1,-1,1
2,5,58,4,5,3,1
3,4,28,1,1,3,0
4,5,74,1,5,-1,1


In [24]:
# Data_split addresses columns by position, so hand it a plain ndarray.
# Column 5 ('Class') holds the labels; the seed makes the shuffle repeatable.
mamo = np.asarray(mamo)
Ma = Data_split(data=mamo, cl=5, randomstate=648)

In [26]:
# Echo the splitter object. Data_split defines no __repr__, so this shows
# only the default object repr rather than any of the split data.
Ma

<__main__.Data_split at 0x1086ef6d8>