###Week5: (Logistic Regression)

In [None]:
def linear_combination(X, w):
    """Return the linear scores ``X @ w``."""
    scores = X @ w
    return scores

def sigmoid(z):
    """Apply the logistic function ``1 / (1 + exp(-z))`` element-wise."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator

def activation(X, w):
    """Return the sigmoid of the linear scores of ``X`` under weights ``w``."""
    z = linear_combination(X, w)
    return sigmoid(z)

def predict(X, w, threshold):
    """Label a row 1 when its activation is strictly above ``threshold``, else 0."""
    probabilities = activation(X, w)
    return np.where(probabilities > threshold, 1, 0)

def loss(y, sigmoid_vector, weight_vector, l1_reg_rate, l2_reg_rate):
    """Return the binary cross-entropy plus L1 and L2 penalties on the weights.

    Args:
        y: Target labels (0 or 1).
        sigmoid_vector: Predicted probabilities, same shape as ``y``.
        weight_vector: Model weights that the penalties are computed on.
        l1_reg_rate: Multiplier of the sum of absolute weights.
        l2_reg_rate: Multiplier of the squared weight norm.
    """
    positive_term = y * np.log(sigmoid_vector)
    negative_term = (1 - y) * np.log(1 - sigmoid_vector)
    cross_entropy = -1 * np.sum(positive_term + negative_term)

    l1_penalty = l1_reg_rate * np.sum(np.abs(weight_vector))
    l2_penalty = l2_reg_rate * np.dot(weight_vector.T, weight_vector)
    return cross_entropy + l1_penalty + l2_penalty

def calculate_gradient(X, y, w, reg_rate):
    """Return the gradient of the L2-regularised cross-entropy with respect to ``w``."""
    residual = sigmoid(linear_combination(X, w)) - y
    return X.T @ residual + reg_rate * w

In [None]:
class LogisticRegression():
    """Binary logistic regression trained with batch gradient descent.

    After ``gd`` runs, ``w`` holds the weights, ``w_all`` the weight vector
    seen at the start of every epoch and ``err_all`` the matching loss values.
    """

    def set_weight_vector(self, w):
        """Store ``w`` as the model's weight vector."""
        self.w = w

    def linear_combination(self, X):
        """Return the linear scores ``X @ w``."""
        scores = X @ self.w
        return scores

    def sigmoid(self, z):
        """Apply the logistic function element-wise."""
        denominator = 1 + np.exp(-z)
        return 1 / denominator

    def activation(self, X):
        """Return the predicted probability for every row of ``X``."""
        z = self.linear_combination(X)
        return self.sigmoid(z)

    def predict(self, X, threshold=0.5):
        """Label a row 1 when its probability is strictly above ``threshold``."""
        probabilities = self.activation(X)
        return np.where(probabilities > threshold, 1, 0)

    def loss(self, X, y, reg_rate):
        """Return the cross-entropy on ``(X, y)`` plus ``reg_rate * w.w``."""
        probabilities = self.activation(X)
        positive_term = y * np.log(probabilities)
        negative_term = (1 - y) * np.log(1 - probabilities)
        cross_entropy = -1 * np.sum(positive_term + negative_term)
        penalty = reg_rate * np.dot(self.w.T, self.w)
        return cross_entropy + penalty

    def calculate_gradient(self, X, y, reg_rate):
        """Return the gradient of the regularised loss with respect to ``w``."""
        residual = self.activation(X) - y
        return X.T @ residual + reg_rate * self.w

    def update_weights(self, grad, lr):
        """Return the weights after one step of size ``lr`` against ``grad``."""
        step = grad * lr
        return self.w - step

    def gd(self, X, y, num_epochs, lr, reg_rate):
        """Run ``num_epochs`` of gradient descent from zero weights; return ``w``."""
        self.w = np.zeros(X.shape[1])
        self.w_all, self.err_all = [], []
        for _ in range(num_epochs):
            gradient = self.calculate_gradient(X, y, reg_rate)
            # Record the state *before* the update of this epoch.
            self.w_all.append(self.w)
            self.err_all.append(self.loss(X, y, reg_rate))
            self.w = self.update_weights(gradient, lr)
        return self.w

In [None]:
import itertools, functools
def combinations(x, degree):
    """Iterate over every size-``degree`` combination of ``x`` with repetition."""
    multisets = itertools.combinations_with_replacement(x, degree)
    return multisets

def compute_new_features(items):
    """Return the running product of all entries of ``items``, left to right."""
    def _multiply(left, right):
        return left * right

    return functools.reduce(_multiply, items)

def polynomial_transform(x, degree):
    """Expand ``x`` into polynomial features up to ``degree``.

    A 1-D input is treated as a single feature column. The output starts
    with a column of ones, followed by the products of every combination
    (with repetition) of the input columns for each power 1..``degree``.
    """
    if x.ndim == 1:
        x = x[:, None]

    columns = x.T
    features = [np.ones(len(x))]  # bias column of ones

    for power in range(1, degree + 1):
        features.extend(
            compute_new_features(group) for group in combinations(columns, power)
        )

    return np.asarray(features).T
def plot_learning_curve(err):
    """Plot the values in ``err`` against their index as a solid red line."""
    iterations = np.arange(len(err))
    plt.plot(iterations, err, 'r-')

In [None]:
def create_toy_data():
    """Generate a two-class, two-feature toy dataset.

    Returns:
        A ``(50, 2)`` feature array and a length-50 integer label array.
        The first 25 rows are standard-normal samples shifted by -1
        (label 0); the last 25 are shifted by +1 (label 1).
    """
    x0 = np.random.normal(size=50).reshape(-1, 2) - 1
    x1 = np.random.normal(size=50).reshape(-1, 2) + 1
    features = np.concatenate([x0, x1])
    # ``np.int`` was removed from NumPy; the builtin ``int`` is the replacement.
    labels = np.concatenate([np.zeros(25), np.ones(25)]).astype(int)
    return features, labels

# Use seaborn's "notebook" context with font scale 1.5 and line width 2.5 for the plots.
sns.set_context(context='notebook',font_scale=1.5,rc={"lines.linewidth":2.5})

def visualize_model(X_train, labels, lsc, degree=1):
    """Scatter the training points over the model's predicted decision regions.

    Args:
        X_train: Feature matrix; columns 1 and 2 are the plotted coordinates
            (column 0 is presumably the dummy/bias feature -- confirm
            against the caller).
        labels: Class label per row, used to colour the scatter plot.
        lsc: Fitted model exposing ``predict(X, threshold)``.
        degree: Polynomial degree applied to the grid before predicting.
    """
    plt.figure(figsize=(8, 8))

    # Use the X_train argument; the original body read an undefined
    # global named ``x_train``.
    x1_min, x1_max = np.min(X_train[:, 1]), np.max(X_train[:, 1])
    x2_min, x2_max = np.min(X_train[:, 2]), np.max(X_train[:, 2])

    # 100 x 100 grid covering the bounding box of the training points.
    x1_test, x2_test = np.meshgrid(
        np.linspace(x1_min, x1_max, 100), np.linspace(x2_min, x2_max, 100)
    )
    x_test = np.array([x1_test, x2_test]).reshape(2, -1).T
    x_test_poly = polynomial_transform(x_test, degree=degree)
    y_test = lsc.predict(x_test_poly, 0.5)

    sns.scatterplot(data=X_train, x=X_train[:, 1], y=X_train[:, 2], hue=labels)
    plt.contourf(x1_test, x2_test, y_test.reshape(100, 100), alpha=0.5,
                 levels=np.linspace(0, 1, 3))
    plt.gca().set_aspect('equal', adjustable='box')

###Week6:(Naive Bayes)

In [None]:
def fit(X, y):
    """Estimate Bernoulli naive Bayes parameters with Laplace smoothing.

    Args:
        X: Binary feature matrix of shape ``(n_samples, n_features)``.
        y: Class label per sample; any set of distinct label values.

    Returns:
        ``(w, w_priors)`` where ``w[i, j]`` is the smoothed estimate of
        ``P(x_j = 1 | class i)`` and ``w_priors[i]`` the smoothed class
        prior. Rows follow the sorted order of the unique labels.
    """
    alpha = 1  # Laplace correction
    n_samples, n_features = X.shape
    classes = np.unique(y)
    n_classes = len(classes)
    w = np.zeros((n_classes, n_features), dtype=np.float64)
    w_priors = np.zeros(n_classes, dtype=np.float64)

    # Iterate over the labels actually present rather than assuming 0..n-1.
    for idx, c in enumerate(classes):
        X_c = X[y == c]
        # A Bernoulli feature has two outcomes, so the denominator is
        # smoothed with 2 * alpha (not n_classes * alpha).
        w[idx, :] = (np.sum(X_c, axis=0) + alpha) / (X_c.shape[0] + 2 * alpha)
        w_priors[idx] = (X_c.shape[0] + alpha) / (float(n_samples) + n_classes * alpha)

    print("Weight vector:", w)
    print("Prior", w_priors)
    return w, w_priors

In [None]:
class BernoulliNB():
    """Bernoulli naive Bayes classifier for binary feature matrices.

    Attributes set by ``fit``:
        w: ``(n_classes, n_features)`` smoothed estimates of P(x_j = 1 | class).
        w_priors: Smoothed class priors.
        _classes: Sorted unique labels; row ``i`` of ``w`` belongs to ``_classes[i]``.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha  # additive (Laplace) smoothing strength

    def fit(self, X, y):
        """Estimate class-conditional probabilities and priors from ``(X, y)``."""
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        self.w = np.zeros((n_classes, n_features), dtype=np.float64)
        self.w_priors = np.zeros(n_classes, dtype=np.float64)

        # Iterate over the labels actually present rather than assuming 0..n-1.
        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            # A Bernoulli feature has two outcomes, hence 2 * alpha below.
            self.w[idx, :] = (np.sum(X_c, axis=0) + self.alpha) / (X_c.shape[0] + 2 * self.alpha)
            self.w_priors[idx] = (X_c.shape[0] + self.alpha) / (float(n_samples) + n_classes * self.alpha)

        print("Class Conditional Density:", self.w)
        print("Prior", self.w_priors)

    def log_likelihood_prior_prod(self, X):
        """Return ``log P(x | class) + log P(class)`` per sample and class."""
        return X @ (np.log(self.w).T) + (1 - X) @ np.log((1 - self.w).T) + np.log(self.w_priors)

    def predict_proba(self, X):
        """Return the posterior class probabilities, one row per sample."""
        q = self.log_likelihood_prior_prod(X)
        # Subtract the row maximum before exponentiating so that very
        # negative log-scores do not underflow to 0 / 0.
        exp_q = np.exp(q - np.max(q, axis=1, keepdims=True))
        return exp_q / np.sum(exp_q, axis=1, keepdims=True)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        q = self.log_likelihood_prior_prod(X)
        return self._classes[np.argmax(q, axis=1)]

In [None]:
class GaussianNB():
    """Gaussian naive Bayes classifier with per-class diagonal covariance.

    Attributes set by ``fit``:
        _classes: Sorted unique labels.
        _mean, _var: ``(n_classes, n_features)`` per-class feature mean / variance.
        _priors: Relative class frequencies.
    Row ``i`` of each array belongs to ``_classes[i]``.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha  # stored but not used by any method of this class

    def fit(self, X, y):
        """Estimate per-class means, variances and priors from ``(X, y)``."""
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
        self._var = np.zeros((n_classes, n_features), dtype=np.float64)
        self._priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            self._mean[idx, :] = X_c.mean(axis=0)
            # Index by position, not by label value, so labels need not be 0..n-1.
            self._var[idx, :] = X_c.var(axis=0)
            self._priors[idx] = X_c.shape[0] / float(n_samples)

        print("Mean:", self._mean)
        print("Variance:", self._var)
        print("Prior", self._priors)

    def _calc_pdf(self, class_idx, X):
        """Return the Gaussian density of the single sample ``X`` under a class."""
        mean = self._mean[class_idx]
        var = np.diag(self._var[class_idx])
        # X is one sample here, so X.shape[0] is the number of features.
        z = np.power(2 * np.pi, X.shape[0] / 2) * np.power(np.linalg.det(var), 1/2)
        return (1/z) * np.exp(-0.5 * (X - mean).T @ (np.linalg.inv(var)) @ (X - mean))

    def _calc_prod_likelihood_prior(self, X):
        """Fill ``self.q`` with ``log pdf + log prior`` per sample and class."""
        self.q = np.zeros((X.shape[0], len(self._classes)), dtype=np.float64)
        for x_idx, x in enumerate(X):
            for idx in range(len(self._classes)):
                # Column ``idx`` (position), not the label value, addresses q.
                self.q[x_idx, idx] = (np.log(self._calc_pdf(idx, x))
                                      + np.log(self._priors[idx]))

    def predict_proba(self, X):
        """Return the posterior class probabilities, one row per sample."""
        self._calc_prod_likelihood_prior(X)
        # Shift by the row maximum so the exponentials cannot all underflow.
        exp_q = np.exp(self.q - np.max(self.q, axis=1, keepdims=True))
        return exp_q / np.sum(exp_q, axis=1, keepdims=True)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        self._calc_prod_likelihood_prior(X)
        return self._classes[np.argmax(self.q, axis=1)]

In [None]:
class MultinomialNB():
    """Multinomial naive Bayes classifier for count features.

    Attributes set by ``fit``:
        _classes: Sorted unique labels.
        w: ``(n_classes, n_features)`` smoothed per-class feature probabilities;
            every row sums to 1.
        w_priors: Smoothed class priors.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha  # additive (Laplace) smoothing strength

    def fit(self, X, y):
        """Estimate feature probabilities and class priors from ``(X, y)``."""
        n_samples, n_features = X.shape
        self._classes = np.unique(y)
        n_classes = len(self._classes)
        self.w = np.zeros((n_classes, n_features), dtype=np.float64)
        self.w_priors = np.zeros(n_classes, dtype=np.float64)

        for idx, c in enumerate(self._classes):
            X_c = X[y == c]
            feature_count = np.sum(X_c, axis=0)
            total_count = np.sum(feature_count)
            # alpha is added once per feature in the numerator, so the
            # denominator needs n_features * alpha for the row to sum to 1.
            self.w[idx, :] = (feature_count + self.alpha) / (total_count + n_features * self.alpha)
            self.w_priors[idx] = (X_c.shape[0] + self.alpha) / (float(n_samples) + n_classes * self.alpha)

    def log_likelihood_prior_prod(self, X):
        """Return ``log P(x | class) + log P(class)`` per sample and class."""
        return X @ (np.log(self.w).T) + np.log(self.w_priors)

    def predict_proba(self, X):
        """Return the posterior class probabilities, one row per sample."""
        q = self.log_likelihood_prior_prod(X)
        # Shift by the row maximum so the exponentials cannot all underflow.
        exp_q = np.exp(q - np.max(q, axis=1, keepdims=True))
        return exp_q / np.sum(exp_q, axis=1, keepdims=True)

    def predict(self, X):
        """Return the most probable class label for every row of ``X``."""
        q = self.log_likelihood_prior_prod(X)
        return self._classes[np.argmax(q, axis=1)]

###Week7:(Softmax Regression, KNN)

In [None]:
# For softmax regression, we've to one-hot encode y
def convert_to_one_hot_encoding(y, k):
    """One-hot encode the labels ``y`` into ``k`` columns.

    Each label is mapped to its rank among the sorted unique labels of
    ``y`` (so arbitrary label values become 0, 1, 2, ...), and that rank
    selects the column set to 1. The input ``y`` is not modified.

    Args:
        y: 1-D sequence of class labels.
        k: Number of classes, i.e. number of output columns.

    Returns:
        Float array of shape ``(len(y), k)``.
    """
    y = np.asarray(y).ravel()
    # return_inverse yields, for every element, its index in the sorted
    # unique labels -- the whole array is remapped without touching y.
    _, class_index = np.unique(y, return_inverse=True)
    class_index = np.ravel(class_index)

    y_one_hot = np.zeros((len(y), k))
    y_one_hot[np.arange(len(y)), class_index] = 1
    return y_one_hot

In [None]:
def linear_combination(X, w, b):
    """Return the affine scores ``X @ w + b``."""
    scores = X @ w
    return scores + b
def softmax(z):
    """Return the softmax of ``z`` along its last axis.

    Works for a single score vector (1-D) as well as a batch of score
    rows (2-D). The maximum of each row is subtracted before
    exponentiating so no row can overflow or underflow entirely.
    """
    z = np.asarray(z, dtype=float)
    shifted = z - np.max(z, axis=-1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=-1, keepdims=True)


In [None]:
def fit(X, y, lr, k, epochs):
    """Train softmax regression with batch gradient descent.

    Args:
        X: Feature matrix of shape ``(n, m)``.
        y: Class label per sample.
        lr: Learning rate.
        k: Number of classes.
        epochs: Number of full-batch gradient steps.

    Returns:
        ``(w, b, losses)``: weights of shape ``(m, k)``, bias of shape
        ``(k,)`` and the mean cross-entropy recorded at every epoch.
    """
    n, m = X.shape

    w = np.random.random((m, k))
    b = np.random.random(k)

    # The encoding does not depend on the weights, so compute it once.
    # A copy is passed so the caller's labels are never altered.
    y_hot = convert_to_one_hot_encoding(np.array(y), k)

    losses = []

    for epoch in range(epochs):
        z = linear_combination(X, w, b)
        y_hat = softmax(z)

        error = y_hat - y_hot
        w_grad = (1/n) * (X.T @ error)
        # Sum over samples only: one bias gradient per class. Summing over
        # every entry would collapse to a single scalar that is ~0.
        b_grad = (1/n) * np.sum(error, axis=0)

        w = w - lr * w_grad
        b = b - lr * b_grad

        # Cross-entropy of the true class, selected through the one-hot rows.
        loss = -np.mean(np.sum(y_hot * np.log(y_hat), axis=1))
        losses.append(loss)
    return w, b, losses

def predict(X, w, b):
    """Return the index of the highest softmax probability for each row of ``X``."""
    scores = X @ w + b
    probabilities = softmax(scores)
    return np.argmax(probabilities, axis=1)

def accuracy(y, y_hat):
    """Return the fraction of positions where ``y`` and ``y_hat`` agree."""
    matches = np.sum(y == y_hat)
    return matches / len(y)



In [None]:
def EuclideanDistance(x1,x2):
  """Return the Euclidean (L2) distance between ``x1`` and ``x2``.

  The difference is reduced along the last axis, so ``x1`` may be a
  matrix of rows compared against one vector ``x2`` (one distance per
  row) or both may be plain vectors (a single distance).
  """
  # The square root turns the sum of squares into the actual distance;
  # nearest-neighbour ordering is unaffected because sqrt is monotonic.
  dist = np.sqrt(np.sum((x1-x2)**2,axis=-1))
  return dist

def ManhattanDistance(x1,x2):
  """Return the Manhattan (L1) distance between each row of ``x1`` and ``x2``."""
  dist = np.sum(np.abs(x1-x2),axis=1)
  return dist

In [None]:
class KNN:
  """k-nearest-neighbours model for classification or regression.

  ``task_type == 'Classification'`` predicts the most frequent label among
  the k nearest training examples; any other value predicts their mean.
  """

  def __init__(self,k,distance_metric=EuclideanDistance,task_type="Classification"):
    self._k = k
    self._distance_metric = distance_metric
    self._task_type = task_type

  def fit(self,X,y):
    """Store the training data; k-NN has no other training step."""
    self._X = X
    self._y = y

  def predict(self,newExample):
    """Predict the label of one example.

    Returns:
      ``(label, k_nearest_neighbours_indices)``: the predicted label and
      the indices of the k nearest training examples.
    """
    # Distance from the new example to every stored training example.
    distance_vector = self._distance_metric(self._X,newExample)

    # argpartition places the k smallest distances in the first k slots.
    # kth must be k - 1 (a valid index) so that k == n_samples also works;
    # k is capped at the number of stored examples.
    k = min(self._k, len(distance_vector))
    k_nearest_neighbours_indices = np.argpartition(distance_vector,k-1)[:k]

    # Labels of the selected neighbours.
    k_nearest_neighbours = self._y[k_nearest_neighbours_indices]

    if self._task_type == 'Classification':
      # Majority vote; ties resolve to the smallest label because
      # np.unique returns sorted values and argmax picks the first maximum.
      values, counts = np.unique(k_nearest_neighbours, return_counts=True)
      label = values[np.argmax(counts)]
    else:
      label = k_nearest_neighbours.mean()

    return label, k_nearest_neighbours_indices

  def eval(self, X_test,y_test):
    """Score the model on a test set.

    Returns the fraction of correct predictions for classification and
    the root-mean-squared error for regression.
    """
    y_predicted = np.zeros(y_test.shape)
    for i in range(y_test.shape[0]):
      y_predicted[i],_ = self.predict(X_test[i,:])

    if self._task_type == 'Classification':
      error = np.mean(y_test==y_predicted, axis = 0)
    else:
      # Computed once, after every prediction has been filled in.
      error_vector = y_predicted - y_test
      error = np.sqrt((error_vector.T@error_vector)/error_vector.ravel().shape[0])
    return error
    



In [None]:
def draw_decision_boundary(model, axis_chart, num_points=201, opacity=0.05):
  """Scatter the model's predictions over a grid spanning the training data.

  Args:
    model: Fitted model exposing ``_X`` (training features, two columns
      used) and ``predict(point)`` returning ``(label, ...)``.
    axis_chart: Object with a matplotlib-style ``scatter`` method.
    num_points: Grid resolution along each axis.
    opacity: Alpha value of the plotted grid points.
  """
  # The grid extends 2 units beyond the data range on each side.
  tx = np.linspace(np.min(model._X[:,0],axis=0)-2,
                   np.max(model._X[:,0],axis=0)+2, num_points)
  # The vertical axis must follow the second feature (column 1).
  ty = np.linspace(np.min(model._X[:,1],axis=0)-2,
                   np.max(model._X[:,1],axis=0)+2, num_points)
  xx,yy = np.meshgrid(tx,ty)
  grid_prediction = np.zeros(xx.shape)

  for i in range(num_points):
    for j in range(num_points):
      grid_prediction[i][j],_ = model.predict([xx[i][j],yy[i][j]])

  axis_chart.scatter(xx.ravel(),yy.ravel(),c=grid_prediction.ravel(),alpha=opacity)
  

In [None]:
def maketwospirals(num_points=1000,rotations =2, noise = .5):
  '''
  Generate two interleaved spirals; the second is the first mirrored
  through the origin.

  Parameters:
  num_points: number of points generated per spiral
  rotations : how many times each spiral rotates
  noise: scale of the uniform noise added to both coordinates

  Returns
  -------
  (points, labels): points has shape (2*num_points, 2); labels is 0 for
  the first spiral and 1 for the second.
  '''
  angle = np.sqrt(np.random.rand(num_points,1))*rotations * (2*np.pi)
  rx = -np.cos(angle)*angle + np.random.rand(num_points,1) * noise
  ry =  np.sin(angle)*angle + np.random.rand(num_points,1)*noise

  first_spiral = np.hstack((rx,ry))
  second_spiral = np.hstack((-rx,-ry))

  points = np.concatenate((first_spiral, second_spiral), axis=0)
  labels = np.concatenate((np.zeros(num_points), np.ones(num_points)), axis=0)
  return (points, labels)
  

### Week8:(SVM)

In [None]:
class softSVM:
  """Soft-margin linear SVM trained by sub-gradient descent on the hinge loss.

  Labels are expected as -1 / +1. ``C`` weights the hinge-loss term
  against the squared norm of the weights.
  """

  def __init__(self,C):
    # Training data and its dimensions are filled in by fit().
    self.X = None
    self.y = None
    self.n = 0  # number of data points
    self.d = 0  # number of dimensions

    # Model parameters.
    self.C = C
    self.w = None
    self.b = None
    self._support_vectors = None

  def __decision_function(self,X):
    """Return the signed score ``X.w + b`` of every row."""
    return X.dot(self.w) + self.b

  def __cost(self,margin):
    """Return the regularised hinge loss for the given margins."""
    hinge = np.maximum(0,1-margin)
    return (1/2)*(self.w).dot(self.w) + self.C*np.sum(hinge)

  def __margin(self, X,y):
    """Return ``y * score`` for every row."""
    return y*self.__decision_function(X)

  def fit(self,X,y, lr=1e-3, epochs=500):
    """Fit the model with ``epochs`` sub-gradient steps of size ``lr``."""
    self.n, self.d = X.shape
    self.w = np.random.randn(self.d)
    self.b = 0

    # Kept only so plot_decision_boundary can draw the training set.
    self.X = X
    self.y = y

    loss_array = []

    epoch = 0
    while epoch < epochs:
      margin = self.__margin(X,y)
      loss_array.append(self.__cost(margin))

      # Points inside the margin or misclassified drive the update.
      violating_idx = np.where(margin < 1)[0]
      y_violating = y[violating_idx]

      d_w = self.w - self.C * y_violating.dot(X[violating_idx])
      d_b = -self.C * np.sum(y_violating)

      self.w = self.w - lr * d_w
      self.b = self.b - lr * d_b
      epoch += 1

    final_margin = self.__margin(X,y)
    self._support_vectors = np.where(final_margin <= 1)[0]

  def predict(self,X):
    """Return the sign of the decision function for every row."""
    scores = self.__decision_function(X)
    return np.sign(scores)

  def score(self,X,y):
    """Return the fraction of rows whose prediction equals ``y``."""
    predictions = self.predict(X)
    return np.mean(y==predictions)

  def plot_decision_boundary(self):
    """Plot the training data, the boundary, both margins and the support vectors."""
    plt.scatter(self.X[:,0],self.X[:,1],c = self.y, marker='o',s = 100, cmap = 'autumn')

    ax = plt.gca()
    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # 30 x 30 grid over the current axes on which the model is evaluated.
    xx = np.linspace(xlim[0],xlim[1],30)
    yy = np.linspace(ylim[0],ylim[1],30)
    YY,XX = np.meshgrid(yy,xx)
    grid_points = np.vstack([XX.ravel(),YY.ravel()]).T
    z = self.__decision_function(grid_points).reshape(XX.shape)

    # Level 0 is the decision boundary; -1 and +1 are the margins.
    ax.contour(XX,YY,z, colors=['g','k','g'],levels = [-1,0,1],
               linestyles=['--','-','--'],linewidths = [2.0,2.0,2.0])

    # Circle the support vectors.
    sv = self._support_vectors
    ax.scatter(self.X[:,0][sv], self.X[:,1][sv], s =250,
               linewidth =1 , facecolors = 'none',edgecolors = 'k')

    plt.xlabel('x1')
    plt.ylabel('x2')

    plt.show()

In [None]:
# Two Gaussian blobs centred at (0, 0) and (6, 6).
X, Y = make_blobs(
    n_samples=60,
    n_features=2,
    centers=[[0, 0], [6, 6]],
    cluster_std=1,
    random_state=12,
)

# Relabel the classes from {0, 1} to {-1, +1}.
Y = np.where(Y == 0, -1, 1)

# Plot the two classes.
plt.figure(figsize=(8, 8))
plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y, s=200, edgecolor='k')
plt.xlabel('x1', fontsize=20)
plt.ylabel('x2', fontsize=20)
plt.title('Data points', fontsize=20)
plt.show()

In [None]:
# Train the soft-margin SVM on the blob data and show the learned weights.
svm = softSVM(C=1)
svm.fit(X, Y)

print(svm.w)
def get_hyperplane_value(x,w,b,offset):
  """Solve ``w[0]*x + w[1]*x2 + b + offset = 0`` for ``x2``."""
  numerator = w[0]*x+b+offset
  return -1*numerator/w[1]

fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(1, 1, 1)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=Y, s=200, edgecolor='k', cmap='autumn')

# Each line is drawn between the smallest and largest x1 in the data.
x0_1 = np.amin(X[:, 0])
x0_2 = np.amax(X[:, 0])

# Separating hyperplane (offset 0).
x1_1 = get_hyperplane_value(x0_1, svm.w, svm.b, 0)
x1_2 = get_hyperplane_value(x0_2, svm.w, svm.b, 0)

# Bounding plane with offset -1.
x1_1_m = get_hyperplane_value(x0_1, svm.w, svm.b, -1)
x1_2_m = get_hyperplane_value(x0_2, svm.w, svm.b, -1)

# Bounding plane with offset +1.
x1_1_p = get_hyperplane_value(x0_1, svm.w, svm.b, 1)
x1_2_p = get_hyperplane_value(x0_2, svm.w, svm.b, 1)

ax.plot([x0_1, x0_2], [x1_1, x1_2], 'y')
ax.plot([x0_1, x0_2], [x1_1_m, x1_2_m], 'k--')
ax.plot([x0_1, x0_2], [x1_1_p, x1_2_p], 'k--')

# Keep the y-axis within 3 units of the data range.
x1_min = np.amin(X[:, 1])
x1_max = np.amax(X[:, 1])
ax.set_ylim([x1_min - 3, x1_max + 3])

ax.legend(['Hyperplane', 'Bounding plane 1', 'Bounding plane 2'],
          loc='center left', bbox_to_anchor=(1, 0.5))
plt.xlabel('x1')
plt.ylabel('x2')
plt.show()


In [None]:
class SVMDualProblem:
  """Kernel SVM trained by projected gradient ascent on the dual objective.

  ``kernel='poly'`` selects the polynomial kernel ``(1 + x.y) ** degree``;
  any other value selects the RBF kernel ``exp(-||x - y||^2 / sigma^2)``.
  Labels are expected as -1 / +1.
  """

  def __init__(self,C=1.0, kernel='rbf', sigma=0.1,degree =2):
    self.C = C
    if kernel == 'poly':
      self.kernel= self._polynomial_kernel
      self.c = 1
      self.degree = degree

    else:
      self.kernel = self._rbf_kernel
      self.sigma = sigma

    self.X = None
    self.y = None
    self.alpha = None
    self.b = 0
    self.ones = None

  def _rbf_kernel(self,X1,X2):
    """Return the RBF kernel matrix between the rows of ``X1`` and ``X2``."""
    # Pairwise squared distances through broadcasting: (n1, 1, d) - (1, n2, d).
    squared_dist = np.linalg.norm(X1[:,np.newaxis]-X2[np.newaxis,:],axis=2)**2
    return np.exp(-(1/self.sigma**2)*squared_dist)

  def _polynomial_kernel(self, X1,X2):
    """Return the polynomial kernel matrix between the rows of ``X1`` and ``X2``."""
    return (self.c + X1.dot(X2.T))**self.degree

  def fit(self, X, y, lr =1e-3, epochs =100):
    """Optimise the dual variables, derive the bias and plot the objective.

    Args:
      X: Training features, one row per sample.
      y: Training labels (-1 / +1).
      lr: Step size of the gradient ascent.
      epochs: Number of ascent steps.
    """
    self.X = X
    self.y = y

    self.alpha = np.random.random(X.shape[0])
    self.b = 0

    self.ones = np.ones(X.shape[0])

    # y_i * y_j * K(x_i, x_j), constant over the whole optimisation.
    y_iy_jk_ij = np.outer(y,y)*self.kernel(X,X)

    losses = []

    for _ in range(epochs):
      gradient = self.ones - y_iy_jk_ij.dot(self.alpha)

      self.alpha = self.alpha + lr*gradient

      # Project back onto the box constraint 0 <= alpha <= C.
      self.alpha[self.alpha > self.C] = self.C
      self.alpha[self.alpha < 0] = 0

      loss = np.sum(self.alpha) -0.5 * np.sum(
          np.outer(self.alpha,self.alpha)*y_iy_jk_ij)

      losses.append(loss)

    # Support vectors strictly inside the box. Both comparisons need their
    # own parentheses because ``&`` binds tighter than ``>``.
    index = np.where((self.alpha > 0) & (self.alpha < self.C))[0]
    if index.size == 0:
      # No strictly-interior multipliers: fall back to all support vectors.
      index = np.where(self.alpha > 0)[0]

    if index.size == 0:
      self.b = 0.0  # no support vectors at all; keep the bias at zero
    else:
      b_i = y[index] - (self.alpha*y).dot(self.kernel(X,X[index]))
      self.b = np.mean(b_i)

    plt.plot(losses)
    plt.title("loss per epochs")
    plt.show()

  def __decision_function(self,X):
    """Return the signed decision score of every row of ``X``."""
    return (self.alpha*self.y).dot(self.kernel(self.X,X)) + self.b

  def predict(self,X):
    """Return the sign of the decision function for every row."""
    return np.sign(self.__decision_function(X))

  def score(self,X,y):
    """Return the fraction of rows whose prediction equals ``y``."""
    y_hat = self.predict(X)
    return np.mean(y==y_hat)

  def plot_decision_boundary(self):
    """Plot the training data, the boundary, the margins and the support vectors."""
    plt.scatter(self.X[:,0],self.X[:,1], c = self.y, s=50, cmap = plt.cm.Paired, alpha =0.5)
    ax = plt.gca()

    xlim = ax.get_xlim()
    ylim = ax.get_ylim()

    # 30 x 30 grid over the current axes on which the model is evaluated.
    xx = np.linspace(xlim[0],xlim[1],30)
    yy = np.linspace(ylim[0],ylim[1],30)
    YY,XX = np.meshgrid(yy,xx)
    xy = np.vstack([XX.ravel(),YY.ravel()]).T
    z = self.__decision_function(xy).reshape(XX.shape)

    # Level 0 is the decision boundary; -1 and +1 are the margins.
    ax.contour(XX,YY, z, colors=['b','g','r'],levels = [-1,0,1],alpha=0.5,
               linestyles = ['--','-','--'],linewidths = [2.0,2.0,2.0])
    # Circle every training point with a non-zero multiplier.
    ax.scatter(self.X[:,0][self.alpha > 0.],self.X[:,1][self.alpha > 0.],s=50,
               linewidths=1, facecolors ='none', edgecolors = 'k')
    plt.show()

 
    

### Week9

In [None]:
import pandas as pd
import numpy as np

In [None]:
def find_entropy_whole(df):
    """Return the Shannon entropy (base 2) of the target column.

    The target is the last column of ``df``.
    """
    target_column = df[df.keys()[-1]]
    total = len(target_column)
    counts = target_column.value_counts()

    overall_entropy = 0
    for value in target_column.unique():
        p = counts[value] / total
        overall_entropy += -p * np.log2(p)
    return overall_entropy

In [None]:
def find_entropy_of_attribute(df, attribute):
    """Return the weighted target entropy after splitting on ``attribute``.

    For every distinct value of ``attribute`` the entropy of the target
    (the last column of ``df``) is computed on the matching rows and
    weighted by the share of rows holding that value.
    """
    # Machine epsilon keeps the division and the logarithm defined when a
    # count is zero; it is defined locally so the function is self-contained.
    eps = np.finfo(float).eps

    target = df.keys()[-1]
    values_in_target = df[target].unique()
    # Distinct values of the attribute (e.g. hot, cold for temperature).
    values_in_attribute = df[attribute].unique()

    entropy_attribute = 0
    for value_in_attribute in values_in_attribute:
        attribute_mask = df[attribute] == value_in_attribute
        den = attribute_mask.sum()

        subset_entropy = 0
        for value_in_target in values_in_target:
            # Both masks share df's index, so they combine without re-alignment.
            num = (attribute_mask & (df[target] == value_in_target)).sum()
            p = num / (den + eps)
            subset_entropy += -p * np.log2(p + eps)

        p2 = den / len(df)
        entropy_attribute += p2 * subset_entropy
    return abs(entropy_attribute)

In [None]:
# Report the split entropy of every attribute (every column except the last).
for attribute in df.keys()[:-1]:
    attribute_entropy = find_entropy_of_attribute(df, attribute)
    print(f'Entropy of the attribute "{attribute}" is :', attribute_entropy)

In [None]:
def find_best_attribute_to_divide(df):
    """Return the name of the attribute with the highest information gain.

    Every column except the last (the target) is a candidate attribute.
    """
    all_attributes = df.keys()[:-1]

    # The entropy of the whole table is the same for every attribute,
    # so it is computed once.
    whole_entropy = find_entropy_whole(df)

    # Information gain of every attribute, in column order.
    IG = [
        whole_entropy - find_entropy_of_attribute(df, attribute)
        for attribute in all_attributes
    ]

    # Pick the attribute at the position of the largest gain.
    index_of_attribute_with_max_IG = np.argmax(IG)
    best_attribute = all_attributes[index_of_attribute_with_max_IG]
    return best_attribute


In [None]:
def buildTree(df, tree=None):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        df: DataFrame whose last column is the target.
        tree: Optional dictionary to add the root node to; a new one is
            created when omitted.

    Returns:
        ``{attribute: {value: subtree_or_label, ...}}``.
    """
    # The target is the last column of the dataframe.
    target = df.keys()[-1]

    # Attribute with the maximum information gain becomes this node.
    node = find_best_attribute_to_divide(df)

    # Distinct values of that attribute.
    attValue = np.unique(df[node])

    if tree is None:
        tree = {}
    # Also covers a caller-supplied dictionary that lacks this node.
    tree.setdefault(node, {})

    for value in attValue:
        subtable = df[df[node] == value].reset_index(drop=True)
        # Use the actual target column instead of a hard-coded column name.
        clValue, counts = np.unique(subtable[target], return_counts=True)
        if len(counts) == 1:
            # Pure subset: store the class as a leaf.
            tree[node][value] = clValue[0]
        elif len(subtable) == len(df):
            # The best split did not shrink the table, so recursing would
            # never terminate; fall back to the majority class.
            tree[node][value] = clValue[np.argmax(counts)]
        else:
            tree[node][value] = buildTree(subtable)
    return tree



###Week10: (Random Forest, Gradient Boosting)

---



In [None]:
import numpy as np
def bag (X,y, random_state=None):
  """Draw a bootstrap sample (same size, with replacement) of ``X`` and ``y``.

  Args:
    X: Feature matrix; rows are sampled.
    y: Targets aligned with the rows of ``X``.
    random_state: Optional integer seed for a reproducible sample. When
      omitted the global NumPy random state is used, so successive calls
      return different samples.

  Returns:
    ``(X[indices], y[indices])`` for the sampled row indices.
  """
  n_samples = X.shape[0]

  # np.random.choice takes no ``random_state`` keyword; a seeded
  # RandomState provides reproducibility when a seed is requested.
  rng = np.random if random_state is None else np.random.RandomState(random_state)
  indices = rng.choice(n_samples,size = n_samples,replace=True)

  return X[indices], y[indices]

In [None]:
def most_common_label(y):
  """Return the label that occurs most often in ``y``."""
  top_entry = Counter(y).most_common(1)[0]
  return top_entry[0]


# Demonstration: label frequencies and the most frequent label.
y = [1, 1, 1, 0, 0, 2, 2, 2, 2, 3, 3, 3]
print(Counter(y))
top_two = Counter(y).most_common(2)
print(top_two[0][0])


In [None]:
class RandomForest:
  """Container for the hyperparameters and the trees of a random forest."""

  def __init__(self, n_trees=10, min_samples_split=2, max_depth=100, max_features=None):
    # No trees exist until the forest is fitted.
    self.trees = []
    self.n_trees = n_trees                      # number of trees to generate
    self.max_depth = max_depth                  # maximum depth of each decision tree
    self.max_features = max_features            # maximum number of features considered
    self.min_samples_split = min_samples_split  # minimum samples required for a split

In [None]:
def fit(self,X,y):
  """Fit ``self.n_trees`` decision trees, each on its own bootstrap sample."""
  self.trees = []  # discard trees from any previous fit
  tree_params = dict(
      min_samples_split=self.min_samples_split,
      max_depth=self.max_depth,
      max_features=self.max_features,
  )
  for _ in range(self.n_trees):  # only the repetition matters, not the index
    tree = DecisionTreeClassifier(**tree_params)
    X_sample, y_sample = bag(X,y)
    tree.fit(X_sample,y_sample)
    self.trees.append(tree)
    


In [None]:
def predict(self,X):
  """Return the majority vote of all trees for every row of ``X``."""
  # One row of predictions per tree ...
  per_tree = np.array([tree.predict(X) for tree in self.trees])
  # ... swapped so that each row holds all tree votes for one sample.
  per_sample = np.swapaxes(per_tree,0,1)
  votes = []
  for sample_votes in per_sample:
    votes.append(most_common_label(sample_votes))
  return np.array(votes)

  

In [None]:
def accuracy(y_true, y_pred):
  """Return the fraction of predictions that equal the true labels."""
  n_correct = np.sum(y_true == y_pred)
  return n_correct/len(y_true)

In [None]:
def GradBoost(model, X_train, y_train, X_test, boosting_rounds, learning_rate: float = 0.1):
  """Boost ``model`` on the residuals of a running prediction.

  Both predictions start at the mean of ``y_train``. In every round the
  model is refitted to the current training residuals and its scaled
  prediction is added to the train and test predictions.

  Returns:
    ``(y_hat_train, y_hat_test)`` after ``boosting_rounds`` rounds.
  """
  # Initial guess for train and test: the mean of the training target.
  baseline = np.mean(y_train)
  y_hat_train = np.repeat(baseline, len(y_train))
  y_hat_test = np.repeat(baseline, len(X_test))

  residuals = y_train - y_hat_train

  for _ in range(boosting_rounds):
    # Fit the model to what the running prediction still gets wrong.
    model = model.fit(X_train, residuals)

    # Move both predictions by a learning-rate-sized step.
    train_step = learning_rate*model.predict(X_train)
    y_hat_train = y_hat_train + train_step
    test_step = learning_rate * model.predict(X_test)
    y_hat_test = y_hat_test + test_step

    # Residuals for the next round.
    residuals = y_train - y_hat_train
  return y_hat_train, y_hat_test


###Week11: (Clustering)

In [None]:
def fit(X,k, max_iter=10):
  """Cluster the rows of ``X`` into ``k`` groups with k-means.

  Args:
    X: Data matrix, one point per row.
    k: Number of clusters; must not exceed the number of rows.
    max_iter: Maximum number of centroid updates.

  Returns:
    ``(centroids, labels)``: the ``k`` cluster centres and, for every
    point, the index of its nearest centre.
  """
  # Pick k *distinct* points as initial centroids; sampling with
  # replacement can select the same point twice and start with an
  # empty cluster.
  initial_idx = np.random.choice(X.shape[0], size=k, replace=False)
  centroids = X[initial_idx]

  # Initial label of each data point: its nearest centroid.
  labels = np.argmin(cdist(X,centroids), axis=1)

  for _iteration in range(max_iter):
    # Keep the labels to detect convergence below.
    previous_labels = labels.copy()

    # New centroid = mean of the assigned points. A cluster that lost all
    # its points keeps its previous centroid instead of becoming NaN.
    centroids = np.array([
        np.mean(X[labels==r], axis=0) if np.any(labels==r) else centroids[r]
        for r in range(k)
    ])

    # Re-assign every point to its nearest centroid.
    labels = np.argmin(cdist(X,centroids),axis=1)

    # Stop as soon as no label changes.
    if np.array_equal(labels, previous_labels):
      break
  return centroids, labels

In [None]:
def plot_kmeans(X,centroids, labels, rseed=0, ax=None):
  """Plot clustered points with one translucent circle per cluster.

  Each circle is centred on a centroid; its radius is the distance to
  the farthest point assigned to that cluster. ``rseed`` is accepted
  but not used. When ``ax`` is omitted the current axes are used.
  """
  if not ax:
    ax = plt.gca()
  ax.axis('equal')

  # Data points, coloured by cluster label.
  ax.scatter(X[:,0],X[:,1],c=labels, s=40, cmap='plasma', zorder=2)

  # Radius of every cluster: distance from its centre to its farthest member.
  radii = []
  for i, center in enumerate(centroids):
    radii.append(cdist(X[labels==i],[center]).max())

  # Circular patch drawn behind the points of each cluster.
  for center, radius in zip(centroids, radii):
    circle = plt.Circle(center,radius, fc='#CCCCCC',lw =5, alpha=0.5,zorder=1)
    ax.add_patch(circle)

###Week2:

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
def fit(X,y):
  """Return the least-squares weights ``pinv(X) @ y``."""
  pseudo_inverse = np.linalg.pinv(X)
  return pseudo_inverse@y
def loss(X,y,w):
  """Return half the sum of squared errors of ``X @ w`` against ``y``."""
  residual = X@w - y
  return 1/2 * residual.T@residual

def calculate_gradient(X,y,w):
  """Return the gradient ``X.T @ (X @ w - y)`` of the squared-error loss."""
  residual = X@w-y
  return X.T@residual

def update_weights(w,grad,lr):
  """Return ``w`` moved by one step of size ``lr`` against ``grad``."""
  step = lr*grad
  return w - step


def gradient_descent(X,y,lr,num_epochs):
  """Minimise the squared-error loss with batch gradient descent.

  Args:
    X: Feature matrix.
    y: Target vector.
    lr: Learning rate.
    num_epochs: Number of gradient steps.

  Returns:
    ``(w, err_all, w_all)``: the final weights, the loss before every
    step and the weight vector before every step.
  """
  w_all = []
  err_all = []

  w = np.zeros((X.shape[1]))
  for i in np.arange(0,num_epochs):
    w_all.append(w)
    err_all.append(loss(X,y,w))
    grad = calculate_gradient(X,y,w)

    # Only the progress report is limited to every 100th iteration.
    if (i%100==0):
      print('Iteration {0}#, loss {1:.2f} : '.format(i,err_all[-1]))

    # The weights are updated on every iteration.
    w = update_weights(w,grad,lr)
  return w, err_all, w_all

import numpy as np
from sklearn.model_selection import train_test_split
def generate_data(n=1000_000):
  """Generate ``n`` samples of a noisy line with intercept 4 and slope 3.

  Returns:
    ``(X, y)`` where ``X`` holds a dummy column of ones followed by one
    uniform feature in [0, 10), and ``y = X @ [4, 3]`` plus uniform
    noise in [0, 1).
  """
  true_w = np.array([4,3])
  features = add_dummy_feature(10*np.random.rand(n))
  noise = np.random.rand(n)
  targets = features@true_w + noise
  return features, targets

def preprocess(X,y):
  """Split ``(X, y)`` into an 80 % training and a 20 % test part.

  Returns:
    ``(X_train, y_train, X_test, y_test)``.
  """
  # train_test_split returns the two parts of X first, then the two
  # parts of y; unpack in that order and re-order for the return value.
  X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)
  return X_train, y_train,X_test,y_test

def add_dummy_feature(X):
  """Return ``X`` with a leading column of ones."""
  bias_column = np.ones(X.shape[0])
  return np.column_stack((bias_column, X))

def predict(X,w):
  """Return the linear predictions ``X @ w``."""
  predictions = X@w
  return predictions

def plot_learning_curves(err_all):
  """Plot the recorded loss values against the iteration number."""
  plt.plot(err_all)
  plt.xlabel('Iterations #')
  # Raw string: ``\m`` is not a valid escape sequence in a normal string
  # literal; the label text itself is unchanged.
  plt.ylabel(r'Loss: $J(\mathbf{w})$')

def learning_schedule(t):
  """Return the learning rate ``200 / (100000 + t)`` for step ``t``."""
  numerator = 200
  offset = 100000
  return numerator/(offset+t)

def mbgd(X,y,num_epochs,batch_size):
  """Minimise the squared-error loss with mini-batch gradient descent.

  The data is reshuffled every epoch and the learning rate follows
  ``learning_schedule`` of the running step count.

  Returns:
    ``(w, err_all, w_all)``: the final weights, the mini-batch loss
    before every step and the weights after every step.
  """
  n_samples = X.shape[0]
  w = np.zeros((X.shape[1]))
  w_all, err_all = [], []
  step = 0
  for _ in range(num_epochs):
    order = np.random.permutation(n_samples)
    X_shuffled, y_shuffled = X[order], y[order]

    for start in range(0,n_samples,batch_size):
      step += 1
      stop = start+batch_size
      xi, yi = X_shuffled[start:stop], y_shuffled[start:stop]

      err_all.append(loss(xi,yi,w))

      grad = 2/batch_size * calculate_gradient(xi,yi,w)
      w = update_weights(w,grad,learning_schedule(step))
      w_all.append(w)
  return w ,err_all, w_all

def sgd(X,y,num_epochs):
  """Minimise the squared-error loss with stochastic gradient descent.

  Every step uses one row drawn uniformly at random (with replacement);
  the learning rate follows ``learning_schedule`` of the step count.

  Returns:
    ``(w, err_all, w_all)``: the final weights, the single-sample loss
    before every step and the weights before every step.
  """
  n_samples = X.shape[0]
  w = np.zeros((X.shape[1]))
  w_all, err_all = [], []

  for epoch in range(num_epochs):
    for i in range(n_samples):
      pick = np.random.randint((n_samples))
      # Slices of length one keep xi two-dimensional and yi one-dimensional.
      xi, yi = X[pick:pick+1], y[pick:pick+1]

      err_all.append(loss(xi,yi,w))
      w_all.append(w)

      grad = 2* calculate_gradient(xi,yi,w)
      lr = learning_schedule(epoch*n_samples+i)
      w = update_weights(w,grad,lr)

  return w, err_all,w_all

      

In [None]:
class LinReg():
    """Linear regression fitted in closed form or by gradient descent.

    Attributes:
        t0, t1: Constants of the decaying learning-rate schedule.
        w: Weight vector, set by ``fit`` / ``gd`` / ``mbgd`` / ``sgd``.
        w_all, err_all: Weights and full-data loss recorded before every
            update of the iterative solvers.
    """

    def __init__(self):
        self.t0 = 200
        self.t1 = 100000

    def predict (self, X):
        """Return the predictions ``X @ w``."""
        y = X @ self.w
        return y

    def loss (self, X, y):
        """Return half the sum of squared errors on ``(X, y)``."""
        e = y - self.predict(X)
        return 0.5 *(e.T @ e)

    def rmse(self,X, y):
        """Return the root-mean-squared error on ``(X, y)``."""
        return np.sqrt(2/X.shape[0] * self.loss(X, y))

    def fit(self, X, y):
        """Set and return the least-squares weights ``pinv(X) @ y``."""
        self.w = np.linalg.pinv(X) @ y
        return self.w

    def calculate_gradient(self, X, y):
        """Return the gradient of the loss with respect to ``w``."""
        return X.T @ (self.predict(X) - y)

    def update_weights(self, grad, lr):
        """Return the weights after one step of size ``lr`` against ``grad``."""
        return (self.w - lr * grad)

    def learning_schedule(self, t):
        """Return the learning rate for step ``t``; it decays as ``t`` grows."""
        return self.t0 / (t + self.t1)

    def gd(self, X, y, num_epochs, lr):
        """Run ``num_epochs`` of batch gradient descent from zero weights."""
        self.w = np.zeros(X.shape[1])
        self.w_all = list()
        self.err_all = list()
        for i in range(num_epochs):
            dJdw = self.calculate_gradient(X, y)
            self.w_all.append(self.w)
            self.err_all.append(self.loss(X, y))
            self.w = self.update_weights(dJdw, lr)
        return self.w

    def mbgd(self, X, y, num_epochs, batch_size):
        """Run mini-batch gradient descent with a decaying learning rate.

        The data is reshuffled every epoch. The recorded loss is computed
        on the full ``(X, y)``, not on the mini-batch.
        """
        mini_batch_id = 0
        self.w = np.zeros(X.shape[1])
        self.w_all = list()
        self.err_all = list()

        for epoch in range(num_epochs):
            shuffled_indices = np.random.permutation(X.shape[0])
            X_shuffled = X[shuffled_indices]
            y_shuffled = y[shuffled_indices]

            for i in range(0, X.shape[0], batch_size):
                mini_batch_id += 1
                x1 = X_shuffled[i:i+batch_size]
                y1 = y_shuffled[i:i+batch_size]

                self.w_all.append(self.w)
                self.err_all.append(self.loss(X, y))

                dJdw = 2/batch_size * self.calculate_gradient(x1, y1)
                self.w = self.update_weights(dJdw, self.learning_schedule(mini_batch_id))

        return self.w

    def sgd(self, X, y, num_epochs):
        """Run stochastic gradient descent: mini-batch descent with batches of one."""
        return self.mbgd(X, y, num_epochs, batch_size=1)

###Week3:

In [None]:
class LinReg():
    """Ridge-regularised linear regression.

    The weights ``self.w`` can be obtained in closed form (``fit``) or by
    batch, mini-batch or stochastic gradient descent (``gd``, ``mbgd``,
    ``sgd``). The gradient-descent methods also record the training history
    in ``self.w_all`` (weights before each update) and ``self.err_all``
    (loss before each update).
    """

    def __init__(self):
        # Constants of the decaying learning rate t0 / (t + t1).
        self.t0 = 200
        self.t1 = 100000

    def predict(self, X):
        """Return the model output ``X @ w`` for the current weights."""
        return X @ self.w

    def loss(self, X, y, reg_rate):
        """Return ``0.5 * e.T @ e + (reg_rate / 2) * w.T @ w``, ``e = y - X @ w``."""
        e = y - self.predict(X)
        return 0.5 * (e.T @ e) + (reg_rate / 2) * (np.transpose(self.w) @ self.w)

    def rmse(self, X, y, reg_rate):
        """Return ``sqrt(2 / n * loss)`` where ``n`` is the number of rows of X.

        Note that the loss includes the penalty term when ``reg_rate`` is
        non-zero.
        """
        return np.sqrt(2 / X.shape[0] * self.loss(X, y, reg_rate))

    def fit(self, X, y, reg_rate):
        """Solve ``(X.T X + reg_rate * I) w = X.T y`` and store/return ``w``."""
        eye = np.eye(X.shape[1])
        self.w = np.linalg.solve(X.T @ X + reg_rate * eye, X.T @ y)
        return self.w

    def calculate_gradient(self, X, y, reg_rate):
        """Return the gradient of ``loss`` with respect to the weights."""
        return X.T @ (self.predict(X) - y) + reg_rate * self.w

    def update_weights(self, grad, lr):
        """Return the weights after one step of size ``lr`` against ``grad``."""
        return self.w - lr * grad

    def learning_schedule(self, t):
        """Return the learning rate ``t0 / (t + t1)`` for update number ``t``."""
        # The step counter must appear in the denominator; otherwise the
        # rate is a constant and never decays.
        return self.t0 / (t + self.t1)

    def gd(self, X, y, num_epochs, lr, reg_rate):
        """Run ``num_epochs`` full-batch gradient steps from zero weights.

        Args:
            X: Feature matrix; ``X.shape[1]`` fixes the length of the weights.
            y: Targets, indexed row-for-row with ``X``.
            num_epochs: Number of gradient steps.
            lr: Constant learning rate.
            reg_rate: Regularisation rate passed to the loss and gradient.

        Returns:
            The weight vector after the last step.
        """
        self.w = np.zeros(X.shape[1])
        self.w_all = []
        self.err_all = []
        for _ in range(num_epochs):
            dJdw = self.calculate_gradient(X, y, reg_rate)
            self.w_all.append(self.w)
            self.err_all.append(self.loss(X, y, reg_rate))
            self.w = self.update_weights(dJdw, lr)
        return self.w

    def mbgd(self, X, y, num_epochs, batch_size, reg_rate=0):
        """Train with mini-batch gradient descent from zero weights.

        Each epoch reshuffles the rows and processes them in consecutive
        slices of ``batch_size`` rows, using ``learning_schedule`` for the
        step size.

        Args:
            X: Feature matrix; ``X.shape[1]`` fixes the length of the weights.
            y: Targets, indexed row-for-row with ``X``.
            num_epochs: Number of passes over the shuffled data.
            batch_size: Number of rows per mini-batch.
            reg_rate: Regularisation rate passed to the loss and gradient
                (defaults to 0, i.e. no penalty).

        Returns:
            The weight vector after the last update.
        """
        n_samples = X.shape[0]
        mini_batch_id = 0
        self.w = np.zeros(X.shape[1])
        self.w_all = []
        self.err_all = []

        for _ in range(num_epochs):
            shuffled_indices = np.random.permutation(n_samples)
            X_shuffled = X[shuffled_indices]
            y_shuffled = y[shuffled_indices]

            for start in range(0, n_samples, batch_size):
                mini_batch_id += 1
                x1 = X_shuffled[start:start + batch_size]
                y1 = y_shuffled[start:start + batch_size]

                # History is recorded before the update; the loss is taken
                # over the full data set, not just the mini-batch.
                self.w_all.append(self.w)
                self.err_all.append(self.loss(X, y, reg_rate))

                dJdw = 2 / batch_size * self.calculate_gradient(x1, y1, reg_rate)
                self.w = self.update_weights(
                    dJdw, self.learning_schedule(mini_batch_id)
                )

        return self.w

    def sgd(self, X, y, num_epochs, reg_rate=0):
        """Train with stochastic gradient descent (mini-batches of one row).

        Args:
            X: Feature matrix.
            y: Targets, indexed row-for-row with ``X``.
            num_epochs: Number of passes over the shuffled data.
            reg_rate: Regularisation rate (defaults to 0, i.e. no penalty).

        Returns:
            The weight vector after the last update.
        """
        return self.mbgd(X, y, num_epochs, batch_size=1, reg_rate=reg_rate)

### Week 4

In [None]:
def combinations(x, degree):
    """Return an iterator over all size-``degree`` multisets drawn from ``x``.

    Elements may repeat within a combination; this is a thin wrapper around
    ``itertools.combinations_with_replacement``.
    """
    multisets = itertools.combinations_with_replacement(x, degree)
    return multisets

In [None]:
def compute_new_features(items):
    """Return the running product of everything in ``items``.

    The elements are multiplied left to right with ``*``, so array elements
    are multiplied element-wise.
    """
    def _multiply(left, right):
        return left * right

    return functools.reduce(_multiply, items)

In [None]:
def polynomial_transform(x, degree):
    """Expand ``x`` into polynomial features up to the given degree.

    A 1-D input is first treated as a single column. The result has one row
    per sample; its first column is all ones, followed, for each order from
    1 to ``degree``, by the product of every combination (with repetition) of
    input columns of that order.
    """
    if x.ndim == 1:
        x = x[:, None]

    columns = x.T  # iterate over features rather than samples
    features = [np.ones(len(x))]  # constant column of ones

    for order in range(1, degree + 1):
        features.extend(
            compute_new_features(term) for term in combinations(columns, order)
        )

    return np.asarray(features).T

In [None]:
def encode(arr):
    """One-hot encode an array of non-negative integer labels.

    Returns a float array with one row per label and ``arr.max() + 1``
    columns; row ``i`` has a 1 in column ``arr[i]`` and 0 elsewhere.
    """
    n_samples = arr.size
    n_classes = arr.max() + 1
    one_hot = np.zeros((n_samples, n_classes))
    rows = np.arange(n_samples)
    one_hot[rows, arr] = 1
    return one_hot

In [None]:
def plot_learning_curve(err_all):
    """Plot one entry of each recorded error against the update index.

    Every element of ``err_all`` is indexed as ``entry[1][1]`` and the
    resulting values are drawn as a solid red line.
    """
    curve = [entry[1][1] for entry in err_all]
    steps = np.arange(len(curve))
    plt.plot(steps, curve, 'r-')

In [None]:
def preprocess(add_class=False, add_outliers=False, degree=1):
    """Generate toy data, expand it polynomially, split it and encode labels.

    Args:
        add_class: Forwarded to ``create_toy_data``.
        add_outliers: Forwarded to ``create_toy_data``.
        degree: Degree passed to ``polynomial_transform``.

    Returns:
        ``(x_train, x_test, y_train, y_test, y_train_trans, y_test_trans)``
        where the last two are the train/test labels passed through
        ``LabelTransformer().encode``.
    """
    features, targets = create_toy_data(add_outliers, add_class)
    expanded = polynomial_transform(features, degree=degree)
    x_train, x_test, y_train, y_test = train_test_split(expanded, targets)

    # A separate transformer instance is used for each label set.
    encoded_train = LabelTransformer().encode(y_train)
    encoded_test = LabelTransformer().encode(y_test)
    return x_train, x_test, y_train, y_test, encoded_train, encoded_test

In [None]:
def create_toy_data(add_outliers=False, add_class=False):
    """Draw a small random 2-D classification data set.

    Two clusters of 25 points each are always drawn: class 0 shifted by -1
    and class 1 shifted by +1. Then, in order of precedence:

    * ``add_outliers``: 5 extra points shifted by ``(5, 10)`` are appended
      and labelled as class 1.
    * ``add_class``: a third cluster of 25 points shifted by +2 is appended
      and labelled as class 2.

    Returns:
        ``(x, y)`` with ``x`` of shape ``(n, 2)`` and ``y`` an integer label
        array of length ``n``.
    """
    def _cloud(n_values, shift):
        # n_values standard-normal draws arranged as (n_values / 2) 2-D points.
        return np.random.normal(size=n_values).reshape(-1, 2) + shift

    clusters = [_cloud(50, -1), _cloud(50, 1)]
    label_values, label_counts = [0, 1], [25, 25]

    if add_outliers:
        clusters.append(_cloud(10, np.array([5., 10.])))
        label_counts = [25, 30]  # the 5 outliers share class 1
    elif add_class:
        clusters.append(_cloud(50, 2))
        label_values, label_counts = [0, 1, 2], [25, 25, 25]

    return np.concatenate(clusters), np.repeat(label_values, label_counts)

In [None]:
# Global seaborn plot settings: "notebook" context, fonts scaled by 1.5 and
# a default line width of 2.5.
sns.set_context(context='notebook',font_scale=1.5,rc={"lines.linewidth":2.5})

def visualize_model(X_train, labels, lsc, degree=1):
    """Scatter the training points over the classifier's predicted regions.

    Columns 1 and 2 of ``X_train`` are used as the two plotted features
    (column 0 is presumably the constant column added by
    ``polynomial_transform`` -- confirm against the caller). A 100 x 100 grid
    spanning the range of those two features is expanded with
    ``polynomial_transform``, classified by ``lsc.predict`` and drawn as
    filled contours behind the scatter plot.

    Args:
        X_train: Training feature matrix with at least three columns.
        labels: Per-row labels used to colour the scatter plot.
        lsc: Fitted model exposing ``predict``.
        degree: Polynomial degree applied to the grid points; it should
            match the degree used to build ``X_train``.
    """
    plt.figure(figsize=(8, 8))

    # Use the argument rather than a same-named notebook global, so the
    # function plots the data it was actually given.
    x1_min = np.min(X_train[:, 1])
    x1_max = np.max(X_train[:, 1])
    x2_min = np.min(X_train[:, 2])
    x2_max = np.max(X_train[:, 2])

    x1_test, x2_test = np.meshgrid(
        np.linspace(x1_min, x1_max, 100), np.linspace(x2_min, x2_max, 100)
    )
    # Flatten the grid into a (10000, 2) list of points for prediction.
    x_test = np.array([x1_test, x2_test]).reshape(2, -1).T
    x_test_poly = polynomial_transform(x_test, degree=degree)
    y_test = lsc.predict(x_test_poly)

    sns.scatterplot(data=X_train, x=X_train[:, 1], y=X_train[:, 2], hue=labels)
    plt.contourf(x1_test, x2_test, y_test.reshape(100, 100), alpha=0.5, levels=np.linspace(0, 1, 3))
    plt.gca().set_aspect('equal', adjustable='box')

In [None]:
class LeastSquareClassification(object):
    """Classifier that fits a linear map to label rows by least squares.

    The weight matrix ``self.w`` has one row per feature and one column per
    column of ``y``; the predicted class is the column with the largest
    output. Training is available in closed form (``fit``) and by batch or
    stochastic gradient descent (``gd``, ``sgd``).
    """

    def __init__(self):
        # Constants of the decaying learning rate t0 / (t + t1) used by sgd.
        self.t0 = 20
        self.t1 = 1000

    def predict(self, X):
        """Return, for each row of ``X``, the index of the largest output."""
        scores = X @ self.w
        return np.argmax(scores, axis=-1)

    def predict_internal(self, X):
        """Return the raw linear outputs ``X @ w``."""
        return X @ self.w

    def loss(self, X, y, reg_rate):
        """Return ``0.5 * err.T @ err + (reg_rate / 2) * w.T @ w``.

        ``err`` is the difference between the raw outputs and ``y``; the
        result is a matrix when ``y`` has several columns.
        """
        residual = self.predict_internal(X) - y
        data_term = (1 / 2) * (residual.T @ residual)
        penalty = (reg_rate / 2) * (self.w.T @ self.w)
        return data_term + penalty

    def fit(self, X, y, reg_rate=0):
        """Solve ``(X.T X + reg_rate * I) w = X.T y``; store and return ``w``.

        Also prints the shapes of ``X``, ``y`` and the fitted weights.
        """
        n_features = X.shape[-1]
        gram = X.T @ X + reg_rate * np.eye(n_features)
        self.w = np.linalg.solve(gram, X.T @ y)
        print(X.shape, y.shape, self.w.shape)
        return self.w

    def calculate_gradient(self, X, y, reg_rate):
        """Return ``X.T @ (X @ w - y) + reg_rate * w``."""
        residual = self.predict_internal(X) - y
        return X.T @ residual + reg_rate * self.w

    def weight_updates(self, grad, lr):
        """Return the weights after one step of size ``lr`` against ``grad``."""
        return self.w - lr * grad

    def learning_schedule(self, t):
        """Return the learning rate ``t0 / (t + t1)`` for update number ``t``."""
        return self.t0 / (t + self.t1)

    def gd(self, X, y, num_epochs, lr, reg_rate):
        """Run ``num_epochs`` full-batch gradient steps from zero weights.

        The weights and loss before every step are appended to
        ``self.w_all`` and ``self.err_all``. Returns the final weights.
        """
        self.w = np.zeros((X.shape[-1], y.shape[-1]))
        self.w_all, self.err_all = [], []
        for _ in np.arange(num_epochs):
            grad = self.calculate_gradient(X, y, reg_rate)
            self.w_all.append(self.w)
            self.err_all.append(self.loss(X, y, reg_rate))
            self.w = self.weight_updates(grad, lr)
        return self.w

    def sgd(self, X, y, num_epochs, reg_rate):
        """Run stochastic gradient descent from zero weights.

        Performs ``num_epochs * X.shape[0]`` updates, each on one row drawn
        uniformly at random (with replacement), using ``learning_schedule``
        for the step size. The weights and the loss on the drawn row are
        recorded before every update. Returns the final weights.
        """
        n_samples = X.shape[0]
        self.err_all, self.w_all = [], []
        self.w = np.zeros((X.shape[-1], y.shape[-1]))
        for t in range(1, num_epochs * n_samples + 1):
            pick = np.random.randint(n_samples)
            # Slicing keeps the 2-D shape of the single sample.
            x_row = X[pick:pick + 1]
            y_row = y[pick:pick + 1]

            self.w_all.append(self.w)
            self.err_all.append(self.loss(x_row, y_row, reg_rate))

            grad = self.calculate_gradient(x_row, y_row, reg_rate)
            self.w = self.weight_updates(grad, self.learning_schedule(t))
        return self.w

In [None]:
# Predicted labels for the held-out samples.
y_test_hat = lsc.predict(x_test)

# Confusion-matrix counts for the binary case (labels 0 and 1): each count is
# the number of positions where the true/predicted pair matches the pattern.
tp = np.sum((y_test == 1) & (y_test_hat == 1))
tn = np.sum((y_test == 0) & (y_test_hat == 0))
fp = np.sum((y_test == 0) & (y_test_hat == 1))
fn = np.sum((y_test == 1) & (y_test_hat == 0))

def precision(tp, fp):
    """Return the precision ``tp / (tp + fp)``.

    Args:
        tp: Number of true positives.
        fp: Number of false positives.

    Returns:
        The fraction of predicted positives that are correct, or NaN when
        nothing was predicted positive (``tp + fp == 0``).
    """
    predicted_positive = tp + fp
    if predicted_positive == 0:
        # float("nan") needs no import, unlike the bare name ``NaN``.
        return float("nan")
    return tp / predicted_positive
def recall(tp, fn):
    """Return the recall ``tp / (tp + fn)``.

    Args:
        tp: Number of true positives.
        fn: Number of false negatives.

    Returns:
        The fraction of actual positives that were found, or NaN when there
        are no actual positives (``tp + fn == 0``).
    """
    actual_positive = tp + fn
    if actual_positive == 0:
        return float("nan")
    # The whole sum is the denominator; ``tp / tp + fn`` would be ``1 + fn``.
    return tp / actual_positive
def f1_score(pr, re):
    """Return the F1 score, the harmonic mean of precision and recall.

    Args:
        pr: Precision.
        re: Recall.

    Returns:
        ``2 * pr * re / (pr + re)``, or NaN when ``pr + re == 0`` so that the
        undefined case is reported the same way as in ``precision`` and
        ``recall`` instead of raising ZeroDivisionError.
    """
    if (pr + re) == 0:
        return float("nan")
    return 2 * ((pr * re) / (pr + re))
def accuracy(tp, fp, fn, tn):
    """Return the fraction of all predictions that are correct.

    Computed as ``(tp + tn) / (tp + tn + fp + fn)`` from the four
    confusion-matrix counts.
    """
    correct = tp + tn
    total = correct + fp + fn
    return correct / total
