# ML Coding Problems 
- Author: Alireza Dirafzoon
- Work in progress; Contributions are welcome :) 

In [1]:
import numpy as np
import random, collections

from matplotlib import pyplot as plt 
%matplotlib inline 

## Kmeans Clustering

In [8]:
def _clsuter(X,mus):
    """Assign every point in ``X`` to its nearest center (Euclidean distance).

    NOTE(review): the name looks like a misspelling of ``_cluster``; it is
    kept unchanged so any existing reference to it keeps working.

    Args:
        X: iterable of points (for example the rows of a 2-D array).
        mus: sequence of centers; each must support ``x - mu`` with a point.

    Returns:
        A ``collections.defaultdict(list)`` mapping the index of a center in
        ``mus`` to the list of points closest to it. A center that attracts
        no point gets no key.

    Raises:
        ValueError: if ``mus`` is empty (raised by ``min``).
    """
    clusters = collections.defaultdict(list)
    for x in X:
        # Distance is computed with np.linalg.norm; ties go to the lowest
        # center index because min() keeps the first minimal element.
        mu_idx, _ = min(
            ((i, np.linalg.norm(x - mu)) for i, mu in enumerate(mus)),
            key=lambda pair: pair[1],
        )
        clusters[mu_idx].append(x)
    return clusters
        
def recalc_centers(clusters):
    """Compute the centroid of every cluster.

    Args:
        clusters: mapping from cluster index to a list of points.

    Returns:
        A list with one mean point (``np.mean(..., axis=0)``) per key of
        ``clusters``, ordered by ascending key.
    """
    # The original body read from a misspelled name and raised NameError.
    return [np.mean(clusters[k], axis=0) for k in sorted(clusters.keys())]
    

def kmeans(X, k):
    """Run k-means on ``X`` for at most 10 update rounds.

    Args:
        X: 2-D array of points, one per row.
        k: number of initial centers, drawn from the rows of ``X`` without
            replacement.

    Returns:
        Tuple ``(centers, clusters)`` from the last update round.
    """
    seed_rows = np.random.choice(X.shape[0], k, False)
    centers = X[seed_rows]
    # Shifted copy so the very first convergence check compares different values.
    previous = centers + 2

    for _ in range(10):
        if has_converged(centers, previous):
            break
        previous = centers
        clusters = _cluster(X, centers)
        centers = recalc_centers(clusters)
    return (centers, clusters)
    

def _cluster(X, mus):
    """Group the points of ``X`` by their nearest center.

    Args:
        X: iterable of points (for example the rows of a 2-D array).
        mus: sequence of centers; each must support ``x - mu`` with a point.

    Returns:
        A ``collections.defaultdict(list)`` keyed by center index, holding
        the points whose Euclidean distance to that center is smallest.
        Ties go to the lowest center index.
    """
    assignment = collections.defaultdict(list)
    for point in X:
        nearest, _ = min(
            ((idx, np.linalg.norm(point - center)) for idx, center in enumerate(mus)),
            key=lambda pair: pair[1],
        )
        assignment[nearest].append(point)
    return assignment

def _calc_centers(mu, clusters):
    """Recompute one center per cluster, keeping empty clusters in place.

    Args:
        mu: previous centers, indexable by cluster index. May be ``None``,
            in which case only the clusters present in ``clusters`` are
            averaged (ordered by ascending key).
        clusters: mapping from cluster index to a list of points.

    Returns:
        A list of centers. When ``mu`` is given the list has ``len(mu)``
        entries: entry ``i`` is the mean of ``clusters[i]``, or the previous
        center ``mu[i]`` if that cluster has no points. Keys of ``clusters``
        outside ``range(len(mu))`` are ignored.
    """
    if mu is None:
        return [np.mean(clusters[k], axis=0) for k in sorted(clusters.keys())]

    centers = []
    for k in range(len(mu)):
        # .get() does not insert a missing key into a defaultdict.
        members = clusters.get(k)
        if members is not None and len(members) > 0:
            centers.append(np.mean(members, axis=0))
        else:
            # Empty cluster: reuse its previous center so the number of
            # centers and their index alignment with cluster keys are stable.
            centers.append(np.asarray(mu[k]))
    return centers

def has_converged(mu, oldmu):
    """Tell whether the centers are unchanged between two iterations.

    Args:
        mu: current centers (list of arrays or 2-D array).
        oldmu: centers from the previous iteration, same layout.

    Returns:
        ``True`` only if both hold the same number of centers and every
        pair of centers at the same index is element-wise equal.
    """
    if len(mu) != len(oldmu):
        return False
    # Exact, index-wise comparison of corresponding centers.
    return all(np.array_equal(new, old) for new, old in zip(mu, oldmu))


def kmeans_fit(X, K):
    """Fit k-means centers to ``X``.

    Args:
        X: 2-D array of points, one per row.
        K: number of centers; the initial centers are ``K`` rows of ``X``
            drawn at random without replacement.

    Returns:
        Tuple ``(mus, clusters)``: the centers returned by the last call to
        ``_calc_centers`` and the cluster assignment they were computed from.
    """
    # Use the K parameter here; the original read a global named ``k``.
    mus = X[np.random.choice(X.shape[0], K, False)]
    # Shifted copy so the first convergence check compares different values.
    pre_mus = mus + 2
    # ``it`` starts at 1, so the loop performs at most max_it - 1 updates.
    max_it, it = 10, 1

    while it < max_it and not has_converged(mus, pre_mus):
        pre_mus = mus
        # Assign all points in X to clusters
        clusters = _cluster(X, mus)
        # Reevaluate centers
        mus = _calc_centers(mus, clusters)
        it += 1

    return (mus, clusters)

def kmeans_predict():
    """Placeholder for cluster prediction; not implemented yet (TODO)."""
    return None

# Two 2-D blobs of five standard-normal points each, shifted by +5 and -5.
x1 = 5 + np.random.randn(5, 2)
x2 = np.random.randn(5, 2) - 5
X = np.vstack((x1, x2))
k = 2
mus, clusters = kmeans_fit(X, k)
print(mus)
print(clusters)

[array([4.75401264, 4.99535506]), array([-4.93248421, -4.73085676])]
defaultdict(<class 'list'>, {0: [array([3.90785045, 6.55510632]), array([5.8373926, 2.8878767]), array([4.73828244, 4.41468663]), array([5.02741639, 6.27032238]), array([4.25912132, 4.84878326])], 1: [array([-5.74522008, -5.78916022]), array([-5.16147393, -5.03389587]), array([-3.87095411, -3.12125983]), array([-5.4299055 , -5.75957079]), array([-4.4548674 , -3.95039709])]})


Follow ups: 
- computational complexity: O(it * k * n * d), where it = iterations, k = clusters, n = points, d = dimensions
- improve space: use index instead of copy 
- improve time: 
    - dim reduction 
    - subsample (cons?)
- mini-batch
- k-median 
https://mmuratarat.github.io/2019-07-23/kmeans_from_scratch

## Linear Regression 

In [3]:
# https://en.wikipedia.org/wiki/Simple_linear_regression

In [9]:
class LinearRegressor:
    """Simple (single-feature) least-squares regression ``y = m * x + b``.

    Inputs may be NumPy arrays or plain Python sequences of numbers.
    """

    def __init__(self):
        # Slope and intercept; both stay 0 until fit() is called.
        self.m = 0
        self.b = 0

    def _mean(self, X):
        """Return the arithmetic mean of ``X``."""
        return np.mean(X)

    def SS(self, X, Y):
        """Return the sum of squared differences between ``X`` and ``Y``.

        ``Y`` may be a scalar, in which case it is broadcast against ``X``.
        """
        diff = np.asarray(X, dtype=float) - np.asarray(Y, dtype=float)
        return np.sum(diff ** 2)

    def fit(self, X, Y):
        """Estimate slope ``m`` and intercept ``b`` by ordinary least squares.

        Uses ``m = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x))**2)``
        and ``b = mean(y) - m * mean(x)``.

        Args:
            X: 1-D sequence of inputs.
            Y: 1-D sequence of targets, same length as ``X``.

        Raises:
            ValueError: if ``X`` has no spread (empty, single value, or all
                values equal), since the slope is undefined in that case.
        """
        X = np.asarray(X, dtype=float)
        Y = np.asarray(Y, dtype=float)
        x_mean = np.mean(X) if X.size else 0.0
        x_centered = X - x_mean
        sxx = np.sum(x_centered ** 2)
        if sxx == 0:
            raise ValueError("X must contain at least two distinct values to fit a line")
        y_mean = np.mean(Y)
        self.m = np.sum(x_centered * (Y - y_mean)) / sxx
        self.b = y_mean - self.m * x_mean

    def coef(self, Y, Y_hat):
        """Return the coefficient of determination ``1 - SS_res / SS_tot``.

        Args:
            Y: observed targets.
            Y_hat: predicted targets.
        """
        return 1 - self.SS(Y, Y_hat) / self.SS(Y, np.mean(Y))

    def predict(self, X):
        """Return ``m * X + b`` for the current slope and intercept."""
        return self.m * np.asarray(X) + self.b
    
# Ten evenly spaced inputs on [0, 10]; targets follow y = 3x - 2 plus
# Gaussian noise scaled by 0.1.
X = np.linspace(0, 10, num=10)
m, b = 3, -2
Y = m * X + b + 0.1 * np.random.randn(len(X))

# Fit, predict on the training inputs, and score the fit.
lr = LinearRegressor()
lr.fit(X, Y)
Y_hat = lr.predict(X)
R2 = lr.coef(Y, Y_hat)

print(lr.m, lr.b)
print(R2)

2.9998703130892173 -1.9873208939401277
0.9999577341447001


## Logistic Regression 

In [10]:
# Toy data: five standard-normal 2-D points shifted by +5 (label 1) and
# five shifted by -5 (label -1); labels are stored as int16.
x1 = 5 + np.random.randn(5, 2)
x2 = np.random.randn(5, 2) - 5
X = np.vstack((x1, x2))
y = np.concatenate((np.ones(5), -np.ones(5))).astype(np.int16)


def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``; element-wise for arrays."""
    denominator = 1 + np.exp(-z)
    return 1.0 / denominator

def cost_fcn(x,theta,y):
    """Mean binary cross-entropy of the logistic model ``sigmoid(x @ theta)``.

    Args:
        x: 2-D design matrix, one sample per row.
        theta: weight vector with one entry per column of ``x``.
        y: array of targets, one per row of ``x``; the formula is the
            cross-entropy for labels in {0, 1}.

    Returns:
        The cost averaged over the samples (rows of ``x``).
    """
    # Average over the number of samples. The original divided by a global
    # ``m`` that the regression demo had set to the slope 3.
    n_samples = np.shape(x)[0]
    h = sigmoid(np.dot(x, theta))
    J = (1.0 / n_samples) * np.sum(-y * np.log(h) - (1.0 - y) * np.log(1.0 - h))
    return J

def gradients(x, theta, y):
    """Gradient of the mean cross-entropy cost with respect to ``theta``.

    Args:
        x: 2-D design matrix, one sample per row.
        theta: weight vector with one entry per column of ``x``.
        y: array of targets, one per row of ``x``.

    Returns:
        ``x.T @ (sigmoid(x @ theta) - y)`` divided by the number of samples.
    """
    # Average over the number of samples. The original divided by a global
    # ``m`` that the regression demo had set to the slope 3.
    n_samples = np.shape(x)[0]
    h = sigmoid(np.dot(x, theta))
    return (1.0 / n_samples) * np.dot(x.T, (h - y))

def logistic_regression(X,y):
    """Train a logistic model by gradient descent and print its predictions.

    Runs 10 gradient-descent steps with learning rate 0.1 from a random
    initial weight vector (no intercept term), then prints the predicted
    class (1.0 or 0.0) for every row of ``X``.

    Args:
        X: 2-D design matrix, one sample per row.
        y: array of targets, one per row of ``X``.

    Returns:
        None; the predictions are printed.
    """
    max_it = 10
    alpha = 0.1
    # Cost per iteration; not returned, retained for inspecting convergence.
    cost = []
    theta = np.random.rand(X.shape[1])
    for it in range(max_it):
        cost.append(cost_fcn(X, theta, y))
        grads = gradients(X, theta, y)
        theta = theta - alpha * grads
    # Threshold the predicted probability, not the raw linear score:
    # sigmoid(z) >= 0.5 corresponds to z >= 0, whereas the original compared
    # the un-squashed score z itself against 0.5.
    prob = sigmoid(np.dot(X, theta))
    pred = (prob >= 0.5).astype(float)
    print(pred)
    
    
# Toy data: five standard-normal 2-D points shifted by +5 (label 1) and
# five shifted by -5 (label 0), then train and print predictions.
x1 = 5 + np.random.randn(5, 2)
x2 = np.random.randn(5, 2) - 5
X = np.vstack((x1, x2))
y = np.concatenate((np.ones(5), -np.zeros(5)))
logistic_regression(X, y)

[1. 1. 1. 1. 1. 0. 0. 0. 0. 0.]


## Decision Trees 

In [11]:
# https://github.com/random-forests/tutorials/blob/master/decision_tree.py

## KNN 

In [12]:
# https://github.com/madhug-nadig/Machine-Learning-Algorithms-from-Scratch/blob/master/K%20Nearest%20Neighbours.py

## FF NN 

In [13]:
# https://github.com/alirezadir/deep-learning/blob/master/first-neural-network/my_answers.py

## SVM 

In [14]:
# https://towardsdatascience.com/support-vector-machine-introduction-to-machine-learning-algorithms-934a444fca47

## Sampling multinomial distribution 

## Stratified sampling

In [16]:
#https://towardsdatascience.com/the-5-sampling-algorithms-every-data-scientist-need-to-know-43c7bc11d17c

## Random int in range 

In [17]:
# https://leetcode.com/discuss/interview-question/125347/generate-uniform-random-integer
# https://leetcode.com/articles/implement-rand10-using-rand7/

## Triangle closing 


## Meeting point + follow ups