This notebook is based on Andrew Ng's Machine Learning course on Coursera.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight') #certain display format and you can ignore this line of code
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

# Logistic Regression

## Data preparation

In [None]:
# Read the first exercise dataset; column names are supplied explicitly via `names`.
data = pd.read_csv('ex2data1.txt', names=['exam1', 'exam2', 'admitted'])
data.head()

In [None]:
# Summary statistics for every column of `data`.
data.describe()

In [None]:
# Notebook-wide seaborn look with a two-colour palette (one colour per class).
sns.set(context="notebook", style="darkgrid", palette=sns.color_palette("RdBu", 2))

# Scatter of the two exam scores, coloured by the `admitted` label.
# x / y are passed by keyword: recent seaborn releases accept only `data`
# positionally, so the positional form fails there.
sns.lmplot(data=data, x='exam1', y='exam2', hue='admitted',
           height=6,
           fit_reg=False,  # draw the points only, no regression line
           scatter_kws={"s": 50}
          )
plt.show() #let's see how these datapoints distribute

In [None]:
def get_X(df): #Feature matrix
    """Build the design matrix: a leading column of ones followed by the features.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the target in the last column.

    Returns
    -------
    numpy.ndarray
        Array with ``len(df)`` rows; column 0 is all ones (the x0 / intercept
        term) and the remaining columns are every column of ``df`` except the
        last one.
    """
    # Stack positionally instead of concatenating DataFrames: pd.concat aligns
    # on the index, which produces extra NaN-filled rows whenever `df` does not
    # carry a default 0..m-1 index (for example after filtering rows).
    features = df.iloc[:, :-1].to_numpy()
    ones = np.ones((len(df), 1))
    return np.hstack([ones, features])


def get_y(df): #Target variable -> last column
    """Return the last column of ``df`` as a NumPy array."""
    target_column = df.iloc[:, -1]
    return np.array(target_column)


def normalize_feature(df): #Feature normalization
    """Standardise each column: subtract its mean and divide by its standard deviation.

    The statistics come from pandas' ``Series.mean`` / ``Series.std`` and are
    computed independently for every column of ``df``.
    """
    def _standardise(column):
        centred = column - column.mean()
        return centred / column.std()

    return df.apply(_standardise)

In [None]:
# Design matrix: intercept column plus every column of `data` except the last.
X = get_X(data)
print(X.shape)

# Target vector: the last column of `data`.
y = get_y(data)
print(y.shape)

## sigmoid function
The most common link function for logistic regression is the sigmoid function, which can be expressed as:
$$g(z)=\frac{1}{1+{{e}^{-z}}}$$ 
Therefore, the logistic regression model can be expressed as:
$${{h}_{\theta}}(x)=\frac{1}{1+{{e}^{-{{\theta}^{T}}x}}}$$ 


In [None]:
def sigmoid(z):
    """Logistic sigmoid g(z) = 1 / (1 + e^(-z)), applied element-wise.

    Parameters
    ----------
    z : scalar or array-like of numbers
        Input value(s); converted to a float array before evaluation.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        g(z) with the same shape as ``z``.
    """
    # The template returned `gz` without ever assigning it (NameError).
    gz = 1.0 / (1.0 + np.exp(-np.asarray(z, dtype=float)))
    return gz

Let's plot and see the sigmoid function

In [None]:
# Evaluate the sigmoid once on a fine grid over [-10, 10) and draw the curve.
z_grid = np.arange(-10, 10, step=0.01)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(z_grid, sigmoid(z_grid))
ax.set_ylim((-0.1, 1.1))
ax.set_xlabel('z', fontsize=18)
ax.set_ylabel('g(z)', fontsize=18)
ax.set_title('sigmoid function', fontsize=18)
plt.show()

## cost function
Here we choose the commonly used binary cross entropy as the cost function of the logistic regression:

$$J(\theta)=-\frac{1}{m}\sum_{i=1}^{m}{[{{y}^{(i)}}\log({{h}_{\theta}}({{x}^{(i)}}))+(1-{{y}^{(i)}})\log(1-{{h}_{\theta}}({{x}^{(i)}}))]}$$
$$J(\theta)=\frac{1}{m}\sum_{i=1}^{m}{[-{{y}^{(i)}}\log({{h}_{\theta}}({{x}^{(i)}}))-(1-{{y}^{(i)}})\log(1-{{h}_{\theta}}({{x}^{(i)}}))]}$$


In [None]:
# Starting point for the optimiser: one zero per column of X (intercept + two exam scores).
theta = np.zeros(3) # X(m*n) so theta is n*1
theta

In [None]:
def cost(theta, X, y):
    """Binary cross-entropy cost J(theta) of logistic regression.

    Parameters
    ----------
    theta : array-like, shape (n,)
        Model parameters.
    X : numpy.ndarray, shape (m, n)
        Design matrix (including the intercept column).
    y : numpy.ndarray, shape (m,)
        Binary labels (0 or 1).

    Returns
    -------
    float
        Mean of -y*log(h) - (1-y)*log(1-h) over the m samples, where
        h = 1 / (1 + exp(-X @ theta)).
    """
    # Hypothesis h_theta(x) = g(X @ theta), with g the logistic sigmoid.
    h = 1.0 / (1.0 + np.exp(-(X @ theta)))

    # Keep h strictly inside (0, 1): a saturated prediction would otherwise give
    # log(0) = -inf and turn the cost into inf/nan during optimisation.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1.0 - eps)

    # The template returned `costf` without ever assigning it (NameError).
    costf = np.mean(-y * np.log(h) - (1 - y) * np.log(1 - h))
    return costf

In [None]:
# Cost evaluated at the all-zeros starting theta.
cost(theta, X, y)

The output should be 0.6931471805599453

## batch gradient descent 
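$$\frac{\partial J(\theta)}{\partial{\theta_{j}}} = \frac{1}{m}\sum_{i=1}^{m}{({{h}_{\theta}}({{x}^{(i)}})-{{y}^{(i)}})x_{j}^{(i)}}$$
In vectorized form, with $g$ the sigmoid function:
$$\nabla_{\theta} J(\theta) = \frac{1}{m} X^T( g(X\theta) - y )$$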


In [None]:
def gradient(theta, X, y):
    """Gradient of the logistic-regression cost with respect to theta.

    Parameters
    ----------
    theta : array-like, shape (n,)
        Model parameters.
    X : numpy.ndarray, shape (m, n)
        Design matrix (including the intercept column).
    y : numpy.ndarray, shape (m,)
        Binary labels (0 or 1).

    Returns
    -------
    numpy.ndarray, shape (n,)
        (1/m) * X^T (g(X @ theta) - y), with g the logistic sigmoid.
    """
    # Hypothesis h_theta(x) = g(X @ theta), with g the logistic sigmoid.
    h = 1.0 / (1.0 + np.exp(-(X @ theta)))

    # The template returned `grad` without ever assigning it (NameError).
    grad = X.T @ (h - y) / len(X)
    return grad

In [None]:
# Gradient evaluated at the all-zeros starting theta.
gradient(theta, X, y)

## Optimization
> * Here I use the [`scipy.optimize.minimize`](http://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.minimize.html#scipy.optimize.minimize) to find the parameters


In [None]:
import scipy.optimize as opt

In [None]:
# Minimise `cost` starting from `theta`, passing `gradient` as the analytic Jacobian.
res = opt.minimize(fun=cost, x0=theta, args=(X, y), method='Newton-CG', jac=gradient)

In [None]:
# Show the object returned by the optimiser.
print(res)

## Prediction and evaluation on training set

In [2]:
def predict(x, theta):
    """Predict binary class labels with a fitted logistic-regression model.

    Parameters
    ----------
    x : numpy.ndarray, shape (m, n)
        Design matrix (including the intercept column).
    theta : array-like, shape (n,)
        Fitted parameters.

    Returns
    -------
    numpy.ndarray of int, shape (m,)
        1 where the predicted probability g(x @ theta) is at least 0.5, else 0.
    """
    # Predicted probability of the positive class.
    prob = 1.0 / (1.0 + np.exp(-(x @ theta)))

    # The template returned `y_pred` without ever assigning it (NameError).
    y_pred = (prob >= 0.5).astype(int)
    return y_pred

In [None]:
# Predict on the training matrix with the optimised parameters and report
# per-class precision / recall against the training labels.
final_theta = res.x #res.x is the theta after optimization
y_pred = predict(X, final_theta)

print(classification_report(y, y_pred))

## Find the decision boundary
http://stats.stackexchange.com/questions/93569/why-is-logistic-regression-a-linear-classifier
> $X \times \theta = 0$  (this is the line)

In [None]:
print(res.x) # optimised parameter vector (final theta)

In [None]:
# Rearrange theta0 + theta1*exam1 + theta2*exam2 = 0 into
# exam2 = coef[0] + coef[1]*exam1 by dividing through by -theta2.
coef = -(res.x / res.x[2]) 
print(coef)

# NOTE(review): `x` and `y` are notebook globals; this assignment replaces the
# label vector previously stored in `y` with the ordinates of the boundary line.
x = np.arange(130, step=0.1)
y = coef[0] + coef[1]*x #decision boundary

In [None]:
data.describe()  # min/max of the exam columns, used to choose the plot limits

In [None]:
sns.set(context="notebook", style="ticks", font_scale=1.5)

# `height` is the current name of the figure-size argument (the older `size`
# spelling is rejected by newer seaborn), and x / y are passed by keyword
# because recent seaborn releases accept only `data` positionally.
sns.lmplot(data=data, x='exam1', y='exam2', hue='admitted',
           height=6,
           fit_reg=False,  # draw the points only, no regression line
           scatter_kws={"s": 25}
          )

# Overlay the linear decision boundary stored in the notebook globals `x`, `y`.
plt.plot(x, y, 'grey')
plt.xlim(0, 130)
plt.ylim(0, 130)
plt.title('Decision Boundary')
plt.show()

#  regularized logistic regression

In [None]:
# Read the second exercise dataset; column names are supplied explicitly via `names`.
df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
df.head()

In [None]:
sns.set(context="notebook", style="ticks", font_scale=1.5)

# Scatter of the two test scores, coloured by the `accepted` label.
# `height` is the current name of the figure-size argument (the older `size`
# spelling is rejected by newer seaborn), and x / y are passed by keyword
# because recent seaborn releases accept only `data` positionally.
sns.lmplot(data=df, x='test1', y='test2', hue='accepted',
           height=6,
           fit_reg=False,  # draw the points only, no regression line
           scatter_kws={"s": 50}
          )

plt.title('Regularized Logistic Regression')
plt.show()

## feature mapping

polynomial expansion

```
for i in 0..power
  for p in 0..i:
    output x^(i-p) * y^p
```
$$mapFeature(x) =[1,x_{1},x_{2},x_{1}^{2},x_{1}x_{2},x_{2}^{2},x_{1}^{3},...,x_{1}x_{2}^{5},x_{2}^{6}]$$

In [None]:
def feature_mapping(x, y, power, as_ndarray=False):
    """Expand two features into all monomials x^a * y^b with a + b <= power.

    Columns are generated degree by degree (total degree 0, 1, ..., power) and,
    within a degree, with the exponent of ``y`` increasing. Each column is
    named ``"f{a}{b}"`` (the two exponents written back to back, without a
    separator).

    Parameters
    ----------
    x, y : array-like
        The two input features, of equal length.
    power : int
        Highest total degree to generate.
    as_ndarray : bool, default False
        Return a NumPy array instead of a DataFrame.

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        The mapped features, one column per monomial.
    """
    columns = {}
    for degree in np.arange(power + 1):
        for y_exp in np.arange(degree + 1):
            x_exp = degree - y_exp
            name = "f{}{}".format(x_exp, y_exp)
            columns[name] = np.power(x, x_exp) * np.power(y, y_exp)

    mapped = pd.DataFrame(columns)
    return mapped.to_numpy() if as_ndarray else mapped


In [None]:
# Pull the two test-score columns out of `df` as NumPy arrays.
x1 = np.array(df.test1)
x2 = np.array(df.test2)

In [None]:
# Polynomial features up to total degree 6, kept as a DataFrame for inspection.
# NOTE(review): this rebinds the notebook global `data`, which earlier cells
# used for the ex2data1 frame.
data = feature_mapping(x1, x2, power=6)
print(data.shape)
data.head()

In [None]:
# Summary statistics for every mapped feature column.
data.describe()

## regularized cost
$$J(\theta)=\frac{1}{m}\sum_{i=1}^{m}{[-{{y}^{(i)}}\log({{h}_{\theta}}({{x}^{(i)}}))-(1-{{y}^{(i)}})\log(1-{{h}_{\theta}}({{x}^{(i)}}))]}+\frac{\lambda}{2m}\sum_{j=1}^{n}{\theta_{j}^{2}}$$

In [None]:
# One parameter per mapped feature column, all starting at zero.
theta = np.zeros(data.shape[1])
# Same degree-6 mapping as `data`, returned as a plain ndarray.
X = feature_mapping(x1, x2, power=6, as_ndarray=True)
print(X.shape)

# Labels: the last column of `df`.
y = get_y(df)
print(y.shape)

In [None]:
def regularized_cost(theta, X, y, l=1):
    """Cross-entropy cost of logistic regression plus an L2 penalty.

    Parameters
    ----------
    theta : array-like, shape (n,)
        Model parameters; ``theta[0]`` is the intercept.
    X : numpy.ndarray, shape (m, n)
        Design matrix (including the intercept column).
    y : numpy.ndarray, shape (m,)
        Binary labels (0 or 1).
    l : float, default 1
        Regularisation strength lambda.

    Returns
    -------
    float
        Mean cross-entropy plus ``l / (2m) * sum(theta[1:] ** 2)``; the
        intercept ``theta[0]`` is not penalised.
    """
    theta = np.asarray(theta, dtype=float)
    m = len(X)

    # Hypothesis h_theta(x) = g(X @ theta), with g the logistic sigmoid.
    h = 1.0 / (1.0 + np.exp(-(X @ theta)))

    # Keep h strictly inside (0, 1) so a saturated prediction cannot give log(0).
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1.0 - eps)
    data_term = np.mean(-y * np.log(h) - (1 - y) * np.log(1 - h))

    # The sum in the penalty starts at j = 1, i.e. it skips the intercept.
    penalty = (l / (2 * m)) * np.sum(np.square(theta[1:]))

    # The template returned `regu_cost` without ever assigning it (NameError).
    regu_cost = data_term + penalty
    return regu_cost


In [None]:
# Regularised cost at the all-zeros theta (the penalty term is zero there).
regularized_cost(theta, X, y, l=1)

Because all thetas are zero, this value should also be 0.6931471805599461

## regularized gradient
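$$\frac{\partial J(\theta)}{\partial{{\theta}_{j}}}=(\frac{1}{m}\sum_{i=1}^{m}{({{h}_{\theta}}({{x}^{(i)}})-{{y}^{(i)}})x_{j}^{(i)}})+\frac{\lambda}{m}{{\theta}_{j}}\text{ }\text{             for  j}\ge \text{1}$$
For $j = 0$ the regularization term is omitted, because the intercept $\theta_{0}$ is not penalized.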

In [None]:
def regularized_gradient(theta, X, y, l=1):
    """Gradient of the L2-regularised logistic-regression cost.

    Parameters
    ----------
    theta : array-like, shape (n,)
        Model parameters; ``theta[0]`` is the intercept.
    X : numpy.ndarray, shape (m, n)
        Design matrix (including the intercept column).
    y : numpy.ndarray, shape (m,)
        Binary labels (0 or 1).
    l : float, default 1
        Regularisation strength lambda.

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``(1/m) * X^T (g(X @ theta) - y)`` plus ``(l/m) * theta`` for every
        component except the intercept, whose penalty gradient is zero.
    """
    theta = np.asarray(theta, dtype=float)
    m = len(X)

    # Unregularised part: (1/m) * X^T (h - y), with h the sigmoid of X @ theta.
    h = 1.0 / (1.0 + np.exp(-(X @ theta)))
    unregularized = X.T @ (h - y) / m

    # The template used `regularized_term` without ever assigning it (NameError).
    # The product creates a new array, so zeroing its first entry (the
    # intercept is not penalised) does not modify the caller's theta.
    regularized_term = (l / m) * theta
    regularized_term[0] = 0.0

    return unregularized + regularized_term

In [None]:
# Regularised gradient at the all-zeros theta, with the default l=1.
regularized_gradient(theta, X, y)

## Optimization

In [None]:
import scipy.optimize as opt

In [None]:
# Cost at the starting theta, using the default l=1.
print('init cost = {}'.format(regularized_cost(theta, X, y)))

# Fit with the regularised objective; args=(X, y) leaves l at its default of 1.
res = opt.minimize(fun=regularized_cost, x0=theta, args=(X, y), method='Newton-CG', jac=regularized_gradient)
res

## Prediction

In [None]:
# Predict on the mapped training features with the optimised parameters and
# report per-class precision / recall against the training labels.
final_theta = res.x
y_pred = predict(X, final_theta)

print(classification_report(y, y_pred))

## Use different $\lambda$ 
## Plot the decision boundary
* $X\times \theta = 0$
* instead of solving polynomial equation, just create a x,y grid that is dense enough, and find all those $X\times \theta$ that are close enough to 0, then plot them

In [None]:
def draw_boundary(power, l):
    """Fit the regularised model and plot its decision boundary over the data.

    Parameters
    ----------
    power : int
        Polynomial power for the mapped features.
    l : float
        Lambda constant (regularisation strength).
    """
    density = 1000          # grid points per axis when searching for the boundary
    threshhold = 2 * 10**-3  # |X @ theta| below this counts as "on the boundary"

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshhold)

    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    # `height` is the current name of the figure-size argument (the older
    # `size` spelling is rejected by newer seaborn), and x / y are passed by
    # keyword because recent seaborn releases accept only `data` positionally.
    sns.lmplot(data=df, x='test1', y='test2', hue='accepted',
               height=6, fit_reg=False, scatter_kws={"s": 100})

    # Grid points close to the boundary, drawn on top of the data.
    plt.scatter(x, y, color='red', s=10)
    plt.title('Decision boundary')
    plt.show()

In [None]:
def feature_mapped_logistic_regression(power, l):
    """Fit regularised logistic regression on polynomial features of ex2data2.

    For drawing purposes only; not a well generalised logistic regression.

    Parameters
    ----------
    power : int
        Polynomial power to which test1 / test2 are raised by the feature mapping.
    l : float
        Lambda constant for the regularisation term.

    Returns
    -------
    numpy.ndarray
        The parameter vector found by the optimiser (``res.x``).
    """
    samples = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    labels = get_y(samples)

    first = np.array(samples.test1)
    second = np.array(samples.test2)
    mapped = feature_mapping(first, second, power, as_ndarray=True)

    # Start from all zeros, one parameter per mapped feature column.
    start = np.zeros(mapped.shape[1])
    result = opt.minimize(fun=regularized_cost,
                          x0=start,
                          args=(mapped, labels, l),
                          method='TNC',
                          jac=regularized_gradient)
    return result.x

In [None]:
def find_decision_boundary(density, power, theta, threshhold):
    """Locate grid points lying approximately on the decision boundary.

    A ``density`` x ``density`` grid over [-1, 1.5] x [-1, 1.5] is pushed
    through the polynomial feature mapping; points whose inner product with
    ``theta`` is smaller than ``threshhold`` in absolute value are kept.

    Parameters
    ----------
    density : int
        Number of grid points along each axis.
    power : int
        Polynomial power used for the feature mapping.
    theta : array-like
        Fitted parameters, one per mapped feature.
    threshhold : float
        Tolerance on ``|X @ theta|`` for a point to count as on the boundary.

    Returns
    -------
    tuple of pandas.Series
        The ``f10`` and ``f01`` columns (the plain x and y coordinates) of the
        selected grid points.
    """
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)

    # Every (x, y) pair with x varying slowest, the same order as iterating
    # "for x in t1 for y in t2", but built with array operations instead of a
    # Python list of density**2 tuples.
    x_cord = np.repeat(t1, len(t2))
    y_cord = np.tile(t2, len(t1))
    mapped_cord = feature_mapping(x_cord, y_cord, power)  # this is a dataframe

    inner_product = mapped_cord.to_numpy() @ theta

    # Keep the rows whose X @ theta is close enough to zero.
    decision = mapped_cord[np.abs(inner_product) < threshhold]

    return decision.f10, decision.f01


## Change the value of $\lambda$ and see the result

In [None]:
draw_boundary(power=6, l=1)     # boundary for degree-6 features with lambda = 1

In [None]:
draw_boundary(power=6, l=0.01)  # small lambda (0.01): weaker regularisation penalty

In [None]:
draw_boundary(power=6, l=100)  # large lambda (100): stronger regularisation penalty