# First linear regression example

In [7]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import scale
boston = load_boston()
X, y = scale(boston.data), boston.target


from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(X,y)

print ('R2 %0.3f' % regression.score(X,y))

R2 0.741


In [8]:
print ([a + ':' + str(round(b,1)) for a, b in zip(boston.feature_names, regression.coef_)])

['CRIM:-0.9', 'ZN:1.1', 'INDUS:0.1', 'CHAS:0.7', 'NOX:-2.1', 'RM:2.7', 'AGE:0.0', 'DIS:-3.1', 'RAD:2.7', 'TAX:-2.1', 'PTRATIO:-2.1', 'B:0.9', 'LSTAT:-3.7']


# Encoding qualitative data rather than quantitative
## one-hot encoding
Essentially takes qualitative data such as color and binary encodes it so we can do some math or tranformations with it in the future

In [9]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
lbl = LabelEncoder()
enc = OneHotEncoder()
qualitative = ['red','red', 'green', 'blue', 'red', 'blue', 'blue', 'green']
labels = lbl.fit_transform(qualitative).reshape(8,1)
print(enc.fit_transform(labels).toarray())

[[0. 0. 1.]
 [0. 0. 1.]
 [0. 1. 0.]
 [1. 0. 0.]
 [0. 0. 1.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]]


# Creating a polynomial regression model
### Same thing as linear regression just expanding the linear function to include curves for more complex problems

In [10]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

# creates a polynomail of degree 2 rather than having a linear model
# Linear : y = b1x1 + b2x2 + a
# Polynomial y = b1x1 + b2x2 + a + b3x1^2 + b4x2^2 + b5x1x2
pf = PolynomialFeatures(degree=2)
poly_X = pf.fit_transform(X)
X_train, X_test, y_train, y_test = ( 
    train_test_split(poly_X,
                     y, test_size=.33, random_state=42))
from sklearn.linear_model import Ridge
reg_regression = Ridge(alpha = .1, normalize = True)
reg_regression.fit(X_train, y_train)
print('R2: %0.3f' % r2_score(y_test,reg_regression.predict(X_test)))
print (y)
# As you can see our R2 value went from .741 to .819

R2: 0.819
[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.4 19.8 19.4 21.7 22.8
 18.8 18.7 18.5 18.3 21.2 19.2 20.4 19.3 22.  20.3 20.5 17.3 18.8 21.4
 15.7 16.2 18.  14.3 19.2 19.6 23.  18.4 15.6 18.1 17.4 17.1 13.3 17.8
 14.  14.4 13.4 15.6 11.8 13.8 15.6 14.6 17.8 15.4 21.5 19.6 15.3 19.4
 17.  15.6 13.1 41.3 24.3 23.3 27.  50.  50.  50.  22.7 25.  50.  23.8
 23.8 22.3 17.4 19.1 23.1 23.6 22.6 29.4 23.2 24.6 29.9 37.2 39.8 36.2
 37.9 32.5 26.4 29.6 50.  32.  29.8 34.9 37.  30.5 36.4 31.1 29.1 5

# Classification using binary responses

In [11]:
import numpy as np

a = np.array([0,0,0,0,1,1,1,1])
b = np.array([1,2,3,4,5,6,7,8]).reshape(8,1)
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
regression.fit(b,a)
print(regression.predict(b)>.5)


[False False False False  True  True  True  True]


In [12]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

binary_y = np.array(y >= 40).astype(int)
X_train, X_test, y_train, y_test = train_test_split(X,binary_y, test_size=.33, random_state=5)
logistic = LogisticRegression()
logistic.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
print('In-sample accuracy: %0.3f' % accuracy_score(y_train, logistic.predict(X_train)))
print ('Out-of-sample accuracy: %0.3f' %accuracy_score(y_test, logistic.predict(X_test)))

In-sample accuracy: 0.973
Out-of-sample accuracy: 0.958


In [13]:
for var,coef in zip(boston.feature_names, logistic.coef_[0]):
    print("%7s : %7.3f" %(var,coef))

   CRIM :  -0.006
     ZN :   0.197
  INDUS :   0.580
   CHAS :  -0.023
    NOX :  -0.236
     RM :   1.426
    AGE :  -0.048
    DIS :  -0.365
    RAD :   0.645
    TAX :  -0.220
PTRATIO :  -0.554
      B :   0.049
  LSTAT :  -0.803


In [14]:
print('\nclasses:', logistic.classes_)
print('\nProbs:\n', logistic.predict_proba(X_test)[:3,:])


classes: [0 1]

Probs:
 [[0.39022779 0.60977221]
 [0.93856655 0.06143345]
 [0.98425623 0.01574377]]


# Adding poor/random features can inflate the R2 value greatly

In [15]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.33, random_state = 42)
check = [2**i for i in range(8)]
for i in range (2**7+1):
    X_train = np.column_stack((X_train, np.random.random(X_train.shape[0])))
    X_test = np.column_stack((X_test, np.random.random(X_test.shape[0])))
    regression.fit(X_train, y_train)
    if i in check:
        print("Random features: %i -> R2: %0.3f" % (i, r2_score(y_train, regression.predict(X_train))))


Random features: 1 -> R2: 0.741
Random features: 2 -> R2: 0.741
Random features: 4 -> R2: 0.742
Random features: 8 -> R2: 0.744
Random features: 16 -> R2: 0.757
Random features: 32 -> R2: 0.770
Random features: 64 -> R2: 0.784
Random features: 128 -> R2: 0.840


## This is just an illusion, let's check the real R2 value on test data

In [16]:
regression.fit(X_train, y_train)
print('R2: %0.3f' % r2_score(y_test, regression.predict(X_test)))

R2: 0.498


# Solving overfitting using selection and regularization
#### L1 Regularization is when you use the coefficients absolute value
#### L2 Regularization is when you use postive and negative values so they can't cancel out
L2 keeps all of the features in the model and balances the contribution of each of them. This will show correlation well.
L1 brings highly correlated features out of the model by making their coefficients zero. This essentially excludes the feature from the model. 

In [17]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

pf = PolynomialFeatures(degree=2)
poly_X = pf.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(poly_X, y, test_size=0.33, random_state = 42)

from sklearn.linear_model import Ridge
reg_regression = Ridge(alpha=.1, normalize=True)
reg_regression.fit(X_train,y_train)
print('R2: %0.3f' % r2_score(y_test, reg_regression.predict(X_test)))

R2: 0.819


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.33, random_state=42)
SGD = SGDRegressor(penalty=None, learning_rate = 'invscaling', eta0=.01, power_t=.25, max_iter=5, tol=None)

power = 17
check = [2**i for i in range(power+1)]
for i in range(400):
    for j in range(X_train, shape[0]):
        SGD.partial_fit(X_train[j,:].reshape(1,13),y_train[j].reshape(1,))
        count = (j+1) + X_train.shape[0] * i
        if count in check:
            R2 = r2_score(y_test,SGD.predict(X_test))
            print('Example %6i R2 %0.3f coef: %s' % (count, R2, ' '.join(map(lambda x: '%0.3f' % x, SGD.Coef_))))

NameError: name 'shape' is not defined

# No matter the amount of data, you can always fit a simple but effective linear regression model using SGD online learning capabilities