In [1]:
import pandas as pd

df = pd.read_csv('housing.csv', sep="\s+")
df.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [2]:
import numpy as np
from sklearn.model_selection import train_test_split


features = df[["CRIM","ZN","INDUS","CHAS","NOX","RM","AGE","DIS","RAD","TAX","PTRATIO","B","LSTAT"]]
prices = df['MEDV']

#print(prices)
#print(features)

# splitting the dataframe into train and test sets
X_train,X_test,y_train,y_test = train_test_split(
  features,prices,test_size=0.30,random_state=57)

#print(X_test)


### Linear Regression

In [9]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)
print(reg.score(X_train, y_train))
print(reg.coef_)
print(reg.intercept_)
pred = reg.predict(X_test)

score = reg.score(X_test, y_test)
print(score)

#print(np.vstack((pred,y_test.values)).T)

0.7388646397632683
[-1.10777604e-01  4.52134626e-02  3.81363044e-02  2.51757184e+00
 -1.83429585e+01  4.10455395e+00 -1.21264094e-02 -1.55004181e+00
  2.75418027e-01 -1.04526307e-02 -9.46944901e-01  6.12865467e-03
 -5.48969244e-01]
36.7890343362636
0.7353138568159987


### Polynomial Regression

In [8]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

poly = PolynomialFeatures(degree=2, include_bias=False)
poly_features = poly.fit_transform(X_train)

poly_reg_model = LinearRegression()
poly_reg_model.fit(poly_features, y_train)

test_poly_features = poly.fit_transform(X_test)
y_predicted = poly_reg_model.predict(test_poly_features)

score = poly_reg_model.score(test_poly_features, y_test)
print(score)

#print(np.vstack((y_predicted,y_test.values)).T)

0.8746976220255396


In [27]:
from sklearn.preprocessing import scale
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

folds = KFold(n_splits = 10, shuffle = True, random_state = 100)

# step-2: specify range of hyperparameters to tune
hyper_params = [{'n_features_to_select': list(range(1, 14))}]


# step-3: perform grid search
# 3.1 specify model
lm = LinearRegression()
lm.fit(X_train, y_train)
rfe = RFE(lm)

# 3.2 call GridSearchCV()
model_cv = GridSearchCV(estimator = rfe,
                        param_grid = hyper_params,
                        scoring= 'r2',
                        cv = folds,
                        verbose = 1,
                        return_train_score=True)

# fit the model
model_cv.fit(X_train, y_train)

Fitting 10 folds for each of 14 candidates, totalling 140 fits


GridSearchCV(cv=KFold(n_splits=10, random_state=100, shuffle=True),
             estimator=RFE(estimator=LinearRegression()),
             param_grid=[{'n_features_to_select': [1, 2, 3, 4, 5, 6, 7, 8, 9,
                                                   10, 11, 12, 13, 14]}],
             return_train_score=True, scoring='r2', verbose=1)

In [28]:
cv_results = pd.DataFrame(model_cv.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_features_to_select,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,...,split2_train_score,split3_train_score,split4_train_score,split5_train_score,split6_train_score,split7_train_score,split8_train_score,split9_train_score,mean_train_score,std_train_score
0,0.014198,0.00426,0.002696,0.000638,1,{'n_features_to_select': 1},0.166317,0.294888,0.239434,0.065389,...,0.165113,0.180048,0.1748,0.171037,0.185811,0.162002,0.190277,0.172008,0.172092,0.010209
1,0.0152,0.004916,0.002902,0.000946,2,{'n_features_to_select': 2},0.366526,0.554087,0.790403,0.575035,...,0.508318,0.527203,0.506714,0.526405,0.571889,0.551647,0.512415,0.542829,0.531621,0.019558
2,0.012495,0.005143,0.002601,0.00102,3,{'n_features_to_select': 3},0.450767,0.502259,0.820038,0.599901,...,0.531052,0.550538,0.52945,0.555455,0.59012,0.57803,0.541063,0.564203,0.555306,0.018149
3,0.0107,0.002688,0.002298,0.000455,4,{'n_features_to_select': 4},0.531377,0.670496,0.797558,0.683637,...,0.599926,0.610771,0.590427,0.617221,0.659561,0.637768,0.601148,0.626075,0.617197,0.019113
4,0.009202,0.001832,0.003,0.00126,5,{'n_features_to_select': 5},0.555476,0.691897,0.737369,0.621166,...,0.621918,0.632725,0.605462,0.62882,0.671704,0.652404,0.617932,0.643605,0.633595,0.01774
5,0.010902,0.001814,0.0026,0.000663,6,{'n_features_to_select': 6},0.736166,0.708157,0.777945,0.75529,...,0.712722,0.715455,0.705317,0.71319,0.749315,0.732745,0.702732,0.738184,0.719991,0.014345
6,0.009207,0.00133,0.003094,0.001758,7,{'n_features_to_select': 7},0.719465,0.695793,0.775019,0.75852,...,0.715199,0.717462,0.706869,0.715224,0.750409,0.733976,0.70598,0.740035,0.722226,0.013842
7,0.008405,0.002007,0.002295,0.000461,8,{'n_features_to_select': 8},0.714166,0.671223,0.775634,0.758417,...,0.722086,0.724369,0.713732,0.723439,0.752675,0.738797,0.713807,0.74439,0.728667,0.012203
8,0.006892,0.001511,0.002499,0.000673,9,{'n_features_to_select': 9},0.71216,0.673564,0.76991,0.755378,...,0.723088,0.72516,0.714349,0.724586,0.752894,0.743385,0.714536,0.748083,0.730064,0.012842
9,0.006702,0.001735,0.0032,0.002963,10,{'n_features_to_select': 10},0.710441,0.681532,0.781997,0.756918,...,0.725956,0.728834,0.717969,0.727255,0.755867,0.743506,0.718483,0.74813,0.732804,0.01192


In [5]:
#import matplotlib.pyplot as plt
#import seaborn as sns

#plt.figure(figsize=(10, 6))
#plt.title("Your first polynomial regression – congrats! :)", size=16)
#plt.scatter(y_test, y_predicted)
#plt.plot(y_test, y_predicted, c="red")
#plt.show()

ModuleNotFoundError: No module named 'seaborn'

### Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing

lab = preprocessing.LabelEncoder()
y_train_transformed = lab.fit_transform(y_train)
y_test_transformed = lab.fit_transform(y_test)

logisticRegr = LogisticRegression()
logisticRegr.fit(X_train, y_train_transformed)

pred_LogReg  = logisticRegr.predict(X_test)
print(np.vstack((pred_LogReg,y_test_transformed)).T)


score = logisticRegr.score(X_test, y_test_transformed)
print(score)

[[ 97  73]
 [ 97  55]
 [ 31  18]
 [ 80  60]
 [ 97  84]
 [ 97  33]
 [ 13   3]
 [102  60]
 [ 70  46]
 [169 112]
 [ 86  90]
 [ 76  45]
 [ 97  58]
 [ 87  75]
 [ 87 103]
 [ 72  30]
 [115  50]
 [ 76  91]
 [ 80  48]
 [187  87]
 [ 80  38]
 [ 97  68]
 [150  99]
 [ 50  61]
 [ 72  42]
 [ 69  55]
 [ 38  24]
 [ 96  95]
 [147 106]
 [  5   6]
 [147 109]
 [ 45  28]
 [ 76  49]
 [ 97  53]
 [ 97  43]
 [ 43  78]
 [ 31  35]
 [106  97]
 [150  92]
 [ 59  37]
 [ 97  52]
 [ 13  11]
 [126  77]
 [ 75  65]
 [ 31   7]
 [126 105]
 [114  82]
 [187 118]
 [ 97  85]
 [ 43  34]
 [166  84]
 [100  96]
 [ 72  39]
 [ 76  70]
 [ 97  91]
 [102 106]
 [102  89]
 [ 34  47]
 [187 118]
 [116  54]
 [106 101]
 [  3  20]
 [100 104]
 [ 75  43]
 [180 118]
 [114  89]
 [ 87  74]
 [111  89]
 [147 100]
 [  5   8]
 [ 97  67]
 [ 73  51]
 [ 12  11]
 [166  63]
 [ 97  80]
 [ 80  72]
 [180 117]
 [ 97  59]
 [ 80  87]
 [ 28   4]
 [187 102]
 [ 72  26]
 [ 98  49]
 [ 86  32]
 [ 96  15]
 [164  81]
 [ 69  36]
 [ 73  72]
 [126  83]
 [ 43  22]
 [187  86]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
