# 1. Setting up

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# Seed NumPy's global random number generator so that any later step drawing
# from it produces the same values on every Restart & Run All.
np.random.seed(416)

In [2]:
# Read the Philadelphia-area crime / house-price table and preview its first five rows.
crime = pd.read_csv(filepath_or_buffer='Philadelphia_Crime_Rate_noNA.csv')
print(crime.head(n=5))

   HousePrice  HsPrc ($10,000)  CrimeRate  MilesPhila  PopChg        Name  \
0      140463          14.0463       29.7        10.0    -1.0    Abington   
1      113033          11.3033       24.1        18.0     4.0      Ambler   
2      124186          12.4186       19.5        25.0     8.0       Aston   
3      110490          11.0490       49.4        25.0     2.7    Bensalem   
4       79124           7.9124       54.1        19.0     3.9  Bristol B.   

     County  
0  Montgome  
1  Montgome  
2  Delaware  
3     Bucks  
4     Bucks  


# 2. Pre-processing

In [3]:
# One-hot encode the county of each town: one indicator column per county value.
one_hot = pd.get_dummies(crime['County'])
#print(one_hot)

# Concatenate the indicator columns with the original dataset. Indicator columns
# left over from an earlier run of this cell are dropped first, so re-running the
# cell does not append duplicate county columns (duplicate labels would make
# crime[input_cols] return the same feature several times).
crime = pd.concat(
    [crime.drop(columns=one_hot.columns, errors='ignore'), one_hot],
    axis=1,
)

# Drop every row that contains a NaN in any column so the models only see complete records.
crime = crime.dropna()
print(crime)

# Features used to predict the crime rate: numeric columns plus the county indicators.
input_cols = ['HousePrice', 'MilesPhila', 'PopChg', 'Bucks', 'Chester', 'Delaware', 'Montgome', 'Phila']
output_col = 'CrimeRate'

# The county names above are hard-coded while the indicator columns come from the data;
# fail here with a readable message instead of a KeyError in a later cell.
missing_cols = [col for col in input_cols + [output_col] if col not in crime.columns]
assert not missing_cols, f"columns missing from crime data: {missing_cols}"

    HousePrice  HsPrc ($10,000)  CrimeRate  MilesPhila  PopChg        Name  \
0       140463          14.0463       29.7        10.0    -1.0    Abington   
1       113033          11.3033       24.1        18.0     4.0      Ambler   
2       124186          12.4186       19.5        25.0     8.0       Aston   
3       110490          11.0490       49.4        25.0     2.7    Bensalem   
4        79124           7.9124       54.1        19.0     3.9  Bristol B.   
..         ...              ...        ...         ...     ...         ...   
94      174232          17.4232       13.8        25.0     4.7    Westtown   
95      196515          19.6515       29.9        16.0     1.8  Whitemarsh   
96      232714          23.2714        9.9        21.0     0.2  Willistown   
97      245920          24.5920       22.6        10.0     0.3   Wynnewood   
98      130953          13.0953       13.0        24.0     5.2     Yardley   

      County  Bucks  Chester  Delaware  Montgome  Phila  
0   M

In [4]:
# Hold out 20% of the rows as the test set. The split is seeded explicitly so that
# re-running this cell (or running it after other code has drawn from NumPy's global
# generator) always yields the same partition. 416 matches the np.random.seed value
# set in the first cell.
# NOTE(review): expected to reproduce the partition a fresh Run All produced with the
# global seed alone -- confirm the printed scores are unchanged.
train, test = train_test_split(crime, test_size=0.2, random_state=416)
assert len(train) + len(test) == len(crime), "train/test split lost or duplicated rows"

# Separate the feature columns from the target for each split.
train_X = train[input_cols]
train_y = train[output_col]

test_X = test[input_cols]
test_y = test[output_col]

In [5]:
# Report the dimensions of the features and targets for each split,
# with a blank line separating the train report from the test report.
for split_name, inputs, targets in (("train", train_X, train_y), ("test", test_X, test_y)):
    if split_name == "test":
        print()
    print(f"{split_name} input data shape:", inputs.shape)
    print(f"{split_name} target data shape:", targets.shape)

train input data shape: (78, 8)
train target data shape: (78,)

test input data shape: (20, 8)
test target data shape: (20,)


In [6]:
# Fit the scaler on the training features only, then apply that same fitted
# transformation to both splits so the test rows never influence the scaling.
scaler = StandardScaler().fit(train_X)
train_X_norm, test_X_norm = (scaler.transform(split) for split in (train_X, test_X))

In [7]:
# The scaler returns a plain NumPy array (see the printed type), so the column
# labels are gone; wrap it in a DataFrame purely to get a readable table as the
# cell output (columns show up as positions 0-7, in input_cols order).
print("data type after normalizaton:", type(train_X_norm))
pd.DataFrame(train_X_norm)

data type after normalizaton: <class 'numpy.ndarray'>


Unnamed: 0,0,1,2,3,4,5,6,7
0,0.447448,0.347822,-0.906123,-0.467707,-0.467707,-0.587220,1.546384,-0.313993
1,0.510510,0.117418,-1.216501,-0.467707,-0.467707,-0.587220,1.546384,-0.313993
2,-0.692668,-2.186629,0.623599,-0.467707,-0.467707,-0.587220,-0.646670,3.184785
3,-0.424976,0.693429,0.823128,-0.467707,-0.467707,-0.587220,1.546384,-0.313993
4,-0.228859,1.384643,0.490580,2.138090,-0.467707,-0.587220,-0.646670,-0.313993
...,...,...,...,...,...,...,...,...
73,-0.487978,-0.112987,0.446240,-0.467707,-0.467707,-0.587220,1.546384,-0.313993
74,0.575603,1.960655,1.998131,-0.467707,2.138090,-0.587220,-0.646670,-0.313993
75,-1.351744,-1.725820,-1.704238,-0.467707,-0.467707,-0.587220,-0.646670,3.184785
76,-0.777229,-0.458594,-1.016972,-0.467707,-0.467707,1.702939,-0.646670,-0.313993


# 3. Regularization with Ridge

Create a [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) linear model with a regularization coefficient of 1.

In [8]:
# Ridge regression model; alpha is the regularization coefficient (1.0 here).
ridge_model = Ridge(alpha=1.0)

In [9]:
def rmse(model, X, y):
    """Return the root-mean-squared error of ``model``'s predictions on ``X``.

    The ``(model, X, y)`` signature is also what this notebook passes as
    ``scoring=rmse`` to ``cross_val_score``.

    Args:
        model: fitted estimator exposing ``predict(X)``.
        X: feature rows to predict on.
        y: true target values, one per row of ``X``.

    Returns:
        Square root of the mean squared difference between ``y`` and the
        predictions (lower is better).
    """
    predictions = model.predict(X)
    # scikit-learn metrics take (y_true, y_pred) in that order. Plain MSE is
    # symmetric, so the value is the same either way, but keeping the documented
    # order avoids a silent bug if a non-symmetric metric or sample weights are
    # ever swapped in.
    return np.sqrt(mean_squared_error(y, predictions))

In [10]:
# Fit the ridge model on the standardized training data and show its training
# RMSE as the cell output. fit() hands back the estimator itself, so the fitted
# model can be passed straight into rmse().
rmse(ridge_model.fit(train_X_norm, train_y), train_X_norm, train_y)

np.float64(32.9177734843289)

In [11]:
# 5-fold cross-validated RMSE for the current ridge model. Score on the
# standardized features, like every other ridge fit and cross-validation call in
# this notebook; the raw features were used here before, which made this score
# incomparable with the alpha sweep that follows.
ridge_CV_scores = cross_val_score(ridge_model, train_X_norm, train_y, cv=5, scoring=rmse)

In [12]:
# Sweep the regularization coefficient and print, tab-separated, each value with
# its mean 5-fold cross-validated RMSE on the standardized training data.
# Note that 10e4 and 10e7 evaluate to 1e5 and 1e8 respectively.
for reg_coef in (0.0001, 0.1, 1, 10, 100, 1000, 10e4, 10e7):
    ridge_model = Ridge(alpha=reg_coef)
    ridge_CV_scores = cross_val_score(
        ridge_model,
        train_X_norm,
        train_y,
        scoring=rmse,
        cv=5,
    )
    print("\t".join(str(value) for value in (reg_coef, ridge_CV_scores.mean())))

0.0001	37.50090263606724
0.1	37.43970007336747
1	36.91730282456718
10	33.56965319795065
100	29.62088364376266
1000	31.26781831761904
100000.0	32.008549856873906
100000000.0	32.017569248632505


In [13]:
# Header row for the coefficient table: one column per entry of input_cols.
print("Reg coeff. | ", "Intercept | ", input_cols)
print("_________________________________________________")

# Refit ridge at each regularization strength and print its intercept and
# coefficients, followed by a blank line, to show how the weights change as
# alpha grows.
for reg_coef in (0.0001, 0.1, 1, 10, 100, 1000, 10e4, 10e7):
    ridge_model = Ridge(alpha=reg_coef).fit(train_X_norm, train_y)
    print(reg_coef, ridge_model.intercept_, ridge_model.coef_, sep="  |  ", end="\n\n")

Reg coeff. |  Intercept |  ['HousePrice', 'MilesPhila', 'PopChg', 'Bucks', 'Chester', 'Delaware', 'Montgome', 'Phila']
_________________________________________________
0.0001  |  33.57435897435898  |  [ -3.63197189 -14.24752441  17.13681315  -2.23370832  -2.70466811
  -7.72951492   0.04520311  18.36731872]

0.1  |  33.57435897435898  |  [ -3.63608447 -14.19995246  17.07375856  -2.23448001  -2.69413453
  -7.71463392   0.03962798  18.34037168]

1  |  33.57435897435898  |  [-3.67137431e+00 -1.37880362e+01  1.65239914e+01 -2.23958735e+00
 -2.60158983e+00 -7.58471991e+00 -8.38937656e-03  1.81011014e+01]

10  |  33.57435897435898  |  [ -3.88521252 -10.83241873  12.38453933  -2.18627031  -1.87616968
  -6.57512436  -0.33331877  16.03148365]

100  |  33.57435897435898  |  [-3.10333377 -4.24249064  2.75480833 -1.31121125 -0.3460951  -3.10006416
 -0.64721913  7.99392311]

1000  |  33.57435897435898  |  [-0.70776377 -0.79471316  0.090652   -0.28442008 -0.07301084 -0.47373972
 -0.16754507  1.47097

In [14]:
# Unregularized linear-regression baseline fitted on the same standardized
# training data, printed as intercept followed by the coefficient vector for
# comparison with the ridge fits.
linear_reg_model = LinearRegression().fit(train_X_norm, train_y)
print(linear_reg_model.intercept_, linear_reg_model.coef_)

33.57435897435898 [ -3.63196775 -14.24757222  17.13687648  -2.23370753  -2.70467868
  -7.72952986   0.04520872  18.36734573]


# 4. Computing final test scores

In [15]:
# Final held-out RMSE on the standardized features. alpha=100 is the value with
# the lowest mean cross-validated RMSE in the sweep above.
for label, estimator in (("Ridge", Ridge(alpha=100)), ("LinearRegression", LinearRegression())):
    print(label, rmse(estimator.fit(train_X_norm, train_y), test_X_norm, test_y))

Ridge 11.768449151887964
LinearRegression 32.32637112653597


In [16]:
# Same comparison on the raw (unstandardized) features, to show how much the
# ridge result depends on feature scaling.
for label, estimator in (("Ridge", Ridge(alpha=100)), ("LinearRegression", LinearRegression())):
    print(label, rmse(estimator.fit(train_X, train_y), test_X, test_y))

Ridge 21.395689707502864
LinearRegression 32.32637112653577
