In [135]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [136]:
df = pd.read_csv('NPPE1_Preprocessing1.csv')

In [137]:
df.shape

(4000, 14)

In [138]:
df.PRICE.mean()

np.float64(24.355923220694248)

In [139]:
# (df.RM >=5).value_counts()

df[(df['RM']>=5)].value_counts()

CRIM       ZN         INDUS      POLINDEX  RM   AGE    DIS       HIGHWAYCOUNT  TAX         PTRATIO    IMM         BPL        PRICE      RIVERSIDE
0.031093   60.870668  3.547503   0.405085  8.0  10.0   6.555165  1             266.520577  16.413567  393.392429  5.854484   31.107483  NO           1
0.032349   17.904842  2.514424   0.831878  7.0  61.0   9.977147  3             216.187925  18.606447  394.484087  9.130782   33.794713  UNKNOWN      1
0.033147   95.036288  3.816744   1.914158  9.0  32.0   5.789218  4             224.662773  15.680052  391.019090  5.479476   50.069258  NO           1
0.036337   18.478661  1.852647   1.205738  8.0  60.0   9.831770  3             217.877869  19.214399  393.315545  8.132515   33.398435  UNKNOWN      1
0.040731   82.645120  3.407705   1.344175  8.0  16.0   7.495832  3             350.248257  14.921488  396.224069  3.495689   42.833404  NO           1
                                                                                                   

In [140]:
df.PRICE.nlargest(10).mean()

np.float64(52.36590175716407)

In [141]:
df['RM'].unique()
df[df['RM']==-1].count()

CRIM            40
ZN              40
INDUS           40
POLINDEX        40
RM              40
AGE             40
DIS             40
HIGHWAYCOUNT    40
TAX             40
PTRATIO         40
IMM             40
BPL             40
PRICE           40
RIVERSIDE       40
dtype: int64

In [142]:
df.AGE.unique()
df[df['AGE']<0].shape

(50, 14)

In [143]:
df.RIVERSIDE.unique()

array(['NO', 'UNKNOWN', 'YES'], dtype=object)

In [144]:
df[df['RIVERSIDE']=='UNKNOWN'].shape

(88, 14)

In [145]:
df.HIGHWAYCOUNT.isin([6,7,8]).value_counts()

HIGHWAYCOUNT
False    2789
True     1211
Name: count, dtype: int64

In [146]:
df['RIVERSIDE'] = df['RIVERSIDE'].replace('UNKNOWN', np.nan)
df['AGE'] = df['AGE'].replace(-2, np.nan)
df['RM'] = df['RM'].replace(-1, np.nan)

In [147]:
from sklearn.model_selection import train_test_split
X = df.drop('PRICE', axis=1)
y = df['PRICE']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [148]:
X_train.shape

(2800, 13)

In [149]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer

trf = ColumnTransformer([
    ('trf1', MinMaxScaler(), ['CRIM', 'ZN', 'POLINDEX', 'DIS', 'HIGHWAYCOUNT', 'TAX', 'PTRATIO', 'IMM', 'BPL']),
    ('trf2', StandardScaler(), ['INDUS'])
],remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

trf1 = ColumnTransformer([
    ('trf1', SimpleImputer(strategy='median'), ['RM']),
    ('trf2', SimpleImputer(strategy='mean'), ['AGE']),
    ('trf3', SimpleImputer(strategy='most_frequent'), ['RIVERSIDE'])
], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

trf2 = ColumnTransformer([
    ('trf1', MinMaxScaler(), ['RM', 'AGE']),
    ('trf2', OneHotEncoder(sparse_output=False), ['RIVERSIDE'])
], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')

X_train = trf.fit_transform(X_train)
X_test = trf.transform(X_test)

X_train = trf1.fit_transform(X_train)
X_test = trf1.transform(X_test)

X_train = trf2.fit_transform(X_train)
X_test = trf2.transform(X_test)

In [150]:
X_train.shape

(2800, 14)

In [151]:
X_test.mean().mean()

np.float64(0.38617456550975626)

In [152]:
df1 = pd.read_csv('TESTING.csv')


In [153]:
X = df1.drop('14', axis=1)
y = df1['14']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [154]:
from sklearn.linear_model import Lasso, Ridge
ridge = Ridge(alpha=10, solver='saga', tol=1e-4, random_state=42)
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)

0.6613547575262211

In [155]:
abs(ridge.coef_).argmax()

np.int64(9)

In [156]:
abs(ridge.coef_).argmin()

np.int64(0)

In [157]:
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge,RidgeCV, SGDRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score, cross_validate,KFold 
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.preprocessing import PolynomialFeatures,StandardScaler,OneHotEncoder,OrdinalEncoder
from sklearn.feature_selection import RFE

In [158]:
sgd = SGDRegressor(random_state=42)
param_grid = {
    'penalty': ['l1', 'l2'],
    'alpha': [1e-5, 1e-4, 1e-3, 1e-2, 1e-1],
    'tol': [1e-4, 1e-3, 1e-2, 1e-1]
}
gscv = GridSearchCV(sgd, param_grid, cv=5, scoring='neg_mean_absolute_error')
gscv.fit(X_train, y_train)

0,1,2
,estimator,SGDRegressor(random_state=42)
,param_grid,"{'alpha': [1e-05, 0.0001, ...], 'penalty': ['l1', 'l2'], 'tol': [0.0001, 0.001, ...]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'squared_error'
,penalty,'l2'
,alpha,0.001
,l1_ratio,0.15
,fit_intercept,True
,max_iter,1000
,tol,0.0001
,shuffle,True
,verbose,0
,epsilon,0.1


In [159]:
gscv.best_params_

{'alpha': 0.001, 'penalty': 'l2', 'tol': 0.0001}

In [160]:
mean_absolute_error(y_test, gscv.predict(X_test))

3.8131121797994014

In [161]:
pipe =  Pipeline([
    ('pca', PCA()),
    ('lasso', Lasso())
])
param_grid = {
    'pca__n_components': [0.9,0.95],
    'lasso__alpha': [10,1,0.01,0.001]
}

gscv1 = GridSearchCV(pipe, param_grid, cv=5, n_jobs=-1, scoring='neg_mean_absolute_error')
gscv1.fit(X_train, y_train)

0,1,2
,estimator,"Pipeline(step...o', Lasso())])"
,param_grid,"{'lasso__alpha': [10, 1, ...], 'pca__n_components': [0.9, 0.95]}"
,scoring,'neg_mean_absolute_error'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_components,0.95
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,

0,1,2
,alpha,0.01
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [162]:
r2_score(y_test, gscv1.predict(X_test))

0.6288625430197583

In [163]:
gscv1.best_estimator_.named_steps['pca'].explained_variance_[0]

np.float64(1.163507574223904)

In [164]:
pipe1 = Pipeline([
    ('poly', PolynomialFeatures(degree=2, interaction_only=False)),
    ('lasso', Lasso(alpha=1, warm_start=True, random_state=0))
])

pipe1.fit(X_train,y_train)

0,1,2
,steps,"[('poly', ...), ('lasso', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,degree,2
,interaction_only,False
,include_bias,True
,order,'C'

0,1,2
,alpha,1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,True
,positive,False
,random_state,0
,selection,'cyclic'


In [165]:
r2_score(y_test, pipe1.predict(X_test))

0.157678032410551

In [166]:
rfe = RFE(LinearRegression())
rfe.fit(X_train, y_train)
rfe.ranking_

array([4, 2, 8, 7, 1, 1, 1, 1, 1, 1, 1, 5, 3, 6])