In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, train_test_split


url="https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX&export=download"
data=pd.read_csv(url)


X=data.drop('Price',axis=1).values
y=data['Price'].values.reshape(-1, 1)

scaler=StandardScaler()
X=scaler.fit_transform(X)

def add_intercept(X):
    return np.hstack([np.ones((X.shape[0],1)),X])
kf=KFold(n_splits=5,shuffle=True,random_state=42)
best_r2=-np.inf
best_beta=None

for fold,(train_idx,test_idx) in enumerate(kf.split(X), 1):
    X_train,X_test=X[train_idx],X[test_idx]
    y_train,y_test=y[train_idx],y[test_idx]

    X_train_intercept=add_intercept(X_train)

    beta=np.linalg.inv(X_train_intercept.T @ X_train_intercept) @ (X_train_intercept.T @ y_train)

    X_test_intercept=add_intercept(X_test)
    y_pred=X_test_intercept @ beta

    r2=r2_score(y_test, y_pred)
    print(f"Fold {fold} R2 score: {r2:.4f}")

    if r2>best_r2:
        best_r2=r2
        best_beta=beta

print(f"Best R2 score from CV:{best_r2:.4f}")

X_train_final,X_test_final,y_train_final,y_test_final=train_test_split(X,y,test_size=0.3,random_state=42)

X_train_final_intercept=add_intercept(X_train_final)
best_beta_final = np.linalg.inv(X_train_final_intercept.T @ X_train_final_intercept) @ (X_train_final_intercept.T @ y_train_final)

X_test_final_intercept=add_intercept(X_test_final)
y_pred_final=X_test_final_intercept @ best_beta_final

final_r2=r2_score(y_test_final,y_pred_final)
print(f"Final test R2 score (70/30 split):{final_r2:.4f}")



Fold 1 R2 score: 0.9180
Fold 2 R2 score: 0.9146
Fold 3 R2 score: 0.9116
Fold 4 R2 score: 0.9193
Fold 5 R2 score: 0.9244
Best R2 score from CV:0.9244
Final test R2 score (70/30 split):0.9147


In [6]:
print(data.columns)

Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price'],
      dtype='object')


In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

url="https://drive.google.com/uc?id=1O_NwpJT-8xGfU_-3llUl2sgPu0xllOrX&export=download"
data=pd.read_csv(url)

col='Price'
X=data.drop(col, axis=1).values
y=data[col].values.reshape(-1,1)

scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)

def add_intercept(X):
    return np.hstack([np.ones((X.shape[0],1)),X])

X_temp,X_test,y_temp,y_test=train_test_split(X_scaled,y,test_size=0.3,random_state=42)
X_train,X_val,y_train,y_val=train_test_split(X_temp,y_temp,test_size=0.2,random_state=42)

X_train=add_intercept(X_train)
X_val=add_intercept(X_val)
X_test=add_intercept(X_test)

def gradient_descent(X, y, beta, lr, iters):
    m=len(y)
    for _ in range(iters):
        error=X @ beta - y
        grad=(X.T @ error)/m
        beta=beta-(lr*grad)
    return beta

lrs=[0.001,0.01,0.1,1]
iters=1000
best_val_r2=-np.inf
best_beta=None
best_lr=None

for lr in lrs:
    beta=np.zeros((X_train.shape[1],1))
    beta=gradient_descent(X_train,y_train,beta,lr,iters)
    val_r2=r2_score(y_val,X_val @ beta)
    test_r2=r2_score(y_test,X_test @ beta)
    print(f"LR:{lr}|Val R2: {val_r2:.4f}|Test R2: {test_r2:.4f}")
    if val_r2>best_val_r2:
        best_val_r2=val_r2
        best_beta=beta
        best_lr=lr

print(f"Best LR:{best_lr} with Val R2:{best_val_r2:.4f}")
print(f"Test R2 with best model:{r2_score(y_test, X_test @ best_beta):.4f}")


LR:0.001|Val R2: -0.8125|Test R2: -0.9914
LR:0.01|Val R2: 0.9098|Test R2: 0.9147
LR:0.1|Val R2: 0.9098|Test R2: 0.9148
LR:1|Val R2: 0.9098|Test R2: 0.9148
Best LR:0.01 with Val R2:0.9098
Test R2 with best model:0.9147


In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score

cols=["symboling","normalized_losses","make","fuel_type","aspiration","num_doors","body_style","drive_wheels","engine_location","wheel_base","length","width","height","curb_weight","engine_type","num_cylinders","engine_size","fuel_system","bore","stroke","compression_ratio","horsepower","peak_rpm","city_mpg","highway_mpg","price"]
url="https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data"
df=pd.read_csv(url,names=cols,na_values='?')

df.dropna(subset=['price'],inplace=True)
num_cols=df.select_dtypes(include=[np.number]).columns.tolist()
cat_cols=df.select_dtypes(include=[object]).columns.tolist()

for c in num_cols:
    if df[c].isnull().any():
        df[c].fillna(df[c].mean(),inplace=True)
for c in cat_cols:
    if df[c].isnull().any():
        df[c].fillna(df[c].mode()[0],inplace=True)

words={'two':2,'three':3,'four':4,'five':5,'six':6,'eight':8,'twelve':12}
df['num_doors']=df['num_doors'].map(words)
df['num_cylinders']=df['num_cylinders'].map(words)

df=pd.get_dummies(df,columns=['body_style','drive_wheels'],drop_first=True)

for c in ['make','aspiration','engine_location','fuel_type']:
    df[c]=LabelEncoder().fit_transform(df[c])

df['fuel_system']=df['fuel_system'].apply(lambda x:1 if 'pfi' in x else 0)
df['engine_type']=df['engine_type'].apply(lambda x:1 if 'ohc' in x else 0)

X=df.drop('price',axis=1)
y=df['price'].astype(float)

X_scaled=StandardScaler().fit_transform(X)
X_train,X_test,y_train,y_test=train_test_split(X_scaled,y,test_size=0.3,random_state=42)

model=LinearRegression().fit(X_train,y_train)
print("R2 without PCA:",r2_score(y_test,model.predict(X_test)))

pca=PCA(n_components=0.95)
X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.transform(X_test)

model_pca=LinearRegression().fit(X_train_pca,y_train)
print("R2 with PCA:",r2_score(y_test,model_pca.predict(X_test_pca)))


R2 without PCA: 0.8732775682086301
R2 with PCA: 0.85376827630017


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mean(),inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[c].fillna(df[c].mode()[0],inplace=True)
