In [1]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [2]:
data = pd.read_csv('datasets/diamond_processed_venkat.csv', index_col=0)
data.head()

Unnamed: 0_level_0,price,cut_Fair,cut_Good,cut_Ideal,cut_Premium,cut_Very Good,color_D,color_E,color_F,color_G,color_H,color_I,color_J,carat,depth,table,x,y,z
clarity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
3,1076,0,0,1,0,0,1,0,0,0,0,0,0,-0.808459,0.46897,-1.537145,-0.840716,-0.884647,-0.757601
3,5370,0,0,0,0,1,0,0,0,0,1,0,0,0.525131,-1.91644,0.69289,0.843892,0.873896,0.548199
4,732,0,0,0,1,0,1,0,0,0,0,0,0,-1.037669,-0.37294,0.69289,-1.230836,-1.286344,-1.19731
5,936,0,0,1,0,0,0,0,0,1,0,0,0,-0.995995,-0.443099,-1.091138,-1.133306,-1.170298,-1.104038
1,9385,0,0,0,1,0,0,0,0,0,1,0,0,1.650347,0.609288,0.69289,1.500002,1.489833,1.467589


In [3]:
data.shape

(10788, 19)

In [4]:
from sklearn.linear_model import LinearRegression

def linear_model(x_train, y_train):
    print("Working on Linear Regression Model")
    
    lr = LinearRegression()
    lr.fit(x_train, y_train)
    
    return lr

In [5]:
from sklearn.linear_model import Lasso

def lasso_model(x_train, y_train):
    print("Working on Lasso Regression Model")
    
    ls = Lasso(alpha=0.8, max_iter=10000)
    ls.fit(x_train, y_train)
    
    return ls

In [6]:
from sklearn.linear_model import Ridge

def ridge_model(x_train, y_train):
    print("Working on Ridge Regression Model")
    
    rd = Ridge(alpha=0.9)
    rd.fit(x_train, y_train)
    
    return rd

In [7]:
def build_and_train_model(data, target_name, reg_fn):
    X = data.drop(target_name, axis=1)
    Y = data[target_name]
    
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)
    model = reg_fn(x_train, y_train)
    
    score = model.score(x_train, y_train)
    print("Training Score: ", score)
    
    y_pred = model.predict(x_test)
    r_score = r2_score(y_test, y_pred)
    
    print("Testing Score: ", r_score)
    
    return {"model" : model,
           "x_train": x_train,
           "y_train": y_train,
           "x_test": x_test,
           "y_test": y_test,
           "y_pred": y_pred}
    

In [8]:
linear_regression = build_and_train_model(data, "price", linear_model)

Working on Linear Regression Model
Training Score:  0.8809242407127091
Testing Score:  0.8886752199013835


In [9]:
lasso_regression = build_and_train_model(data, "price", lasso_model)

Working on Lasso Regression Model
Training Score:  0.8808343488739523
Testing Score:  0.8885207118145195


In [10]:
ridge_regression = build_and_train_model(data, "price", ridge_model)

Working on Ridge Regression Model
Training Score:  0.8809143924104876
Testing Score:  0.8886266890933923


In [23]:
from sklearn.linear_model import SGDRegressor

def sgd_model(x_train, y_train):
    print("Working on Stochastic Gradient Descent Regression Model")
    
    sgd = SGDRegressor(max_iter=2500)
    sgd.fit(x_train, y_train)
    
    return sgd

In [24]:
sgd_regression = build_and_train_model(data, "price", sgd_model)

Working on Stochastic Gradient Descent Regression Model
Training Score:  0.874944686446041
Testing Score:  0.8863935303789885
