In [1]:
#import libraries
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge, LassoLars, BayesianRidge, ElasticNet
from sklearn.metrics import max_error, median_absolute_error, mean_squared_error, r2_score, explained_variance_score


# Read in cleaned data into a Pandas DataFrame

In [2]:
file = Path("Resources/clean_data_v.csv")
df = pd.read_csv(file, index_col='Unnamed: 0')
df.head()

Unnamed: 0,carat,depth,table,price,x,y,z,color_D,color_E,color_F,...,cut_Premium,cut_Very Good,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
1,0.23,61.5,55.0,326,3.95,3.98,2.43,0,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0.21,59.8,61.0,326,3.89,3.84,2.31,0,1,0,...,1,0,0,0,1,0,0,0,0,0
3,0.23,56.9,65.0,327,4.05,4.07,2.31,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0.29,62.4,58.0,334,4.2,4.23,2.63,0,0,0,...,1,0,0,0,0,0,0,1,0,0
5,0.31,63.3,58.0,335,4.34,4.35,2.75,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [3]:
y = df['price']
X = df.drop(columns = 'price')

# Scale the data

In [4]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

# Split the data into Train and Test Data sets


In [5]:
X_train , X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=7)

In [6]:
#set up a results data frame

results = pd.DataFrame(index = ['max_error', 'median_absolute_error', 'mean_squared_error', 'r2_score', 'explained_variance_score'], 
                       columns = ['LinearRegression', 'LogisticRegression', 'Lasso', 'Ridge', 'LassoLars', 'BayesianRidge', 'ElasticNet' ])



# Linear Regression model


In [7]:
linear_reg = LinearRegression()

linear_reg.fit(X_train, y_train)

y_pred = linear_reg.predict(X_test)

results.loc['max_error','LinearRegression'] = max_error(y_test, y_pred)
results.loc['median_absolute_error','LinearRegression'] = median_absolute_error(y_test, y_pred)
results.loc['mean_squared_error','LinearRegression'] = mean_squared_error(y_test, y_pred)
results.loc['r2_score','LinearRegression'] = r2_score(y_test, y_pred)
results.loc['explained_variance_score','LinearRegression'] = explained_variance_score(y_test, y_pred)


# Logistic Regression model

In [8]:
# log_reg = LogisticRegression()

# log_reg.fit(X_train, y_train)

# y_pred_lg = log_reg.predict(X_test)

# results.loc['max_error','LogisticRegression'] = max_error(y_test, y_pred_lg)
# results.loc['median_absolute_error','LogisticRegression'] = median_absolute_error(y_test, y_pred_lg)
# results.loc['mean_squared_error','LogisticRegression'] = mean_squared_error(y_test, y_pred_lg)
# results.loc['r2_score','LogisticRegression'] = r2_score(y_test, y_pred_lg)
# results.loc['explained_variance_score','LogisticRegression'] = explained_variance_score(y_test, y_pred_lg)

# Lasso Regression model

In [9]:
lasso_reg = Lasso()

lasso_reg.fit(X_train, y_train)

y_pred_lasso = lasso_reg.predict(X_test)

results.loc['max_error','Lasso'] = max_error(y_test, y_pred_lasso)
results.loc['median_absolute_error','Lasso'] = median_absolute_error(y_test, y_pred_lasso)
results.loc['mean_squared_error','Lasso'] = mean_squared_error(y_test, y_pred_lasso)
results.loc['r2_score','Lasso'] = r2_score(y_test, y_pred_lasso)
results.loc['explained_variance_score','Lasso'] = explained_variance_score(y_test, y_pred_lasso)

# Ridge Regression model

In [10]:
ridge_reg = Ridge()

ridge_reg.fit(X_train, y_train)

y_pred_ridge = ridge_reg.predict(X_test)

results.loc['max_error','Ridge'] = max_error(y_test, y_pred_ridge)
results.loc['median_absolute_error','Ridge'] = median_absolute_error(y_test, y_pred_ridge)
results.loc['mean_squared_error','Ridge'] = mean_squared_error(y_test, y_pred_ridge)
results.loc['r2_score','Ridge'] = r2_score(y_test, y_pred_ridge)
results.loc['explained_variance_score','Ridge'] = explained_variance_score(y_test, y_pred_ridge)

# LassoLars model

In [11]:
lars = LassoLars(alpha=.1)

lars.fit(X_train, y_train)

y_pred_lars = lars.predict(X_test)

results.loc['max_error','LassoLars'] = max_error(y_test, y_pred_lars)
results.loc['median_absolute_error','LassoLars'] = median_absolute_error(y_test, y_pred_lars)
results.loc['mean_squared_error','LassoLars'] = mean_squared_error(y_test, y_pred_lars)
results.loc['r2_score','LassoLars'] = r2_score(y_test, y_pred_lars)
results.loc['explained_variance_score','LassoLars'] = explained_variance_score(y_test, y_pred_lars)

# BayesianRidge model

In [12]:
br_model = BayesianRidge()
br_model.fit(X_train, y_train)

y_pred_br = br_model.predict(X_test)

results.loc['max_error','BayesianRidge'] = max_error(y_test, y_pred_br)
results.loc['median_absolute_error','BayesianRidge'] = median_absolute_error(y_test, y_pred_br)
results.loc['mean_squared_error','BayesianRidge'] = mean_squared_error(y_test, y_pred_br)
results.loc['r2_score','BayesianRidge'] = r2_score(y_test, y_pred_br)
results.loc['explained_variance_score','BayesianRidge'] = explained_variance_score(y_test, y_pred_br)

# ElasticNet model

In [13]:
en_model = ElasticNet()
en_model.fit(X_train, y_train)

y_pred_en = en_model.predict(X_test)

results.loc['max_error','ElasticNet'] = max_error(y_test, y_pred_en)
results.loc['median_absolute_error','ElasticNet'] = median_absolute_error(y_test, y_pred_en)
results.loc['mean_squared_error','ElasticNet'] = mean_squared_error(y_test, y_pred_en)
results.loc['r2_score','ElasticNet'] = r2_score(y_test, y_pred_en)
results.loc['explained_variance_score','ElasticNet'] = explained_variance_score(y_test, y_pred_en)

In [14]:
results.head()

Unnamed: 0,LinearRegression,LogisticRegression,Lasso,Ridge,LassoLars,BayesianRidge,ElasticNet
max_error,15356.3,,15209.0,15343.2,12553.3,15343.3,19275.6
median_absolute_error,530.084,,531.192,530.45,553.816,530.452,756.033
mean_squared_error,1254080.0,,1254920.0,1254150.0,1303970.0,1254150.0,2439750.0
r2_score,0.920623,,0.92057,0.920618,0.917465,0.920618,0.845575
explained_variance_score,0.920624,,0.920571,0.92062,0.917468,0.92062,0.845578


In [16]:
results.to_csv('Resources/Linear_models_results.csv')

In [None]:
# # List the features sorted in descending order by feature importance
# importances = linear_reg.feature_importances_

# importances_df = pd.DataFrame(sorted(zip(  linear_reg.feature_importances_, X_train.columns), reverse=True))
# importances_df.set_index(importances_df[1], inplace=True)
# importances_df.drop(columns=1, inplace=True)
# importances_df.rename(columns={0: 'Feature Importances'}, inplace=True)
# importances_sorted = importances_df.sort_values(by='Feature Importances')
# importances_sorted.plot(kind='barh', color='lightgreen', title= 'Features Importances', legend=False, figsize = (20,20))


