## 1 Extract File

In [1]:
from zipfile import ZipFile

# Unpack the zipped dataset into the local `source/` directory so that the
# CSV can be read by the next cell.
file_path = 'source/diamonds.csv.zip'
with ZipFile(file_path, mode='r') as archive:
    archive.extractall(path='./source/')


## 2 Open File as a Pandas DataFrame

In [2]:
import pandas as pd

# Load the extracted CSV into a DataFrame and preview its first five rows.
# NOTE(review): the rendered preview shows an `Unnamed: 0` column, which looks
# like a row-id column read as data — consider `index_col=0` if confirmed.
df = pd.read_csv(filepath_or_buffer='source/diamonds.csv')
df.head(5)

Unnamed: 0.1,Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


## 3 Quick Exploratory Data Analysis

In [3]:
# Chart `price` (y axis) against `carat` (x axis) from `df`, using a
# Vega-Lite v4 spec with a bar mark and a fixed height of 220.
# `_deepnote_run_altair` is neither defined nor imported in this notebook —
# presumably it is injected by the Deepnote runtime, so this cell is likely
# to fail with a NameError in other Jupyter environments.
# NOTE(review): the `color` encoding uses an empty field name (""), which
# looks like an unset chart option — confirm it is intentional.
_deepnote_run_altair(df, """{"$schema":"https://vega.github.io/schema/vega-lite/v4.json","mark":{"type":"bar","tooltip":false},"height":220,"autosize":{"type":"fit"},"data":{"name":"placeholder"},"encoding":{"x":{"field":"carat","type":"quantitative","sort":null,"scale":{"type":"linear","zero":false}},"y":{"field":"price","type":"quantitative","sort":null,"scale":{"type":"linear","zero":true},"bin":false},"color":{"field":"","type":"nominal","sort":null,"scale":{"type":"linear","zero":false}}}}""")

## 4 Train Test Splitting

In [4]:
from sklearn.model_selection import train_test_split
# Select the six predictor columns and the `price` target, then hold out
# 20% of the rows for testing. The fixed random_state keeps the split
# reproducible across runs.
x = df.loc[:, ['carat', 'depth', 'table', 'x', 'y', 'z']]
y = df['price']
train_x, test_x, train_y, test_y = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)

## 5 Model Training

In [5]:
from sklearn import linear_model

# Fit an ordinary least-squares linear regression on the training split.
# The fit call is left as the final expression so the cell still displays
# the fitted estimator.
model = linear_model.LinearRegression()
model.fit(X=train_x, y=train_y)

LinearRegression()

## 6 Model Evaluation

In [6]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Evaluate the fitted model on the held-out test split and collect the
# results in `metrics`, which is written to JSON by the next cell.
y_pred = model.predict(test_x)

# R^2 is computed once and reused for both the plain and adjusted values.
R2 = r2_score(test_y, y_pred)
n = test_x.shape[0]  # number of test samples
p = test_x.shape[1]  # number of predictors: test_x holds only feature columns

metrics = {}
# For a regressor, `score` returns R^2, so this entry is R^2 as a percentage
# rather than a classification accuracy. The key name is kept unchanged so
# existing readers of metrics.json keep working.
metrics['accuracy'] = model.score(test_x, test_y) * 100
metrics['MAE'] = mean_absolute_error(test_y, y_pred)
# Fixed: this entry previously stored the mean absolute error again.
metrics['MSE'] = mean_squared_error(test_y, y_pred)
metrics['R2'] = R2

# Adjusted R^2 penalises R^2 for the number of predictors:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
metrics['Adjusted R2'] = 1 - (1 - R2) * ((n - 1) / (n - p - 1))

## 7 Save Metrics to Google Drive

In [7]:
import json

# Serialise the collected metrics as JSON into the output directory
# (per the section title, a Google Drive location).
gdrive_path = '/datasets/output/'

with open(gdrive_path + 'metrics.json', mode='w') as out_file:
    out_file.write(json.dumps(metrics))

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=11517204-e819-4a62-b6e1-9b8ef71416a5' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>