In [1]:
#import

import duckdb
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

pd.set_option('display.max_columns', 16)

# query

In [2]:
con = duckdb.connect("./data/diamonds_train.db",read_only=True)
con

<duckdb.duckdb.DuckDBPyConnection at 0x7fb72c27b8f0>

In [3]:
con = duckdb.connect(database="./data/diamonds_train.db", read_only=True)
query = "SHOW TABLES"
tables = con.execute(query).fetchall()

for table in tables:
    print(table)


('diamonds_city',)
('diamonds_clarity',)
('diamonds_color',)
('diamonds_cut',)
('diamonds_dimensions',)
('diamonds_properties',)
('diamonds_transactional',)


In [4]:
import duckdb
import pandas as pd

con = duckdb.connect(database="./data/diamonds_train.db", read_only=True)

# Realizar uniones utilizando SQL en DuckDB
query = """
SELECT dt.index_id, dt.price, dt.carat, dd.depth, dd.table, dd.x, dd.y, dd.z, 
       dc.city, dp.cut_id, dp.color_id, dp.clarity_id,
       dcl.clarity, dco.color, dcut.cut
FROM diamonds_transactional dt
INNER JOIN diamonds_dimensions dd ON dt.index_id = dd.index_id
INNER JOIN diamonds_city dc ON dt.city_id = dc.city_id
INNER JOIN diamonds_properties dp ON dt.index_id = dp.index_id
INNER JOIN diamonds_clarity dcl ON dp.clarity_id = dcl.clarity_id
INNER JOIN diamonds_color dco ON dp.color_id = dco.color_id
INNER JOIN diamonds_cut dcut ON dp.cut_id = dcut.cut_id

"""

df = con.execute(query).df()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index_id    40455 non-null  object 
 1   price       40455 non-null  int64  
 2   carat       40455 non-null  float64
 3   depth       40455 non-null  float64
 4   table       40455 non-null  float64
 5   x           40455 non-null  float64
 6   y           40455 non-null  float64
 7   z           40455 non-null  float64
 8   city        40455 non-null  object 
 9   cut_id      40455 non-null  object 
 10  color_id    40455 non-null  object 
 11  clarity_id  40455 non-null  object 
 12  clarity     40455 non-null  object 
 13  color       40455 non-null  object 
 14  cut         40455 non-null  object 
dtypes: float64(6), int64(1), object(8)
memory usage: 4.6+ MB


# check df

In [5]:
# COMPROBACION DE NULOS

for columna in df.columns:
    nulos = df[columna].isnull().sum()
    unicos = df[columna].nunique()
    print(f"Columna {columna}: {nulos} nulos, {unicos} valores únicos")


Columna index_id: 0 nulos, 40455 valores únicos
Columna price: 0 nulos, 10378 valores únicos
Columna carat: 0 nulos, 270 valores únicos
Columna depth: 0 nulos, 175 valores únicos
Columna table: 0 nulos, 121 valores únicos
Columna x: 0 nulos, 543 valores únicos
Columna y: 0 nulos, 539 valores únicos
Columna z: 0 nulos, 363 valores únicos
Columna city: 0 nulos, 13 valores únicos
Columna cut_id: 0 nulos, 5 valores únicos
Columna color_id: 0 nulos, 7 valores únicos
Columna clarity_id: 0 nulos, 8 valores únicos
Columna clarity: 0 nulos, 8 valores únicos
Columna color: 0 nulos, 7 valores únicos
Columna cut: 0 nulos, 5 valores únicos


In [6]:
df.to_csv('./data/diamonds_full.csv')

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   index_id    40455 non-null  object 
 1   price       40455 non-null  int64  
 2   carat       40455 non-null  float64
 3   depth       40455 non-null  float64
 4   table       40455 non-null  float64
 5   x           40455 non-null  float64
 6   y           40455 non-null  float64
 7   z           40455 non-null  float64
 8   city        40455 non-null  object 
 9   cut_id      40455 non-null  object 
 10  color_id    40455 non-null  object 
 11  clarity_id  40455 non-null  object 
 12  clarity     40455 non-null  object 
 13  color       40455 non-null  object 
 14  cut         40455 non-null  object 
dtypes: float64(6), int64(1), object(8)
memory usage: 4.6+ MB


In [8]:
#float columns
df.describe()

Unnamed: 0,price,carat,depth,table,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,3928.444469,0.797706,61.752841,57.446133,5.729392,5.732819,3.537154
std,3992.416147,0.475544,1.431725,2.233535,1.124453,1.14665,0.697062
min,326.0,0.2,43.0,43.0,0.0,0.0,0.0
25%,945.0,0.4,61.0,56.0,4.71,4.72,2.91
50%,2397.0,0.7,61.8,57.0,5.69,5.71,3.52
75%,5331.0,1.04,62.5,59.0,6.54,6.54,4.035
max,18823.0,4.5,79.0,95.0,10.23,58.9,8.06


In [9]:
df_float = df[["price","carat","depth","table","x","y","z"]]
df_float.corr()

Unnamed: 0,price,carat,depth,table,x,y,z
price,1.0,0.921935,-0.014864,0.130111,0.885848,0.866163,0.8745
carat,0.921935,1.0,0.026528,0.183392,0.975688,0.951667,0.96757
depth,-0.014864,0.026528,1.0,-0.293114,-0.026348,-0.030966,0.094655
table,0.130111,0.183392,-0.293114,1.0,0.196059,0.184673,0.155189
x,0.885848,0.975688,-0.026348,0.196059,1.0,0.973712,0.984876
y,0.866163,0.951667,-0.030966,0.184673,0.973712,1.0,0.964828
z,0.8745,0.96757,0.094655,0.155189,0.984876,0.964828,1.0


In [10]:
# FEATURES + TARGET

X = df_float[["carat","depth","table","x","y","z"]]
y = pd.get_dummies(df_float[["price"]], drop_first=True)
print(X.shape,y.shape)

(40455, 6) (40455, 1)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Scaling - RobustScaler
scaler = RobustScaler()
scaled_data_rs = scaler.fit_transform(X_train)
scaled_data_rs
scaled_data_test = scaler.transform(X_test)
scaled_data_test

array([[ 1.3125    ,  0.13333333, -0.33333333,  0.99453552,  0.93406593,
         0.97345133],
       [-0.609375  ,  0.86666667,  0.        , -0.75956284, -0.75824176,
        -0.7079646 ],
       [-0.296875  ,  0.        , -0.33333333, -0.32240437, -0.3021978 ,
        -0.30973451],
       ...,
       [ 0.5       , -0.4       ,  0.33333333,  0.44262295,  0.42307692,
         0.39823009],
       [ 0.78125   ,  1.06666667, -0.66666667,  0.56284153,  0.52197802,
         0.63716814],
       [ 0.8125    , -1.33333333,  0.66666667,  0.69398907,  0.67032967,
         0.55752212]])

In [13]:
regressor =  LinearRegression()

In [14]:
hyperparameters = regressor.get_params()
model = regressor.fit(X_train, y_train)

In [15]:
diamond = pd.read_csv('./data/diamonds_test.csv')
diamond

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [16]:
diamond_test = diamond[["carat","depth","table","x","y","z"]]
diamond_test

Unnamed: 0,carat,depth,table,x,y,z
0,0.79,62.7,60.0,5.82,5.89,3.67
1,1.20,61.0,57.0,6.81,6.89,4.18
2,1.57,62.2,61.0,7.38,7.32,4.57
3,0.90,63.8,54.0,6.09,6.13,3.90
4,0.50,62.9,58.0,5.05,5.09,3.19
...,...,...,...,...,...,...
13480,0.57,61.9,56.0,5.35,5.32,3.30
13481,0.71,62.2,55.0,5.71,5.73,3.56
13482,0.70,61.6,55.0,5.75,5.71,3.53
13483,0.70,58.8,57.0,5.85,5.89,3.45


In [17]:
X_test.columns == diamond_test.columns

array([ True,  True,  True,  True,  True,  True])

In [18]:
y_pred = model.predict(X_test)

print('Model:', regressor, '\n')
print('Model hyperparameters:', hyperparameters, '\n')
#print('Ground truth target:', y_test, '\n')
print('Predicted target:', y_pred, '\n')

Model: LinearRegression() 

Model hyperparameters: {'copy_X': True, 'fit_intercept': True, 'n_jobs': None, 'positive': False} 

Predicted target: [[9765.15479122]
 [ 238.67058371]
 [1772.76964233]
 ...
 [5417.91439864]
 [6909.31389084]
 [7184.85810982]] 



In [19]:
y_pred.shape

(8091, 1)

In [20]:
r2_ = model.score(X_test, y_test)
r2_

0.8587524706860848

In [21]:
v = pd.DataFrame(y_pred)
v.columns
type(v)
len(v)

8091

In [22]:
y_pred

array([[9765.15479122],
       [ 238.67058371],
       [1772.76964233],
       ...,
       [5417.91439864],
       [6909.31389084],
       [7184.85810982]])

In [23]:
len(diamond_test)

13485

In [24]:
y_hat = model.predict(diamond_test)

In [25]:
y_hat.shape

(13485, 1)

In [26]:
y_hat

array([[3279.16482375],
       [7114.66566561],
       [9696.89320755],
       ...,
       [3140.84579915],
       [3415.40673269],
       [1062.45940084]])

In [27]:
type(y_hat)

numpy.ndarray

In [28]:
df_pred = pd.DataFrame(data=y_hat)
df_pred

Unnamed: 0,0
0,3279.164824
1,7114.665666
2,9696.893208
3,4500.720630
4,1286.615455
...,...
13480,2078.159382
13481,3173.815589
13482,3140.845799
13483,3415.406733


In [29]:

df_rename = df_pred.rename(columns={"index": "id", "o": "price"})
df_rename

Unnamed: 0,0
0,3279.164824
1,7114.665666
2,9696.893208
3,4500.720630
4,1286.615455
...,...
13480,2078.159382
13481,3173.815589
13482,3140.845799
13483,3415.406733


In [30]:
df_pred = df_pred.reset_index()
df_pred

Unnamed: 0,index,0
0,0,3279.164824
1,1,7114.665666
2,2,9696.893208
3,3,4500.720630
4,4,1286.615455
...,...,...
13480,13480,2078.159382
13481,13481,3173.815589
13482,13482,3140.845799
13483,13483,3415.406733


In [31]:
type(df_pred)

pandas.core.frame.DataFrame

In [32]:

df_rename = df_pred.rename(columns={"index": "id", "o": "price"})
df_rename

Unnamed: 0,id,0
0,0,3279.164824
1,1,7114.665666
2,2,9696.893208
3,3,4500.720630
4,4,1286.615455
...,...,...
13480,13480,2078.159382
13481,13481,3173.815589
13482,13482,3140.845799
13483,13483,3415.406733


In [33]:
final_prediction = df_rename.rename(columns={df_rename.columns[1]: "price" })
final_prediction

Unnamed: 0,id,price
0,0,3279.164824
1,1,7114.665666
2,2,9696.893208
3,3,4500.720630
4,4,1286.615455
...,...,...
13480,13480,2078.159382
13481,13481,3173.815589
13482,13482,3140.845799
13483,13483,3415.406733


In [34]:
submission = final_prediction.to_csv('./data/submission.csv', index=False)

In [35]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
rmse




1497.8078842384916