In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
import pickle
import joblib

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

In [33]:
# Load the labelled diamonds dataset that the model is trained and validated on.
diamonds = pd.read_csv(filepath_or_buffer='./data/diamonds_full.csv')

In [34]:
# Sanity-check the load: list every column with its non-null count and dtype.
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Unnamed: 0  40455 non-null  int64  
 1   index_id    40455 non-null  object 
 2   price       40455 non-null  int64  
 3   carat       40455 non-null  float64
 4   depth       40455 non-null  float64
 5   table       40455 non-null  float64
 6   x           40455 non-null  float64
 7   y           40455 non-null  float64
 8   z           40455 non-null  float64
 9   city        40455 non-null  object 
 10  cut_id      40455 non-null  object 
 11  color_id    40455 non-null  object 
 12  clarity_id  40455 non-null  object 
 13  clarity     40455 non-null  object 
 14  color       40455 non-null  object 
 15  cut         40455 non-null  object 
dtypes: float64(6), int64(2), object(8)
memory usage: 4.9+ MB


In [36]:
# Remove the leftover CSV index and the identifier columns so that only the
# price, the numeric measurements and the categorical columns remain.
diamonds = diamonds.drop(
    columns=['Unnamed: 0', 'index_id', 'cut_id', 'color_id', 'clarity_id'],
)

In [37]:
# Display the frame as it stands after the identifier columns were dropped.
diamonds

Unnamed: 0,price,carat,depth,table,x,y,z,city,clarity,color,cut
0,4268,1.21,62.4,58.0,6.83,6.79,4.25,Dubai,VS2,J,Premium
1,505,0.32,63.0,57.0,4.35,4.38,2.75,Kimberly,VS2,H,Very Good
2,2686,0.71,65.5,55.0,5.62,5.53,3.65,Las Vegas,VS1,G,Fair
3,738,0.41,63.8,56.0,4.68,4.72,3.00,Kimberly,SI1,D,Good
4,4882,1.02,60.5,59.0,6.55,6.51,3.95,Dubai,SI1,G,Ideal
...,...,...,...,...,...,...,...,...,...,...,...
40450,5850,1.08,61.9,54.0,6.64,6.61,4.10,Antwerp,VS2,H,Ideal
40451,6300,1.15,61.8,55.0,6.73,6.76,4.17,Luxembourg,SI1,I,Ideal
40452,1800,0.53,61.4,57.0,5.18,5.20,3.19,Kimberly,VS1,F,Ideal
40453,2368,0.66,60.8,58.0,5.67,5.64,3.44,Amsterdam,VS1,F,Premium


In [38]:
# One-hot encode the categorical columns; every other column passes through unchanged.
diamonds_encoded = pd.get_dummies(
    data=diamonds,
    columns=['city', 'cut', 'clarity', 'color'],
)

In [39]:
# Target vector: the price column.
y = diamonds_encoded['price']
# Feature matrix: every encoded column except the target.
X = diamonds_encoded.drop(columns=['price'])

In [40]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each piece of the split.
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")


X_train: (32364, 39), X_test: (8091, 39), y_train: (32364,), y_test: (8091,)


In [41]:

# Pairwise correlations between all columns of the encoded frame.
correlation_matrix = diamonds_encoded.corr()

# Correlation of every column with the target ('price'), sorted in descending order.
target_correlation = correlation_matrix['price'].sort_values(ascending=False)

# Minimum absolute correlation with the target for a column to be kept.
correlation_threshold = 0.5

# Keep the columns whose absolute correlation with the target exceeds the threshold.
strongly_correlated = target_correlation[target_correlation.abs() > correlation_threshold]
selected_features = list(strongly_correlated.index)

# The target correlates perfectly with itself, so take it out of the feature list.
selected_features.remove('price')

# Restrict the encoded frame to the selected features.
# NOTE(review): df_selected is not referenced by the later cells visible here.
df_selected = diamonds_encoded[selected_features]


In [47]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only and
# then applied to the validation split, so no validation statistics leak in.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the random forest model (fixed seed for reproducible results).
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)

# Predict on the held-out validation split.
y_pred = model.predict(X_test_scaled)

# Compute the RMSE as sqrt(MSE). Taking the root explicitly avoids the
# `squared=False` keyword of mean_squared_error, which was deprecated in
# scikit-learn 1.4 and removed in 1.6; this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)

RMSE: 589.7399644572525




In [50]:
# Load the diamonds file for which prices have to be predicted.
diamonds_test = pd.read_csv(filepath_or_buffer='./data/diamonds_test.csv')

In [51]:
# One-hot encode the same categorical columns that were encoded for training.
diamonds_encoded_test = pd.get_dummies(
    data=diamonds_test,
    columns=['city', 'cut', 'clarity', 'color'],
)

In [52]:
# Drop the identifier, then align the columns with the training matrix. The test
# file is one-hot encoded on its own, so a category missing from (or exclusive to)
# one of the files would otherwise give the scaler and the model a different
# column set or order than they were fitted on. Dummy columns absent from the
# test file are filled with 0; columns unknown to the training matrix are dropped.
X_prueba = (
    diamonds_encoded_test
    .drop('id', axis=1)
    .reindex(columns=X_train.columns, fill_value=0)
)
# Apply the scaler fitted on the training split, then predict the prices.
X_prueba_scaled = scaler.transform(X_prueba)
y_prueba_pred = model.predict(X_prueba_scaled)


In [71]:
# Pair every prediction with the identifier read from the test file itself.
# Numbering the rows by position would only be right if the file's ids happened
# to be 0..n-1 in row order; the predictions keep the row order of diamonds_test,
# so zipping the two sequences matches each id with its own prediction.
solution = list(zip(diamonds_test['id'], y_prueba_pred))
    

In [72]:
# Turn the list of (id, prediction) pairs into a two-column frame.
solution_df = pd.DataFrame(data=solution)

In [73]:
# Name the two columns; these names become the header row of the exported CSV.
solution_df.columns = ['id', 'price']

In [74]:
# Write the predictions to disk, leaving out the frame's own row index.
solution_df.to_csv(path_or_buf='submission.csv', index=False)

In [75]:
# Display the submission frame as a final visual check.
solution_df

Unnamed: 0,id,price
0,0,2906.61
1,1,5401.23
2,2,9879.10
3,3,4064.38
4,4,1706.90
...,...,...
13480,13480,1653.98
13481,13481,2465.49
13482,13482,2845.77
13483,13483,2126.02
