In [None]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training set and the set we have to produce predictions for.
diamonds_df, diamonds_predict = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raws/diamonds_train.csv',
                     '../data/raws/diamonds_predict.csv')
)

In [None]:
diamonds_df.head()

In [None]:
diamonds_predict.head()

In [None]:
diamonds_df.info()

In [None]:
diamonds_df.describe()

In [None]:
diamonds_df.shape

In [None]:
diamonds_predict.shape

## Data cleaning

In [None]:
# A diamond cannot have a zero dimension, so treat 0 in x / y / z as missing.
# The replacement is limited to those columns so a legitimate 0 elsewhere
# (for example an id of 0) does not get its row dropped by the next cell.
dimension_cols = ['x', 'y', 'z']
diamonds_df[dimension_cols] = diamonds_df[dimension_cols].replace(0, np.nan)

In [None]:
diamonds_df.isnull().sum()

In [None]:
# Discard every row that still has a missing value.
diamonds_df = diamonds_df.dropna()

In [None]:
diamonds_df.shape

In [None]:
# Keep only rows whose 'y' is below 50. The explicit copy makes diamonds_df an
# independent frame, so the feature columns added later are not written into a
# slice of the previous frame (which triggers SettingWithCopyWarning).
diamonds_df = diamonds_df[diamonds_df['y'] < 50].copy()

In [None]:
diamonds_df.describe()

In [None]:
diamonds_df.shape

In [None]:
# cut / color / clarity are still strings at this point; recent pandas raises when
# corr() meets non-numeric columns, so restrict the matrix to numeric columns.
diamonds_df_corr = diamonds_df.select_dtypes(include='number').corr()
diamonds_df_corr

In [None]:
# Correlation matrix of the numeric columns only (string columns make corr() fail
# on recent pandas).
corr_price = diamonds_df.select_dtypes(include='number').corr()
plt.figure(figsize=(12, 10))

# Show only the strong correlations (|r| >= 0.5); weaker cells are left blank.
sns.heatmap(corr_price[(corr_price >= 0.5) | (corr_price <= -0.5)],
            cmap='YlGnBu', vmax=1.0, vmin=-1.0, linewidths=0.1,
            annot=True, annot_kws={"size": 8}, square=True);

## Estudio de los diamantes

La relación entre el precio y el peso de los diamantes no es lineal (ver los gráficos del proyecto 2), sino exponencial, por lo que calculamos el logaritmo del carat. El log de la densidad es lo que mejor ajusta en los diamantes pequeños, el log del carat en los medianos, y clarity + color en los de más de 2 carats.

In [None]:
# Add the natural log of carat to both the training and the prediction frame.
for frame in (diamonds_df, diamonds_predict):
    frame['logaritmo carat'] = np.log(frame['carat'])

El ancho y largo de los diamantes influyen positivamente en el precio (correlación de 0.89 entre x, y). Creamos columna para conocer el ratio

In [None]:
# Add the x / y ratio to both the training and the prediction frame.
for frame in (diamonds_df, diamonds_predict):
    frame['x/y'] = frame['x'].div(frame['y'])

x, y, z son dimensiones que hacen referencia al volumen del diamante

In [None]:
# diamante es como un cono
#diamonds_df['volume_abajo'] = (1/3 * math.pi * (diamonds_df['x'] / 2)**2) * diamonds_df['z']
#diamonds_predict['volume_abajo'] = (1/3 * math.pi * (diamonds_predict['x'] / 2)**2) * diamonds_predict['z']

In [None]:
# NOTE: despite its name, the 'volume' column stores log(x * y * z).
for frame in (diamonds_df, diamonds_predict):
    box_volume = frame['x'].mul(frame['y']).mul(frame['z'])
    frame['volume'] = np.log(box_volume)

Calculamos densidad diamante

In [None]:
# Density is weight / volume. The 'volume' column already holds log(x * y * z),
# so log(density) = log(carat) - log(volume). The previous expression took the
# log of (log-volume / carat), which is not a density.
diamonds_df['log densidad'] = np.log(diamonds_df['carat']) - diamonds_df['volume']
diamonds_predict['log densidad'] = np.log(diamonds_predict['carat']) - diamonds_predict['volume']

La forma de los diamantes es importante: el diamante redondo es el más caro porque es el más demandado.

In [None]:
# Classify each training diamond into an approximate shape from its table / depth.
# np.select checks the conditions in order and keeps the first match, exactly like
# the original if / elif chain; rows matching no rule (or with NaN) become 'others'.
train_table = diamonds_df['table']
train_depth = diamonds_df['depth']
train_shape_rules = [
    ('Round',    (train_table > 54) & (train_table < 57) & (train_depth > 61) & (train_depth < 62.5)),
    ('Oval',     (train_table > 52) & (train_table < 60) & (train_depth > 60) & (train_depth < 68)),
    ('Princess', (train_table > 63) & (train_table < 69) & (train_depth > 69) & (train_depth < 76)),
    ('Cushion',  (train_table > 58) & (train_table < 63) & (train_depth > 58) & (train_depth < 66)),
]
forma = np.select(
    [rule.values for _, rule in train_shape_rules],
    [label for label, _ in train_shape_rules],
    default='others',
).tolist()

In [None]:
diamonds_df['forma'] = forma

In [None]:
diamonds_df['forma'].value_counts()

In [None]:
## predict

# Same shape rules as for the training set, applied to the prediction frame.
# np.select keeps the first matching rule (like the if / elif chain it replaces);
# rows matching no rule (or with NaN) become 'others'.
pred_table = diamonds_predict['table']
pred_depth = diamonds_predict['depth']
pred_shape_rules = [
    ('Round',    (pred_table > 54) & (pred_table < 57) & (pred_depth > 61) & (pred_depth < 62.5)),
    ('Oval',     (pred_table > 52) & (pred_table < 60) & (pred_depth > 60) & (pred_depth < 68)),
    ('Princess', (pred_table > 63) & (pred_table < 69) & (pred_depth > 69) & (pred_depth < 76)),
    ('Cushion',  (pred_table > 58) & (pred_table < 63) & (pred_depth > 58) & (pred_depth < 66)),
]
forma = np.select(
    [rule.values for _, rule in pred_shape_rules],
    [label for label, _ in pred_shape_rules],
    default='others',
).tolist()

In [None]:
diamonds_predict['forma'] = forma

In [None]:
diamonds_predict['forma'].value_counts()

Transformamos las variables categóricas. En este caso les asignamos un valor numérico ordinal, porque no todas las variables tienen la misma importancia.

In [None]:
# Ordinal encoding of the quality grades (0 = worst grade of each scale).
# Values missing from a mapping become NaN, so this cell must be run only once.
ordinal_codes = {
    'cut': {'Fair':0,'Good':1,'Very Good':2,'Premium':3, 'Ideal':4},
    'color': {'J':0, 'I':1, 'H':2, 'G':3, 'F': 4, 'E': 5, 'D':6},
    'clarity': {'I1':0,'SI2':1,'SI1':2,'VS2':3,'VS1':4,'VVS2':5,'VVS1':6,'IF':7},
}

# Apply the same encoding to the training and the prediction frame.
for frame in (diamonds_df, diamonds_predict):
    for grade_col, codes in ordinal_codes.items():
        frame[grade_col] = frame[grade_col].map(codes)

In [None]:
# Encoded grade divided by weight, for both the training and the prediction frame.
ratio_columns = (('cut/wt', 'cut'), ('color/wt', 'color'), ('clarity/wt', 'clarity'))
for frame in (diamonds_df, diamonds_predict):
    for ratio_col, grade_col in ratio_columns:
        frame[ratio_col] = frame[grade_col] / frame['carat']
#diamonds_predict = diamonds_predict.drop(['cut','color','clarity','depth'], axis=1)

In [None]:
# Price rank of each shape: 5 = most expensive, 1 = least expensive.
# 'Oval' was encoded as 12, which is outside the 1-5 scale; it takes the free rank 2.
forma_rank = {'Round':5, 'Oval': 2, 'Princess': 4, 'Cushion':3, 'others':1}
diamonds_df['forma']=diamonds_df['forma'].map(forma_rank)
diamonds_predict['forma']=diamonds_predict['forma'].map(forma_rank)

In [None]:
# After encoding, 0 is a legitimate value (cut 'Fair', color 'J', clarity 'I1' all map
# to 0, and log(carat) is 0 for a 1-carat stone). Replacing every 0 with NaN would make
# the following dropna delete all of those diamonds. Only non-finite feature values
# (infinities from the log / ratio columns) are marked as missing.
diamonds_df = diamonds_df.replace([np.inf, -np.inf], np.nan)

In [None]:
diamonds_df.isnull().sum()

In [None]:
# Remove the training rows that contain a missing value.
diamonds_df = diamonds_df.dropna()

In [None]:
# Do not blank out zeros here: 0 is a valid encoded grade ('Fair' / 'J' / 'I1') and a
# valid log-carat (carat == 1). Only non-finite feature values are treated as missing.
diamonds_predict = diamonds_predict.replace([np.inf, -np.inf], np.nan)

# Every id in the prediction set needs a price, so rows must not be lost to dropna:
# fill the remaining gaps with the median of the same column in the training set.
train_medians = (
    diamonds_df.median(numeric_only=True)
    .drop(labels=['id', 'price'], errors='ignore')  # never impute identifiers / target
)
diamonds_predict = diamonds_predict.fillna(train_medians)

In [None]:
diamonds_predict.isnull().sum()

In [None]:
# Remove the prediction rows that contain a missing value.
diamonds_predict = diamonds_predict.dropna()

## Entrenamos al modelo

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import BaggingRegressor


from sklearn.metrics import mean_squared_error,r2_score

from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler


In [None]:
#(diamonds_df.drop(['cut', 'color', 'table', 'clarity'], axis=1)).shape

In [None]:
diamonds_predict.shape

In [None]:
# Features fed to every model below.
columns_df = ['cut', 'color', 'clarity', 'logaritmo carat', 'volume', 'log densidad',
              'x/y']

# Fit the scaler on a plain array: every later sc.transform(...) call in this notebook
# passes `.values`, and fitting on a DataFrame would make scikit-learn warn about
# missing feature names on each of those calls.
sc = StandardScaler()
X = sc.fit_transform(diamonds_df[columns_df].values)
y = diamonds_df['price'].values

In [None]:
# Fixed seed so the hold-out split, and therefore every score below, is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=42)

Excluimos x, y, z debido a su dependencia del quilate; adicionalmente, se excluyen la profundidad y la mesa debido a su muy baja correlación con el precio.

In [None]:
diamonds_df.head()

## Mejores modelos

## GradientBoostingRegressor

In [None]:
# Fit on the training split and predict the hold-out split; seeded for reproducibility.
model = GradientBoostingRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# The value reported is the ROOT mean squared error. It is computed with np.sqrt
# because the `squared` argument is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
R2 = r2_score(y_test, y_pred)
print(f"Root mean squared error: {rmse}")
print(f"R Squared: {R2}")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full scaled matrix; each score is a negated RMSE.
cv_options = dict(scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
scores = cross_val_score(model, X, y, **cv_options)

In [None]:
np.mean(-scores)

In [None]:
# Scale the prediction features with the fitted scaler, predict, and bound the
# prices to [0, 20000] before writing the submission file.
X_test = sc.transform(diamonds_predict[columns_df].to_numpy())
y_hat = np.clip(model.predict(X_test), 0, 20000)
submission = diamonds_predict[['id']].assign(price=y_hat)
submission.to_csv('../data/results_2/GradientBoostingRegressor_no_hiperparametros.csv', index=False)

## BaggingRegressor

In [None]:
# Fit on the training split and predict the hold-out split; bagging resamples the
# data at random, so the seed is fixed for reproducible scores.
model = BaggingRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# The value reported is the ROOT mean squared error. It is computed with np.sqrt
# because the `squared` argument is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
R2 = r2_score(y_test, y_pred)
print(f"Root mean squared error: {rmse}")
print(f"R Squared: {R2}")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full scaled matrix; each score is a negated RMSE.
cv_options = dict(scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
scores = cross_val_score(model, X, y, **cv_options)

In [None]:
np.mean(-scores)

In [None]:
# Scale the prediction features with the fitted scaler, predict, and bound the
# prices to [0, 20000] before writing the submission file.
X_test = sc.transform(diamonds_predict[columns_df].to_numpy())
y_hat = np.clip(model.predict(X_test), 0, 20000)
submission = diamonds_predict[['id']].assign(price=y_hat)
submission.to_csv('../data/results_2/BaggingRegressor_no_hiperparametros.csv', index=False)

## lightgbm

In [None]:
# Fit on the training split and predict the hold-out split; seeded for reproducibility.
model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# The value reported is the ROOT mean squared error. It is computed with np.sqrt
# because the `squared` argument is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
R2 = r2_score(y_test, y_pred)
print(f"Root mean squared error: {rmse}")
print(f"R Squared: {R2}")

In [None]:
# Store the predictions under their own names. Assigning them to y_test / y_train
# destroyed the true labels, so every later metric compared predictions with
# predictions, and every later fit(x_train, y_train) received mismatched lengths.
y_pred_test = model.predict(x_test)
y_pred_train = model.predict(x_train)

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full scaled matrix; each score is a negated RMSE.
cv_options = dict(scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
scores = cross_val_score(model, X, y, **cv_options)

In [None]:
np.mean(-scores)

In [None]:
# Scale the prediction features with the fitted scaler, predict, and bound the
# prices to [0, 20000] before writing the submission file.
X_test = sc.transform(diamonds_predict[columns_df].to_numpy())
y_hat = np.clip(model.predict(X_test), 0, 20000)
submission = diamonds_predict[['id']].assign(price=y_hat)
submission.to_csv('../data/results_2/lightgbm_hiper_densidad_log.csv', index=False)

In [None]:
# Native LightGBM datasets. The validation set is built with reference=d_train so it
# is binned with the same feature discretisation as the training set.
d_train = lgb.Dataset(x_train, label=y_train)
d_valid = lgb.Dataset(x_test, label=y_test, reference=d_train)

In [None]:
# Parameters handed to lgb.train.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metric='rmse',
    max_depth=1,
    verbose=10,
    early_stopping_round=20,
)

In [None]:
# Validation sets and number of boosting rounds passed to lgb.train.
watchlist = [d_valid]
n_estimators = 115

In [None]:
# Base estimator for the randomized search; seeded so repeated runs are comparable.
model = lgb.LGBMRegressor(random_state=42)

In [None]:
# Candidate values sampled by the randomized search over the LightGBM regressor.
param_grid = dict(
    num_leaves=list(range(35, 51, 5)),
    min_data_in_leaf=[10, 15, 20, 30, 40],
    max_depth=list(range(25, 46, 5)),
)

In [None]:
# Randomized (not exhaustive) search over param_grid with 5-fold CV, scored by
# negated RMSE. random_state fixes which parameter combinations get sampled.
grid_search = RandomizedSearchCV(estimator = model,
                        param_distributions = param_grid,
                        cv = 5,
                        n_jobs = -1,
                        scoring = 'neg_root_mean_squared_error',
                        random_state = 42)


In [None]:
grid_search.fit(X, y)

In [None]:
grid_search.best_estimator_

In [None]:
# Predictions for the competition set (X_test); y_pred keeps holding them as before.
y_pred = grid_search.predict(X_test)

# The RMSE must compare y_test with predictions for the SAME rows, i.e. the hold-out
# matrix x_test (lower case), not the competition matrix X_test.
# NOTE(review): the search was fitted on the full X, which includes the hold-out rows,
# so this score is optimistic.
holdout_pred = grid_search.predict(x_test)
mean_squared_error(y_test, holdout_pred)**0.5

In [None]:
# Train with the native API. `verbose_eval` is no longer accepted by lgb.train;
# per-iteration logging is requested through the log_evaluation callback instead.
gbm = lgb.train(params,
                d_train,
                num_boost_round=n_estimators,
                valid_sets=watchlist,
                callbacks=[lgb.log_evaluation(period=1)])

In [None]:
# Scale the competition features with the already-fitted scaler. The previous line
# refitted the scaler and overwrote the hold-out matrix x_test with the full training
# matrix, which breaks every later metric computed against y_test.
X_test = sc.transform(diamonds_predict[columns_df].values)

In [None]:
# Predict the competition set (X_test, upper case) with the tuned model; y_hat is the
# name used for competition predictions in the submission cells of this notebook.
y_hat = grid_search.predict(X_test)

In [None]:
#X_test = sc.transform(diamonds_predict[columns_df].values)
#y_hat = model.predict(X_test).clip(0, 20000)
#submission = pd.DataFrame({'id': diamonds_predict['id'], 'price': y_hat})
#submission.to_csv('../data/results_2/lightgbm_no_hiperparametros_forma_diamantes.csv', index=False)

In [None]:
# Predict the competition set explicitly instead of relying on whatever `y_pred` last
# held (that name is also used for hold-out predictions, which have a different length).
# Prices are bounded to [0, 20000] like in the other submission cells.
price_hat = grid_search.predict(sc.transform(diamonds_predict[columns_df].values)).clip(0, 20000)
output = pd.DataFrame({'id': diamonds_predict['id'], 'price': price_hat})
output.to_csv('../data/results_2/lightgbm_no_hiperparametros_forma_diamantes_final_submit_volumen.csv', index=False)

In [None]:
# Fit on the training split and predict the hold-out split; seeded for reproducibility.
model = lgb.LGBMRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# The value reported is the ROOT mean squared error. It is computed with np.sqrt
# because the `squared` argument is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
R2 = r2_score(y_test, y_pred)
print(f"Root mean squared error: {rmse}")
print(f"R Squared: {R2}")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full scaled matrix; each score is a negated RMSE.
cv_options = dict(scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
scores = cross_val_score(model, X, y, **cv_options)

In [None]:
np.mean(-scores)

In [None]:
# Scale the prediction features with the fitted scaler, predict, and bound the
# prices to [0, 20000] before writing the submission file.
X_test = sc.transform(diamonds_predict[columns_df].to_numpy())
y_hat = np.clip(model.predict(X_test), 0, 20000)
submission = diamonds_predict[['id']].assign(price=y_hat)
submission.to_csv('../data/results_2/lightgbm_no_hiperparametros_forma_diamantes.csv', index=False)

##  AdaBoostRegressor

In [None]:
# Fit on the training split and predict the hold-out split; AdaBoost resamples at
# random, so the seed is fixed for reproducible scores.
model = AdaBoostRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# The value reported is the ROOT mean squared error. It is computed with np.sqrt
# because the `squared` argument is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
R2 = r2_score(y_test, y_pred)
print(f"Root mean squared error: {rmse}")
print(f"R Squared: {R2}")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full scaled matrix; each score is a negated RMSE.
cv_options = dict(scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
scores = cross_val_score(model, X, y, **cv_options)

In [None]:
np.mean(-scores)

In [None]:
# Scale the prediction features with the fitted scaler, predict, and bound the
# prices to [0, 20000] before writing the submission file.
X_test = sc.transform(diamonds_predict[columns_df].to_numpy())
y_hat = np.clip(model.predict(X_test), 0, 20000)
submission = diamonds_predict[['id']].assign(price=y_hat)
submission.to_csv('../data/results_2/AdaBoostRegressor_no_hiperparametros.csv', index=False)

## RandomForestRegressor

In [None]:
# Fit on the training split and predict the hold-out split; the forest bootstraps at
# random, so the seed is fixed for reproducible scores.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# The value reported is the ROOT mean squared error. It is computed with np.sqrt
# because the `squared` argument is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
R2 = r2_score(y_test, y_pred)
print(f"Root mean squared error: {rmse}")
print(f"R Squared: {R2}")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full scaled matrix; each score is a negated RMSE.
cv_options = dict(scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
scores = cross_val_score(model, X, y, **cv_options)

In [None]:
np.mean(-scores)

In [None]:
# Scale the prediction features with the fitted scaler, predict, and bound the
# prices to [0, 20000] before writing the submission file.
X_test = sc.transform(diamonds_predict[columns_df].to_numpy())
y_hat = np.clip(model.predict(X_test), 0, 20000)
submission = diamonds_predict[['id']].assign(price=y_hat)
submission.to_csv('../data/results_2/RandomForestRegressor_no_hiperparametros.csv', index=False)

## xgboost

In [None]:
# Fit on the training split and predict the hold-out split; seeded for reproducibility.
model = xgb.XGBRegressor(n_estimators=100, random_state=42)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [None]:
# The value reported is the ROOT mean squared error. It is computed with np.sqrt
# because the `squared` argument is no longer accepted by recent scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
R2 = r2_score(y_test, y_pred)
print(f"Root mean squared error: {rmse}")
print(f"R Squared: {R2}")

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation on the full scaled matrix; each score is a negated RMSE.
cv_options = dict(scoring='neg_root_mean_squared_error', cv=5, n_jobs=-1)
scores = cross_val_score(model, X, y, **cv_options)

In [None]:
np.mean(-scores)

In [None]:
# Scale the prediction features with the fitted scaler, predict, and bound the
# prices to [0, 20000] before writing the submission file.
X_test = sc.transform(diamonds_predict[columns_df].to_numpy())
y_hat = np.clip(model.predict(X_test), 0, 20000)
submission = diamonds_predict[['id']].assign(price=y_hat)
submission.to_csv('../data/results_2/xgboost_no_hiperparametros_forma_diamantes.csv', index=False)

In [None]:
# Search space for the randomized search over the XGBRegressor fitted in this section.
# Fixes relative to the previous grid:
#   * 'max_depth:' had a stray colon inside the key, so tree depth was never tuned;
#   * 'bagging_fraction' / 'feature_fraction' are LightGBM names; the XGBoost
#     equivalents are 'subsample' / 'colsample_bytree';
#   * 'bagging_frequency' was removed: XGBoost has no such parameter, and the
#     fractional values [0.70, 0.80] are not valid for LightGBM's integer
#     bagging_freq either.
Parameter_grid = {
    'n_estimators': [16, 32, 64, 128, 256, 512],
    'max_depth': [2, 4, 8],
    #'num_leaves': [31, 40, 60],
    'learning_rate': [0.005, 0.1],
    'subsample': [0.70, 0.75],
    'max_bin': [128, 256],  # NOTE(review): only relevant for histogram-based tree methods
    'colsample_bytree': [0.75, 0.8],
}

#'min_data_in_leaf': [15, 20, 25]

In [None]:
# Randomized search (20 sampled combinations, 5-fold CV, negated-RMSE scoring).
# random_state fixes which combinations are sampled so reruns are comparable.
grid_search = RandomizedSearchCV(model,
                                 Parameter_grid,
                                 cv=5,
                                 verbose=5,
                                 scoring='neg_root_mean_squared_error',
                                 n_jobs=-1,
                                 n_iter=20,
                                 random_state=42)

In [None]:
grid_search.fit(X, y)