In [76]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error

In [11]:
diamantes = pd.read_csv('/Users/anadeondarza/Desktop/ironhack_data/Proyecto_Modulo_3_Anadeondarza/Diamon/diamantes.csv')
diamantes = diamantes.drop('Unnamed: 0', axis=1)
diamantes

Unnamed: 0,city,price,carat,clarity,color,cut,depth,table,x,y,z
0,Amsterdam,6216,1.00,VS2,G,Premium,61.6,54.0,6.49,6.46,3.99
1,Amsterdam,675,0.30,VS2,G,Premium,60.6,60.0,4.34,4.30,2.62
2,Amsterdam,1841,0.62,VS2,G,Premium,61.1,59.0,5.56,5.50,3.38
3,Amsterdam,720,0.32,VS2,G,Premium,60.2,57.0,4.45,4.42,2.67
4,Amsterdam,8874,1.39,VS2,G,Premium,62.0,56.0,7.18,7.14,4.44
...,...,...,...,...,...,...,...,...,...,...,...
40450,Antwerp,2211,0.47,IF,D,Fair,60.6,60.0,5.09,4.98,3.05
40451,Zurich,5460,1.50,I1,D,Fair,64.7,62.0,7.19,7.04,4.60
40452,Kimberly,2491,0.91,I1,D,Fair,66.2,57.0,6.00,5.94,3.95
40453,Surat,15964,3.40,I1,D,Fair,66.8,52.0,9.42,9.34,6.27


In [13]:
diamantes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40455 entries, 0 to 40454
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   city     40455 non-null  object 
 1   price    40455 non-null  int64  
 2   carat    40455 non-null  float64
 3   clarity  40455 non-null  object 
 4   color    40455 non-null  object 
 5   cut      40455 non-null  object 
 6   depth    40455 non-null  float64
 7   table    40455 non-null  float64
 8   x        40455 non-null  float64
 9   y        40455 non-null  float64
 10  z        40455 non-null  float64
dtypes: float64(6), int64(1), object(4)
memory usage: 3.4+ MB


In [14]:
diamantes.describe()

Unnamed: 0,price,carat,depth,table,x,y,z
count,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0,40455.0
mean,3928.444469,0.797706,61.752841,57.446133,5.729392,5.732819,3.537154
std,3992.416147,0.475544,1.431725,2.233535,1.124453,1.14665,0.697062
min,326.0,0.2,43.0,43.0,0.0,0.0,0.0
25%,945.0,0.4,61.0,56.0,4.71,4.72,2.91
50%,2397.0,0.7,61.8,57.0,5.69,5.71,3.52
75%,5331.0,1.04,62.5,59.0,6.54,6.54,4.035
max,18823.0,4.5,79.0,95.0,10.23,58.9,8.06


In [16]:
diamantes.columns

Index(['city', 'price', 'carat', 'clarity', 'color', 'cut', 'depth', 'table',
       'x', 'y', 'z'],
      dtype='object')

In [22]:
diamantes.rename(columns={'price':'target'},
               inplace=True)

In [24]:
diamantes

Unnamed: 0,city,target,carat,clarity,color,cut,depth,table,x,y,z
0,Amsterdam,6216,1.00,VS2,G,Premium,61.6,54.0,6.49,6.46,3.99
1,Amsterdam,675,0.30,VS2,G,Premium,60.6,60.0,4.34,4.30,2.62
2,Amsterdam,1841,0.62,VS2,G,Premium,61.1,59.0,5.56,5.50,3.38
3,Amsterdam,720,0.32,VS2,G,Premium,60.2,57.0,4.45,4.42,2.67
4,Amsterdam,8874,1.39,VS2,G,Premium,62.0,56.0,7.18,7.14,4.44
...,...,...,...,...,...,...,...,...,...,...,...
40450,Antwerp,2211,0.47,IF,D,Fair,60.6,60.0,5.09,4.98,3.05
40451,Zurich,5460,1.50,I1,D,Fair,64.7,62.0,7.19,7.04,4.60
40452,Kimberly,2491,0.91,I1,D,Fair,66.2,57.0,6.00,5.94,3.95
40453,Surat,15964,3.40,I1,D,Fair,66.8,52.0,9.42,9.34,6.27


In [23]:
y = diamantes.target
y

0         6216
1          675
2         1841
3          720
4         8874
         ...  
40450     2211
40451     5460
40452     2491
40453    15964
40454     5617
Name: target, Length: 40455, dtype: int64

In [25]:
features = ['city', 'carat', 'clarity', 'color', 'cut', 'depth', 'table','x', 'y', 'z']

In [30]:
cols = ['city', 'target', 'carat', 'clarity', 'color', 'cut', 'depth', 'table','x', 'y', 'z']

cat_list = []
for col in cols:
    cat = diamantes[col].unique()
    cat_num = len(cat)
    cat_dict = {"categorical_variable":col,
                "number_of_possible_values":cat_num,
                "values":cat}
    cat_list.append(cat_dict)
    
categories = pd.DataFrame(cat_list).sort_values(by="number_of_possible_values",
                                                ascending=False).reset_index(drop=True)
categories

Unnamed: 0,categorical_variable,number_of_possible_values,values
0,target,10378,"[6216, 675, 1841, 720, 8874, 847, 561, 579, 76..."
1,x,543,"[6.49, 4.34, 5.56, 4.45, 7.18, 4.83, 4.41, 6.7..."
2,y,539,"[6.46, 4.3, 5.5, 4.42, 7.14, 4.89, 4.44, 6.81,..."
3,z,363,"[3.99, 2.62, 3.38, 2.67, 4.44, 2.89, 2.77, 4.1..."
4,carat,270,"[1.0, 0.3, 0.62, 0.32, 1.39, 0.42, 0.33, 1.2, ..."
5,depth,175,"[61.6, 60.6, 61.1, 60.2, 62.0, 59.5, 60.5, 62...."
6,table,121,"[54.0, 60.0, 59.0, 57.0, 56.0, 58.0, 62.0, 61...."
7,city,13,"[Amsterdam, Zurich, Las Vegas, New York City, ..."
8,clarity,8,"[VS2, SI2, VS1, SI1, VVS1, VVS2, IF, I1]"
9,color,7,"[G, H, I, E, F, J, D]"


In [31]:
diamantes.isnull().sum()

city       0
target     0
carat      0
clarity    0
color      0
cut        0
depth      0
table      0
x          0
y          0
z          0
dtype: int64

In [32]:
def missing_percentage(df):
    percent_missing = df.isnull().sum() * 100 / len(df)
    missing_values_df = pd.DataFrame({'column_name': df.columns,'percent_missing': percent_missing})
    return missing_values_df

In [33]:
missing_percentage(diamantes)

Unnamed: 0,column_name,percent_missing
city,city,0.0
target,target,0.0
carat,carat,0.0
clarity,clarity,0.0
color,color,0.0
cut,cut,0.0
depth,depth,0.0
table,table,0.0
x,x,0.0
y,y,0.0


Label Encoder

In [39]:
label_encoder = LabelEncoder()

In [44]:
for x in diamantes.columns:
    diamantes[x] = label_encoder.fit_transform(diamantes[x])
    
diamantes

Unnamed: 0,city,target,carat,clarity,color,cut,depth,table,x,y,z
0,0,5281,80,5,3,3,78,19,271,271,178
1,0,322,10,5,3,3,68,79,56,55,41
2,0,1380,42,5,3,3,73,69,178,175,117
3,0,367,12,5,3,3,64,49,67,67,46
4,0,6975,119,5,3,3,82,39,340,339,223
...,...,...,...,...,...,...,...,...,...,...,...
40450,1,1740,27,1,0,0,68,79,131,123,84
40451,12,4672,130,0,0,0,109,97,341,329,239
40452,3,2009,71,0,0,0,124,49,222,219,174
40453,10,9665,263,0,0,0,130,6,531,526,358


In [55]:
X = diamantes[features]
X

Unnamed: 0,city,carat,clarity,color,cut,depth,table,x,y,z
0,0,80,5,3,3,78,19,271,271,178
1,0,10,5,3,3,68,79,56,55,41
2,0,42,5,3,3,73,69,178,175,117
3,0,12,5,3,3,64,49,67,67,46
4,0,119,5,3,3,82,39,340,339,223
...,...,...,...,...,...,...,...,...,...,...
40450,1,27,1,0,0,68,79,131,123,84
40451,12,130,0,0,0,109,97,341,329,239
40452,3,71,0,0,0,124,49,222,219,174
40453,10,263,0,0,0,130,6,531,526,358


In [56]:
diamantes['city'].unique()

array([ 0, 12,  4,  8,  9, 11,  2,  3, 10,  1,  7,  5,  6])

In [57]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"X_train: {X_train.shape}, X_test: {X_test.shape}, y_train: {y_train.shape}, y_test: {y_test.shape}")

X_train: (32364, 10), X_test: (8091, 10), y_train: (32364,), y_test: (8091,)


In [58]:
regressor = LinearRegression()
#regressor = RandomForestRegressor()
hyperparameters = regressor.get_params()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

In [84]:
len(y_pred)

8091

In [85]:
len(y_test)

8091

In [60]:
y_pred

array([-820.73225588, 7048.51881255, 9603.93564617, ..., -476.45970594,
       4877.59204571, 2544.87551451])

In [87]:
rmse = mean_squared_error(y_pred, y_test, squared=False)
rmse

1318.3893362625795

In [62]:
diamantes_predicts = pd.read_csv('/Users/anadeondarza/Desktop/ironhack_data/Proyecto_Modulo_3_Anadeondarza/Diamon/diamonds_test.csv')
diamantes_predicts

Unnamed: 0,id,carat,cut,color,clarity,depth,table,x,y,z,city
0,0,0.79,Very Good,F,SI1,62.7,60.0,5.82,5.89,3.67,Amsterdam
1,1,1.20,Ideal,J,VS1,61.0,57.0,6.81,6.89,4.18,Surat
2,2,1.57,Premium,H,SI1,62.2,61.0,7.38,7.32,4.57,Kimberly
3,3,0.90,Very Good,F,SI1,63.8,54.0,6.09,6.13,3.90,Kimberly
4,4,0.50,Very Good,F,VS1,62.9,58.0,5.05,5.09,3.19,Amsterdam
...,...,...,...,...,...,...,...,...,...,...,...
13480,13480,0.57,Ideal,E,SI1,61.9,56.0,5.35,5.32,3.30,Amsterdam
13481,13481,0.71,Ideal,I,VS2,62.2,55.0,5.71,5.73,3.56,New York City
13482,13482,0.70,Ideal,F,VS1,61.6,55.0,5.75,5.71,3.53,Tel Aviv
13483,13483,0.70,Very Good,F,SI2,58.8,57.0,5.85,5.89,3.45,Surat


In [63]:
diamantes_predicts.columns

Index(['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y',
       'z', 'city'],
      dtype='object')

In [69]:
diamantes_predicts.isnull().sum()

id         0
carat      0
cut        0
color      0
clarity    0
depth      0
table      0
x          0
y          0
z          0
city       0
dtype: int64

In [70]:
diamantes_predicts = diamantes_predicts.drop('id', axis=1)

In [71]:
for x in diamantes_predicts.columns:
    diamantes_predicts[x] = label_encoder.fit_transform(diamantes_predicts[x])
    
diamantes_predicts

Unnamed: 0,carat,cut,color,clarity,depth,table,x,y,z,city
0,57,4,2,2,78,65,202,207,138,0
1,98,2,6,4,61,41,301,307,189,10
2,135,3,4,2,73,74,358,350,228,3
3,68,4,2,2,89,12,229,231,161,3
4,28,4,2,4,80,51,125,127,90,0
...,...,...,...,...,...,...,...,...,...,...
13480,35,2,1,2,70,32,155,150,101,0
13481,49,2,5,5,73,22,191,191,127,8
13482,48,2,2,4,67,22,195,189,124,11
13483,48,4,2,3,39,41,205,207,116,10


In [88]:
final = regressor.predict(diamantes_predicts)
final

Feature names must be in the same order as they were in fit.



array([   -5.90916483, -3248.10846677, -3766.76478586, ...,
        -687.91826182, -2910.61268305,  2533.22940095])

In [91]:
predicciones = pd.DataFrame(final).rename(columns = {0:'price'})
predicciones.index.names = ['id']
predicciones 

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
0,-5.909165
1,-3248.108467
2,-3766.764786
3,808.986045
4,1838.038399
...,...
13480,741.528852
13481,324.939655
13482,-687.918262
13483,-2910.612683


In [None]:
predicciones 

In [92]:
    predicciones = predicciones.to_csv('predicciones .csv')