# Regresión Lineal: Predicción de Precios de Autos

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
import sklearn
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, RobustScaler, StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('carDataset/toyota.csv')
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,GT86,2016,16000,Manual,24089,Petrol,265,36.2,2.0
1,GT86,2017,15995,Manual,18615,Petrol,145,36.2,2.0
2,GT86,2015,13998,Manual,27469,Petrol,265,36.2,2.0
3,GT86,2017,18998,Manual,14736,Petrol,150,36.2,2.0
4,GT86,2017,17498,Manual,36284,Petrol,145,36.2,2.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6738 entries, 0 to 6737
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         6738 non-null   object 
 1   year          6738 non-null   int64  
 2   price         6738 non-null   int64  
 3   transmission  6738 non-null   object 
 4   mileage       6738 non-null   int64  
 5   fuelType      6738 non-null   object 
 6   tax           6738 non-null   int64  
 7   mpg           6738 non-null   float64
 8   engineSize    6738 non-null   float64
dtypes: float64(2), int64(4), object(3)
memory usage: 473.9+ KB


In [4]:
categ = ['model','year','transmission','fuelType','engineSize']
str_categ = ['model','transmission','fuelType']
numer = ['price','mileage','tax','mpg']

## Codificación One-Hot

Una de las formas de codificar categorías no numéricas se conoce como _one-hot encoding_, en donde se crea una columna para cada valor distinto que exista en la característica que estamos codificando y, para cada registro, marcar con un 1 la columna a la que pertenezca dicho registro y dejar las demás con 0.

In [153]:
data_ohe = pd.get_dummies(df)
# scl = RobustScaler()
scl = MinMaxScaler()
data_scl_ohe = scl.fit_transform(data_ohe)
data_scl_ohe = pd.DataFrame(data_scl_ohe, columns = data_ohe.columns)
data_scl_ohe.head()

Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_ Auris,model_ Avensis,model_ Aygo,model_ C-HR,...,model_ Verso-S,model_ Yaris,transmission_Automatic,transmission_Manual,transmission_Other,transmission_Semi-Auto,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,0.818182,0.25615,0.1381,0.469027,0.143842,0.444444,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.863636,0.256066,0.106716,0.256637,0.143842,0.444444,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.772727,0.222301,0.157479,0.469027,0.143842,0.444444,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.863636,0.306839,0.084476,0.265487,0.143842,0.444444,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.863636,0.281478,0.208019,0.256637,0.143842,0.444444,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0


### Separación de Datos

In [154]:
X = data_scl_ohe.drop(['price'], axis=1)
y = data_scl_ohe['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Parte 1

A continuación se realizó el ajuste de los datos para un modelo de regresión lineal definido mediante la librería SciKit-Learn. Se observan dos métricas de precisión, en este caso Error Cuadrático Medio (RMSE) y $R^2$.

In [155]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

model = LinearRegression()

model.fit(X_train, y_train)
preds = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
rs = model.score(X_test, y_test)
print(f'RMSE: {round(rmse,3)}')
print(f'R^2: {round(rs,3)}\n')

RMSE: 0.03
R^2: 0.927



## Parte 2

Ahora usted debe desarrollar su propia implementación del método Descenso de Gradiente estocástico.

In [156]:
X_train_array = np.c_[np.ones(X_train.values.shape[0]), X_train.values]
y_train_array = y_train.values

#Inicialización de w
w0 = np.random.rand(X_train_array.shape[1])
w = w0.copy()
#Inicialización de criterio de parada (al disminuir el valor de eps, se aumenta la precisión y el tiempo de procesamiento)
eps = 1e-5
diff = 100000
error = 10000
#Inicialización de valor de la tasa de aprendizaje 
tasa = 0.03

#Ciclo iterativo según algoritmo de descenso de gradiente
while error>eps:
    i = np.random.randint(X_train_array.shape[0])
    xActual = X_train_array[i]
    yActual = y_train_array[i]
    
    pred = np.dot(xActual, w)
    e = pred - yActual
    w = w0 - (tasa * np.dot(xActual.T, e))
    error = np.linalg.norm(e)
    w0 = w.copy()
print(f'Error {round(np.linalg.norm(e),3)}')

Error 0.0


In [157]:
y_preds = np.zeros(y_train_array.shape)
for i in range(X_train_array.shape[0]):
    y_preds[i] = np.dot(w,X_train_array[i])

In [158]:
r2 = r2_score(y_train_array, y_preds)
r2

0.7068400212087667

# -------------------------------------------------------------------------------------------------------------

### Codificación Ordinal para Categorías No-Numéricas

Cada clase a continuación se codificó a partir de un orden, partiendo desde 0 hasta el número total de clases menos 1

In [5]:
for str_cat in str_categ:
    print(f'{str_cat}:')
    for class_element in df[str_cat].unique():
        print(f'- {class_element}')

model:
-  GT86
-  Corolla
-  RAV4
-  Yaris
-  Auris
-  Aygo
-  C-HR
-  Prius
-  Avensis
-  Verso
-  Hilux
-  PROACE VERSO
-  Land Cruiser
-  Supra
-  Camry
-  Verso-S
-  IQ
-  Urban Cruiser
transmission:
- Manual
- Automatic
- Semi-Auto
- Other
fuelType:
- Petrol
- Other
- Hybrid
- Diesel


In [83]:
oe = OrdinalEncoder()
df_oe = df.copy()
df_oe[str_categ] = oe.fit_transform(df_oe[str_categ])

In [84]:
x = df_oe.values #returns a numpy array
min_max_scaler = MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df_scaled = pd.DataFrame(df_oe)
df_scaled.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,6.0,2016,16000,1.0,24089,3.0,265,36.2,2.0
1,6.0,2017,15995,1.0,18615,3.0,145,36.2,2.0
2,6.0,2015,13998,1.0,27469,3.0,265,36.2,2.0
3,6.0,2017,18998,1.0,14736,3.0,150,36.2,2.0
4,6.0,2017,17498,1.0,36284,3.0,145,36.2,2.0


In [85]:
df_oe.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,6.0,2016,16000,1.0,24089,3.0,265,36.2,2.0
1,6.0,2017,15995,1.0,18615,3.0,145,36.2,2.0
2,6.0,2015,13998,1.0,27469,3.0,265,36.2,2.0
3,6.0,2017,18998,1.0,14736,3.0,150,36.2,2.0
4,6.0,2017,17498,1.0,36284,3.0,145,36.2,2.0


### Separación de Datos

In [86]:
X = df_oe.drop(['price'], axis=1)
y = df_oe['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)