# Regresión Lineal

## 1. Importación de Módulos

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

## 2. Lectura del dataset

In [2]:
# Load the car details CSV into a DataFrame (path is relative to the working directory).
data = pd.read_csv(filepath_or_buffer='Anexo2-Dataset_Vehicle/car_details_v4.csv')

In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
valores_nan_por_columna = data.isna().sum()

# Show the per-column missing-value counts.
print(valores_nan_por_columna)

### Limpieza de datos

In [4]:
# Discard every row that has at least one missing value.
data = data.dropna(axis=0, how='any')

In [5]:
# Conversion de valores str a valores numericos
# Convert the textual spec columns to floats.
# 'Engine': strip the ' cc' suffix. regex=False makes this a literal
# replacement regardless of the pandas version's default.
data['Engine'] = data['Engine'].str.replace(' cc', '', regex=False).astype(float)

# 'Max Power' / 'Max Torque': keep only the first number found in the string.
# expand=False returns a Series (not a one-column DataFrame) for assignment.
data['Max Power'] = data['Max Power'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
data['Max Torque'] = data['Max Torque'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)

# str.extract yields NaN for strings that contain no number; drop those rows
# so no NaN reaches the regression, which does not accept missing values.
data = data.dropna(subset=['Engine', 'Max Power', 'Max Torque'])

## 3. Seleccionar las variables dependiente e independiente

In [6]:
# Dependent variable: the price column.
y = data['Price']

# Independent variables: the columns used as predictors.
columnas_predictoras = [
    'Year',
    'Kilometer',
    'Fuel Type',
    'Transmission',
    'Engine',
    'Max Power',
    'Max Torque',
    'Seating Capacity',
    'Fuel Tank Capacity',
    'Length',
    'Width',
    'Height',
]
X = data[columnas_predictoras]

## 4. Aplicamos One-Hot Encoding

In [7]:
# Turn the categorical predictors into indicator columns (one-hot encoding).
# drop_first=True omits the first level of each variable, which then acts as
# the implicit baseline category.
columnas_categoricas = ['Fuel Type', 'Transmission']
X = pd.get_dummies(data=X, columns=columnas_categoricas, drop_first=True)

## 5. Dividir el dataset en conjunto de entrenamiento y de pruebas

In [8]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 6. Crear y entrenar el modelo de Regresión Lineal

In [None]:
# Instantiate a linear regression estimator with its default settings.
modelo = LinearRegression()

# Fit the estimator on the training split.
modelo.fit(X=X_train, y=y_train)

## Hacer Predicciones

In [10]:
# Predict prices for the held-out test rows.
y_pred = modelo.predict(X=X_test)

## Evaluar el modelo

In [None]:
# Score the test-set predictions: coefficient of determination and mean squared error.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

# Report both metrics.
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

In [None]:
# Print the shape of the feature matrix (rows, columns) and of the target vector.
for etiqueta, objeto in (("Dimensiones de X:", X), ("Dimensiones de y:", y)):
    print(etiqueta, objeto.shape)

## Visualización de resultados

In [None]:
# Scatter plot of price against model year; alpha=0.5 keeps overlapping
# points distinguishable.
plt.scatter(X['Year'], y, alpha=0.5)
plt.xlabel('Year')
plt.ylabel('Price')
# The title now names the variables actually plotted (it previously said 'Max Power').
plt.title('Price vs Year')
plt.grid(True)
plt.show()

In [None]:
# Scatter plot of predicted prices against the actual test-set prices.
plt.scatter(y_test, y_pred)
plt.xlabel('Valores Reales')
plt.ylabel('Predicciones')
plt.title('Valores Reales vs Predicciones')

# Reference diagonal (prediction == actual). The limits are computed once with
# the vectorised Series methods instead of iterating y_test four times.
limites = [y_test.min(), y_test.max()]
plt.plot(limites, limites, color='red')
plt.show()

## Predicción con un nuevo registro

In [15]:
# Raw description of a single vehicle, with the text columns written in the
# same string formats that the cleaning step parses.
new_data = {
    'Year': [2022],
    'Kilometer': [15000],
    'Fuel Type': ['Petrol'],
    'Transmission': ['Manual'],
    'Engine': ['1995 cc'],
    'Max Power': ['150 bhp @ 6000 rpm'],
    'Max Torque': ['200 Nm @ 4000 rpm'],
    'Seating Capacity': [5],
    'Fuel Tank Capacity': [50],
    'Length': [4500],
    'Width': [1800],
    'Height': [1400],
}

new_df = pd.DataFrame(new_data)

# Apply the same text-to-number conversions used on the training data.
# 'Engine' must be converted too: left as a string it would be one-hot encoded
# below and the model's numeric 'Engine' feature would end up filled with 0.
new_df['Engine'] = new_df['Engine'].str.replace(' cc', '', regex=False).astype(float)
new_df['Max Power'] = new_df['Max Power'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)
new_df['Max Torque'] = new_df['Max Torque'].str.extract(r'(\d+\.?\d*)', expand=False).astype(float)

# One-hot encode only the categorical columns and do NOT use drop_first here.
# With a single row each variable has exactly one level, so drop_first would
# remove it and the record would silently be scored as the baseline category
# instead of Petrol / Manual.
new_df = pd.get_dummies(new_df, columns=['Fuel Type', 'Transmission'])

# Align with the training feature matrix: same columns, same order. Indicator
# columns absent from this record are filled with 0, and indicators the model
# never saw (including the dropped baseline levels) are discarded.
new_df = new_df.reindex(columns=X.columns, fill_value=0)

In [16]:
# Predict the price of the prepared single-row record.
predicted_price = modelo.predict(X=new_df)

In [None]:
# Bare expression: as the last statement of a notebook cell it displays the
# prediction returned for the new record.
predicted_price