In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# For data splitting
from sklearn.model_selection import train_test_split, cross_val_score

# For scaling
from sklearn.preprocessing import StandardScaler

# For modeling
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# For evaluation
from sklearn.metrics import mean_squared_error, r2_score

# For pipelines
from sklearn.pipeline import Pipeline

# For GridSearch
from sklearn.model_selection import GridSearchCV

# Import additional estimators
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

import xgboost as xgb
from xgboost import XGBRegressor



In [2]:
# Read the dataset from disk
df = pd.read_csv('data/auto_scout_final.csv')
# Target variable: the 'price' column
y = df['price']
# Features: every remaining column
X = df.drop(columns='price')

In [3]:
# Partition features and target into train and test subsets.
# random_state is fixed so the same rows land in each subset on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [4]:
# Standardize the features.
standard_scaler = StandardScaler()
# Fit the scaler on the training split only -- never on the test split.
standard_scaler.fit(X_train)
# Apply the fitted scaler to both splits.
X_train_scaled = standard_scaler.transform(X_train)
X_test_scaled = standard_scaler.transform(X_test)

Run an initial pass of all the major types of estimators (not fine-tuned) on the data and compare their performance on the train and test datasets. Try unscaled data first:

In [5]:
# Baseline comparison: fit each estimator with (otherwise) default
# hyperparameters on the unscaled features and collect R^2 and RMSE on both
# the train and the test split.

# evaluation results, keyed by model name
results = {}

# random_state is pinned (same value as the train/test split) so that the
# randomized estimators produce the same scores on every Restart & Run All;
# without it the comparison table changes from run to run.
models = {'SVR': SVR(),
          'Random Forest': RandomForestRegressor(random_state=0),
          'Gradient Boost': GradientBoostingRegressor(random_state=0),
          'Lasso Regression': Lasso(),
          'XG Boost': XGBRegressor(random_state=0)}

for model_name, model in models.items():
    # fit on the unscaled training data
    model.fit(X_train, y_train)
    # predict on both splits so train and test scores can be compared
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    # R^2 on train and test
    train_r2 = r2_score(y_train, y_train_pred)
    test_r2 = r2_score(y_test, y_test_pred)
    # RMSE is the square root of the MSE, so it is in the units of the target
    train_MSE = mean_squared_error(y_train, y_train_pred)
    test_MSE = mean_squared_error(y_test, y_test_pred)
    train_RMSE = np.sqrt(train_MSE)
    test_RMSE = np.sqrt(test_MSE)
    # store this model's metrics
    results[model_name] = {
        'Train R^2': train_r2,
        'Test R^2': test_r2,
        'Train RMSE': train_RMSE,
        'Test RMSE': test_RMSE
    }

# one column per model, one row per metric
results_unscaled = pd.DataFrame(results)
# show the comparison table as the cell output
results_unscaled

SyntaxError: ':' expected after dictionary key (2310196543.py, line 8)

Log-transforming the target variable improved linear regression a bit. Can it improve the other models too?

In [27]:
# Natural-log transform of the target for both splits (features are untouched)
y_train_log, y_test_log = map(np.log, (y_train, y_test))

TypeError: 'builtin_function_or_method' object is not iterable