In [50]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score


from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [51]:
from google.colab import drive
drive.mount('/content/drive')

# train_data_path = '/content/drive/My Drive/train_data.csv'
# test_data_path='/content/drive/My Drive/test_data.csv'
vehicle_data_path = '/content/drive/My Drive/vehicle_dataset/vehicle_dataset.csv'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [52]:
# load data
vehicle_data=pd.read_csv(vehicle_data_path)
vehicle_data = vehicle_data.dropna()


In [53]:
X = vehicle_data.drop(columns=['price'])
y = vehicle_data['price']

In [54]:
columns=X.columns
columns

Index(['year', 'manufacturer', 'model', 'condition', 'cylinders', 'fuel',
       'odometer', 'title_status', 'transmission', 'drive', 'size', 'type',
       'paint_color', 'state', 'lat', 'long'],
      dtype='object')

In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Print X_train: {}'.format(X_train.shape[0]))
print('Print y_train: {}'.format(y_train.shape[0]))
print('Print X_test: {}'.format(X_test.shape[0]))
print('Print y_test: {}'.format(y_test.shape[0]))

Print X_train: 46512
Print y_train: 46512
Print X_test: 11628
Print y_test: 11628


In [56]:
# X_test = X_test[X_test['model'].isin(X_train['model'])]
# print('Print X_test: {}'.format(X_test.shape[0]))

# # Remove corresponding rows from y_test
# y_test = y_test[y_test.index.isin(X_test.index)]
# print('Print y_test: {}'.format(y_test.shape[0]))

# Iterate over each column in X_test
for column in X_test.columns:
    # Remove rows from X_test based on the column values present in X_train
    X_test = X_test[X_test[column].isin(X_train[column])]
    print('Print X_test after removing rows based on {}: {}'.format(column, X_test.shape[0]))

    # Remove corresponding rows from y_test
    y_test = y_test[y_test.index.isin(X_test.index)]
    print('Print y_test after removing rows based on {}: {}'.format(column, y_test.shape[0]))


Print X_test after removing rows based on year: 11625
Print y_test after removing rows based on year: 11625
Print X_test after removing rows based on manufacturer: 11625
Print y_test after removing rows based on manufacturer: 11625
Print X_test after removing rows based on model: 11059
Print y_test after removing rows based on model: 11059
Print X_test after removing rows based on condition: 11059
Print y_test after removing rows based on condition: 11059
Print X_test after removing rows based on cylinders: 11059
Print y_test after removing rows based on cylinders: 11059
Print X_test after removing rows based on fuel: 11059
Print y_test after removing rows based on fuel: 11059
Print X_test after removing rows based on odometer: 7909
Print y_test after removing rows based on odometer: 7909
Print X_test after removing rows based on title_status: 7909
Print y_test after removing rows based on title_status: 7909
Print X_test after removing rows based on transmission: 7909
Print y_test afte

In [57]:
# imputer = SimpleImputer(strategy='most_frequent')

# # Fit the imputer on the training data and transform both training and test data
# X_train= imputer.fit_transform(X_train)
# X_test= imputer.transform(X_test)

In [58]:
# print(X_train)

In [59]:
# X_train = pd.DataFrame(X_train, columns=columns)
# X_test = pd.DataFrame(X_test, columns=columns)

label_encoders = {}

# Encode categorical features in training data
for feature in X_train.columns[X_train.dtypes == 'object']:
    label_encoders[feature] = LabelEncoder()
    X_train[feature] = label_encoders[feature].fit_transform(X_train[feature])


print(X_train.shape[0])

print(label_encoders)

for feature in X_test.columns[X_test.dtypes == 'object']:
    X_test[feature] = label_encoders[feature].transform(X_test[feature])


46512
{'manufacturer': LabelEncoder(), 'model': LabelEncoder(), 'condition': LabelEncoder(), 'cylinders': LabelEncoder(), 'fuel': LabelEncoder(), 'title_status': LabelEncoder(), 'transmission': LabelEncoder(), 'drive': LabelEncoder(), 'size': LabelEncoder(), 'type': LabelEncoder(), 'paint_color': LabelEncoder(), 'state': LabelEncoder()}


In [60]:
regressor = DecisionTreeRegressor(random_state = 0)

In [61]:
regressor.fit(X_train,y_train)

In [62]:
model_name = type(regressor).__name__
print(model_name)

# Making predictions on the same data or new data
predictions = regressor.predict(X_train)

print('\nTraining Scores')

# Evaluating the model
mse = mean_squared_error(y_train, predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_train, predictions)
print(f'R-squared: {r2}')

test_predictions = regressor.predict(X_test)


print('\nTest Scores')
# Evaluating the model
mse = mean_squared_error(y_test, test_predictions)
print(f'Mean Squared Error: {mse}')

r2 = r2_score(y_test, test_predictions)
print(f'R-squared: {r2}')

DecisionTreeRegressor

Training Scores
Mean Squared Error: 218456545897.98773
R-squared: 0.999332926999561

Test Scores
Mean Squared Error: 28503294.74418471
R-squared: 0.8034276258933973
