<a href="https://colab.research.google.com/github/anushkahedaoo19/price/blob/main/Second_hand_car_price_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from google.colab import drive
drive.mount('/content/drive')

# Read the car-sales CSV from the mounted Drive folder
url = '/content/drive/MyDrive/Car_sales.csv'  # Replace with your file path or URL
df = pd.read_csv(url)

# Report the raw dimensions before any cleaning is applied
print("Dataset Shape:", df.shape)

# Columns that should be numeric; entries that cannot be parsed become NaN
numeric_cols = ['Price in thousands', '4-year resale value', 'Engine size',
                'Horsepower', 'Wheelbase', 'Width', 'Length',
                'Curb weight', 'Fuel capacity', 'Fuel efficiency']
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Discard the non-numeric columns, then every row that has no target price
df = df.drop(columns=['Manufacturer', 'Model', 'Latest Launch', 'Vehicle type'])
df = df[df['Price in thousands'].notna()]

# Impute whatever is still missing with the mean of its own column
column_means = df.mean()
df = df.fillna(column_means)

# The price is the target (y); every other remaining column is a feature (X)
y = df['Price in thousands']
X = df.drop(columns=['Price in thousands'])

# Standardize the features; the scaler is fit on all rows of X
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# ======================================================================
# MODEL TRAINING AND EVALUATION - 70/30 AND 80/20 SPLITS
# ======================================================================
def build_models(random_state=42):
    """Return fresh, unfitted regressors keyed by the label used in the report.

    A new set is built for every split so that no fitted state carries over
    from one split to the next. Dict order is the order of the report.
    """
    return {
        "Linear Regression": LinearRegression(),
        "Decision Tree": DecisionTreeRegressor(random_state=random_state),
        "Random Forest": RandomForestRegressor(random_state=random_state),
        "Gradient Boosting": GradientBoostingRegressor(random_state=random_state),
        "KNN": KNeighborsRegressor(),
        "Support Vector Machine": SVR(kernel='rbf'),  # Radial Basis Function kernel
    }


def evaluate_models(features, target, test_size, split_label, random_state=42):
    """Fit and score every regressor from ``build_models`` on one train/test split.

    The split is made on the *unscaled* features and the StandardScaler is fit
    on the training rows only, so no test-set statistics reach the models
    (this matters for the scale-sensitive ones, KNN and SVR). With the same
    ``random_state`` and number of rows, the rows assigned to train and test
    are the same as when splitting pre-scaled features.

    NOTE: missing values are imputed earlier in the notebook with means
    computed over all rows; that step is not changed here.

    Parameters
    ----------
    features : DataFrame or array of shape (n_samples, n_features)
        Unscaled feature matrix.
    target : Series or array of shape (n_samples,)
        Target values aligned with ``features``.
    test_size : float
        Fraction of rows held out for testing (e.g. 0.3).
    split_label : str
        Text used in the printed headings, e.g. ``"70-30"``.
    random_state : int, default 42
        Seed for the split and for the seeded estimators.

    Returns
    -------
    dict
        ``"split"``       -> (X_train, X_test, y_train, y_test), features scaled
        ``"scaler"``      -> the StandardScaler fitted on the training rows
        ``"models"``      -> {label: fitted estimator}
        ``"predictions"`` -> {label: predictions on the test rows}
    """
    X_train_raw, X_test_raw, y_tr, y_te = train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )

    # Learn mean/variance from the training rows only, then apply to both parts.
    split_scaler = StandardScaler().fit(X_train_raw)
    X_tr = split_scaler.transform(X_train_raw)
    X_te = split_scaler.transform(X_test_raw)

    fitted = build_models(random_state)
    predictions = {}
    for label, model in fitted.items():
        model.fit(X_tr, y_tr)
        y_hat = model.predict(X_te)
        predictions[label] = y_hat
        print(f"\n{label} ({split_label} split):")
        print("MSE:", mean_squared_error(y_te, y_hat))
        print("MAE:", mean_absolute_error(y_te, y_hat))
        print("R2 Score:", r2_score(y_te, y_hat))

    return {
        "split": (X_tr, X_te, y_tr, y_te),
        "scaler": split_scaler,
        "models": fitted,
        "predictions": predictions,
    }


# (heading label, test fraction) for each experiment, in report order
SPLIT_CONFIGS = [("70-30", 0.3), ("80-20", 0.2)]

split_results = {}
for position, (split_label, test_fraction) in enumerate(SPLIT_CONFIGS):
    banner = "=" * 50
    # Every banner after the first is preceded by a blank line
    print(banner if position == 0 else "\n" + banner)
    print(f"{split_label} SPLIT RESULTS")
    print(banner)
    split_results[split_label] = evaluate_models(X, y, test_fraction, split_label)

# Keep the names this cell has always left behind bound to the final (80-20)
# run, in case later cells read them. `scaler` is re-bound to the scaler that
# the 80-20 models were actually trained with.
final_run = split_results["80-20"]
X_train, X_test, y_train, y_test = final_run["split"]
scaler = final_run["scaler"]
lr, dt, rf, gb, knn, svm = final_run["models"].values()
y_pred = final_run["predictions"]["Support Vector Machine"]


Mounted at /content/drive
Dataset Shape: (157, 15)
70-30 SPLIT RESULTS

Linear Regression (70-30 split):
MSE: 25.050340010290956
MAE: 3.839195283067921
R2 Score: 0.8880303003411344

Decision Tree (70-30 split):
MSE: 31.290080680851062
MAE: 4.032765957446809
R2 Score: 0.8601399847388389

Random Forest (70-30 split):
MSE: 29.44778756880416
MAE: 3.594292340425528
R2 Score: 0.8683746436837773

Gradient Boosting (70-30 split):
MSE: 28.20122980410836
MAE: 3.6434344001663765
R2 Score: 0.8739464921482323

KNN (70-30 split):
MSE: 65.09353556425532
MAE: 5.134008510638298
R2 Score: 0.7090457205822692

Support Vector Machine (70-30 split):
MSE: 181.5210414883403
MAE: 7.039360109750764
R2 Score: 0.18863949595637142

80-20 SPLIT RESULTS

Linear Regression (80-20 split):
MSE: 27.711210211542763
MAE: 4.003024356738472
R2 Score: 0.8710897390858895

Decision Tree (80-20 split):
MSE: 27.410940129032248
MAE: 3.8290322580645157
R2 Score: 0.8724865707069426

Random Forest (80-20 split):
MSE: 30.598511868296