In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the data
# Reads 'bikes.csv' from the current working directory into a DataFrame.
data = pd.read_csv('bikes.csv')

# Preliminary data cleaning
# Drop, in place, every column whose name contains "unnamed" (matched
# case-insensitively).
# NOTE(review): presumably these are leftover index columns such as
# "Unnamed: 0" written by an earlier export -- confirm against the CSV.
unnamed_mask = data.columns.str.contains('unnamed', case=False)
data.drop(columns=data.columns[unnamed_mask], inplace=True)

# Exploratory Data Analysis
## Univariate Analysis
# Bar chart of the number of rows per model year.
# 'model_year' is plotted so that the chart matches its title; a count plot
# of a continuous column such as 'price' would draw one bar per distinct
# value.  The column is passed by keyword (x=...) because recent seaborn
# releases accept only `data` positionally.
plt.figure(figsize=(12, 6))
sns.countplot(x='model_year', data=data)
plt.title('Distribution of Model Year')
plt.show()

## Bivariate Analysis
# Scatter plot of 'price' (y) against 'km_driven' (x), one point per row,
# drawn on an explicitly created 12x6 inch axes.
km_price_fig, km_price_ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=data, x='km_driven', y='price', ax=km_price_ax)
km_price_ax.set_title('Kilometers Driven vs Price')
plt.show()

## Multivariate Analysis
# Pairwise relationship grid over the selected columns.
# NOTE(review): assumes 'mileage' and 'power' are already numeric in the
# CSV -- confirm, since pairplot only draws numeric columns.
pairplot_columns = ['km_driven', 'mileage', 'power', 'price']
sns.pairplot(data[pairplot_columns])
plt.show()

# Feature Engineering
# Replace each listed column with the integer codes of its pandas
# 'category' representation so it can be fed to the model as a number.
# Missing values receive the code -1 (pandas convention).
# NOTE(review): the codes impose an arbitrary ordering on model names and
# locations, which a linear model will treat as a numeric scale -- consider
# one-hot encoding if that is not intended.
for categorical_column in ('model_name', 'location'):
    data[categorical_column] = (
        data[categorical_column].astype('category').cat.codes
    )

# Model Building
# Predictor columns (X) and regression target (y).
feature_columns = [
    'model_year', 'km_driven', 'owner', 'mileage', 'power',
    'model_name', 'location',
]
X = data[feature_columns]
y = data['price']

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Fit a linear regression on the training rows, then predict the target
# for the held-out rows.
model = LinearRegression().fit(X_train, y_train)
y_pred = model.predict(X_test)

# Model Evaluation
# Compare the held-out targets with the model's predictions using mean
# squared error and the R^2 coefficient of determination, then print both.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)
for metric_label, metric_value in (
    ('Mean Squared Error:', mse),
    ('R2 Score:', r2),
):
    print(metric_label, metric_value)

# Plot predictions
# Scatter of predicted against actual target values for the test rows.
# The red reference line spans the observed range of y_test along y = x,
# so points lying on it are exact predictions.
actual_low, actual_high = min(y_test), max(y_test)
pred_fig, pred_ax = plt.subplots(figsize=(12, 6))
pred_ax.scatter(y_test, y_pred)
pred_ax.set_xlabel('Actual Prices')
pred_ax.set_ylabel('Predicted Prices')
pred_ax.set_title('Actual Prices vs Predicted Prices')
pred_ax.plot(
    [actual_low, actual_high],
    [actual_low, actual_high],
    color='red',
)  # Diagonal line
plt.show()
