# import modules

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the car price dataset from the Kaggle input directory into a DataFrame.
df=pd.read_csv("/kaggle/input/car-price-prediction/CarPrice_Assignment.csv")

# Data Checks to perform 


* Check Missing values
* Check Duplicates
* Check data type
* Check the number of unique values of each column
* Check statistics of data set
* Check various categories present in the different categorical column


In [None]:
# Preview the first 10 rows.
df.head(10)

In [None]:
# Preview the last 10 rows.
df.tail(10)

In [None]:
# Number of rows and columns in the dataset.
df.shape

In [None]:
# Column names, non-null counts, and dtypes.
df.info()

In [None]:
# Count missing values in each column.
df.isna().sum()

### No missing values

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

### No Duplicated values 

In [None]:
# Data type of each column.
df.dtypes

### Data types match the column contents

In [None]:
# Number of distinct values in each column.
df.nunique()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Insights from the summary statistics:
    

    * Sizes, weights, and engine capacities vary across cars.

    * Price range: the minimum price is $5,118, the maximum price is $45,400, and the average car price is about $13,276.




In [None]:
# List the distinct values of every non-numeric column, skipping CarName.
categorical_columns = df.select_dtypes(exclude='number').copy()
for col in categorical_columns.columns:
    if col == 'CarName':
        continue
    print(f"Category in {col} is : {df[col].unique()}")

# Exploratory data analysis 

In [None]:
# Count how many cars use each fuel type.
# rename_axis + reset_index(name=...) pins the output columns to
# 'fueltype' and 'count' on every pandas version; a bare reset_index()
# only produces a 'count' column on pandas >= 2.0.
number_fueltype = (
    df['fueltype']
    .value_counts()
    .rename_axis('fueltype')
    .reset_index(name='count')
)
number_fueltype

In [None]:
# Pie chart of the share of cars per fuel type.
fig = px.pie(data_frame=number_fueltype, names='fueltype', values='count')
fig.show()

In [None]:
# Count how many cars use each aspiration type.
# rename_axis + reset_index(name=...) pins the output columns to
# 'aspiration' and 'count' on every pandas version; a bare reset_index()
# only produces a 'count' column on pandas >= 2.0.
number_aspiration = (
    df['aspiration']
    .value_counts()
    .rename_axis('aspiration')
    .reset_index(name='count')
)
number_aspiration

In [None]:
# Pie chart of the share of cars per aspiration type.
fig = px.pie(data_frame=number_aspiration, names='aspiration', values='count')
fig.show()

In [None]:
# Count how many cars have each door count.
# rename_axis + reset_index(name=...) pins the output columns to
# 'doornumber' and 'count' on every pandas version; a bare reset_index()
# only produces a 'count' column on pandas >= 2.0.
number_doornumber = (
    df['doornumber']
    .value_counts()
    .rename_axis('doornumber')
    .reset_index(name='count')
)
number_doornumber

In [None]:
# Pie chart of the share of cars per door count.
fig = px.pie(data_frame=number_doornumber, names='doornumber', values='count')
fig.show()

In [None]:
# Count how many cars have each body style.
# rename_axis + reset_index(name=...) pins the output columns to
# 'carbody' and 'count' on every pandas version; a bare reset_index()
# only produces a 'count' column on pandas >= 2.0.
number_carbody = (
    df['carbody']
    .value_counts()
    .rename_axis('carbody')
    .reset_index(name='count')
)
number_carbody

In [None]:
# Pie chart of the share of cars per body style.
fig = px.pie(data_frame=number_carbody, names='carbody', values='count')
fig.show()

In [None]:
# Count how many cars use each drive-wheel layout.
# rename_axis + reset_index(name=...) pins the output columns to
# 'drivewheel' and 'count' on every pandas version; a bare reset_index()
# only produces a 'count' column on pandas >= 2.0.
number_drivewheel = (
    df['drivewheel']
    .value_counts()
    .rename_axis('drivewheel')
    .reset_index(name='count')
)
number_drivewheel

In [None]:
# Pie chart of the share of cars per drive-wheel layout.
fig = px.pie(data_frame=number_drivewheel, names='drivewheel', values='count')
fig.show()

In [None]:
# Count how many cars have each engine location.
# rename_axis + reset_index(name=...) pins the output columns to
# 'enginelocation' and 'count' on every pandas version; a bare
# reset_index() only produces a 'count' column on pandas >= 2.0.
number_enginelocation = (
    df['enginelocation']
    .value_counts()
    .rename_axis('enginelocation')
    .reset_index(name='count')
)
number_enginelocation

In [None]:
# Pie chart of the share of cars per engine location.
fig = px.pie(data_frame=number_enginelocation, names='enginelocation', values='count')
fig.show()

In [None]:
# Count how many cars use each engine type.
# rename_axis + reset_index(name=...) pins the output columns to
# 'enginetype' and 'count' on every pandas version; a bare reset_index()
# only produces a 'count' column on pandas >= 2.0.
number_enginetype = (
    df['enginetype']
    .value_counts()
    .rename_axis('enginetype')
    .reset_index(name='count')
)
number_enginetype

In [None]:
# Pie chart of the share of cars per engine type.
fig = px.pie(data_frame=number_enginetype, names='enginetype', values='count')
fig.show()

In [None]:
# Count how many cars have each cylinder count.
# rename_axis + reset_index(name=...) pins the output columns to
# 'cylindernumber' and 'count' on every pandas version; a bare
# reset_index() only produces a 'count' column on pandas >= 2.0.
number_cylindernumber = (
    df['cylindernumber']
    .value_counts()
    .rename_axis('cylindernumber')
    .reset_index(name='count')
)
number_cylindernumber

In [None]:
# Pie chart of the share of cars per cylinder count.
fig = px.pie(data_frame=number_cylindernumber, names='cylindernumber', values='count')
fig.show()

In [None]:
# Count how many cars use each fuel system.
# rename_axis + reset_index(name=...) pins the output columns to
# 'fuelsystem' and 'count' on every pandas version; a bare reset_index()
# only produces a 'count' column on pandas >= 2.0.
number_fuelsystem = (
    df['fuelsystem']
    .value_counts()
    .rename_axis('fuelsystem')
    .reset_index(name='count')
)
number_fuelsystem

In [None]:
# Pie chart of the share of cars per fuel system.
fig = px.pie(data_frame=number_fuelsystem, names='fuelsystem', values='count')
fig.show()

In [None]:
# Independent copy of the numeric columns, used for the distribution plots.
numerical_columns = df.select_dtypes(include=np.number).copy()

In [None]:
# Exclude car_ID from the numeric analysis frame.
# errors='ignore' makes the cell safe to re-run: the previous in-place drop
# raised KeyError once the column had already been removed.
numerical_columns = numerical_columns.drop(columns='car_ID', errors='ignore')

In [None]:
# Plot the distribution of every numeric feature in a grid of histograms.
hist_features = numerical_columns.columns.to_list()
hist_cols = 5
# Ceiling division so the grid always has a slot for every feature; a
# 15-column frame still yields the original 3x5 layout.
hist_rows = max(1, -(-len(hist_features) // hist_cols))

plt.figure(figsize=(12, 8))
# enumerate supplies the subplot slot directly instead of a list.index()
# scan per feature, which would also pick the wrong slot for duplicate names.
for position, feature in enumerate(hist_features, start=1):
    plt.subplot(hist_rows, hist_cols, position)
    sns.histplot(data=df[feature], bins=20, kde=True)
    plt.title(feature)
plt.tight_layout()
plt.show()

In [None]:
# The 20 car names that occur most often in the dataset.
car_name_counts = df['CarName'].value_counts()
top_car_models = car_name_counts.iloc[:20]

# Horizontal bar chart: one bar per car name, bar length = number of rows.
plt.figure(figsize=(10, 6))
sns.barplot(x=top_car_models.values, y=top_car_models.index)
plt.xlabel('Frequency')
plt.ylabel('Car Model')
plt.title('Top 20 Car Models by Frequency')
plt.tight_layout()
plt.show()

In [None]:
# Mean price per car name, most expensive first.
mean_price_per_car = df.groupby('CarName')['price'].mean()
avg_prices_by_car = mean_price_per_car.sort_values(ascending=False)

# Keep only the 20 most expensive car names for the chart.
top_car_models = avg_prices_by_car.iloc[:20]

plt.figure(figsize=(10, 6))
sns.barplot(x=top_car_models.values, y=top_car_models.index)
plt.xlabel('Average Price')
plt.ylabel('Car Model')
plt.title('Top 20 Car Models by Average Price')
plt.tight_layout()
plt.show()

In [None]:
# Names of the numeric feature columns as a plain list.
numerical_fetuer = list(numerical_columns.columns)

#### work with outliers

In [None]:
# Box plot of symboling to check for outliers.
sns.boxplot(data=df, x = "symboling")

In [None]:
# Box plot of wheelbase to check for outliers.
sns.boxplot(data=df, x = "wheelbase")

In [None]:
# Box plot of carlength to check for outliers.
sns.boxplot(data=df, x ='carlength' )

In [None]:
# Box plot of carwidth to check for outliers.
sns.boxplot(data=df, x = 'carwidth')

In [None]:
# Box plot of carheight to check for outliers.
sns.boxplot(data=df, x = 'carheight')

In [None]:
# Box plot of curbweight to check for outliers.
sns.boxplot(data=df, x = 'curbweight')

In [None]:
# Box plot of enginesize to check for outliers.
sns.boxplot(data=df, x = 'enginesize')

In [None]:
# Box plot of boreratio to check for outliers.
sns.boxplot(data=df, x = 'boreratio')

In [None]:
# Pairwise correlations between the numeric features, drawn as an
# annotated heatmap.
correlation_matrix = df.loc[:, numerical_fetuer].corr()

plt.figure(figsize=(12, 10))
sns.heatmap(data=correlation_matrix, cmap='Blues', annot=True)
plt.title('Correlation Heatmap')
plt.show()

# data preprocessing

In [None]:
# Split CarName at its first space: the leading token becomes the brand and
# everything after it becomes the model. str.partition does this in a single
# vectorized pass instead of splitting every name twice inside apply(); a
# name without a space still yields an empty model string.
name_parts = df['CarName'].str.partition(' ')
df['brand'] = name_parts[0]
df['model'] = name_parts[2]

In [None]:
# Columns that hold category labels and are label-encoded before modelling.
categorical_columns = [
    'fueltype',
    'aspiration',
    'doornumber',
    'carbody',
    'drivewheel',
    'enginelocation',
    'enginetype',
    'cylindernumber',
    'fuelsystem',
    'brand',
    'model',
]

# Continuous measurements that get squared features and standard scaling.
numerical_columns = [
    'wheelbase',
    'carlength',
    'carwidth',
    'carheight',
    'curbweight',
    'enginesize',
    'boreratio',
    'stroke',
    'compressionratio',
    'horsepower',
    'peakrpm',
    'citympg',
    'highwaympg',
]

In [None]:
# Replace every categorical column with integer codes.
# A fresh encoder is fitted per column and kept in label_encoders, so each
# column's codes can be mapped back with inverse_transform; refitting one
# shared encoder discarded every mapping except the last.
label_encoders = {}
for column in categorical_columns:
    # After the loop, label_encoder still refers to the encoder fitted on the
    # last column, as it did before.
    label_encoder = LabelEncoder()
    df[column] = label_encoder.fit_transform(df[column])
    label_encoders[column] = label_encoder

In [None]:
# Horsepower per unit of curb weight.
df['power_to_weight_ratio'] = df['horsepower'].div(df['curbweight'])

# Add a squared copy of every numeric feature so the linear model can fit
# simple curvature.
for column in numerical_columns:
    df[f'{column}_squared'] = df[column].pow(2)

# Log transform of engine size; the +1 keeps the argument strictly positive.
df['log_enginesize'] = np.log(df['enginesize'].add(1))

In [None]:
# Standardize the base numeric columns to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the full dataset before the
# train/test split, so test-set statistics leak into the transform; consider
# fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(df[numerical_columns])
df[numerical_columns] = scaler.transform(df[numerical_columns])

# Model

In [None]:
# Build the feature matrix and target, then hold out 20% of rows for testing.
# price is the target and CarName is raw text (already split into brand and
# model). car_ID is dropped as well: it looks like a row identifier, was
# excluded from the numeric analysis earlier, and should not act as a predictor.
X = df.drop(columns=['price', 'CarName', 'car_ID'])
y = df['price']
# Fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Fit an ordinary least-squares linear regression on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# R-squared of the fitted model on the training data.
model.score(X_train, y_train)

In [None]:
# Predict prices for the held-out test rows.
y_pred = model.predict(X_test)

In [None]:
# Evaluate the test-set predictions: R-squared and mean squared error.
r2_square = r2_score(y_true=y_test, y_pred=y_pred)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f" R-squared: {r2_square}")
print(f'Mean Squared Error: {mse}')

In [None]:
# Actual and predicted test prices side by side, plus the residual
# (actual minus predicted) for each row.
prediction_error = y_test - y_pred
pred_df = pd.DataFrame(
    {
        'Actual Value': y_test,
        'Predicted Value': y_pred,
        'Difference': prediction_error,
    }
)
pred_df