In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib
warnings.filterwarnings('ignore')


In [None]:
# Load the dataset
df_car = pd.read_csv("car_price_prediction.csv")


In [None]:
# Display basic information about the dataset
print(df_car.info())

In [None]:
# Display basic information about the dataset
print(df_car.info())

In [None]:
# Display the first few rows of the dataset to understand its structure
df_car.head()

In [None]:
# Check for missing values in the dataset
print(df_car.isna().sum())

In [None]:
# Basic statistics of the dataset
df_car.describe()

In [None]:
# Handle Missing Values
# For categorical columns, fill with mode
categorical_columns = df_car.select_dtypes(include=['object']).columns
for column in categorical_columns:
    mode_val = df_car[column].mode()[0]
    df_car[column].fillna(mode_val, inplace=True)

In [None]:
# For numerical columns, fill with median
numerical_columns = df_car.select_dtypes(include=['int64', 'float64']).columns
for column in numerical_columns:
    median_val = df_car[column].median()
    df_car[column].fillna(median_val, inplace=True)


In [None]:
# Check for duplicates and remove if any
print(df_car.duplicated().sum())
df_car.drop_duplicates(inplace=True)

In [None]:
# Data Visualization
# Distribution plots for numerical features
for column in numerical_columns:
    sns.distplot(df_car[column])
    plt.title(f'Distribution of {column}')
    plt.show()

In [None]:
# Count plot for categorical features
for column in categorical_columns:
    sns.countplot(y=column, data=df_car, palette="flare")
    plt.title(f'Distribution of {column}')
    plt.ylabel(column)
    plt.xlabel('Total')
    plt.show()

In [29]:
# Correlation matrix for numerical features
corr_matrix = df_car.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", center=0)
plt.title('Correlation Heatmap')
plt.show()

ValueError: could not convert string to float: '-'

In [30]:
# Encoding the target variable and categorical features
target_column = 'Price'  # Replace with the actual target column name
if df_car[target_column].dtype == 'object':
    le = LabelEncoder()
    df_car[target_column] = le.fit_transform(df_car[target_column])

for column in categorical_columns:
    df_car[column] = le.fit_transform(df_car[column])


NameError: name 'le' is not defined

In [31]:
# Splitting the data into training and testing sets
X = df_car.drop([target_column], axis=1)
y = df_car[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [32]:
# One-hot encoding for categorical variables (if needed)
X_train = pd.get_dummies(X_train)
X_test = pd.get_dummies(X_test)

# Ensure the number of features is consistent between train and test sets
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [26]:
# Model Training and Evaluation

# Logistic Regression
LRclassifier = LogisticRegression(solver='liblinear', max_iter=5000)
LRclassifier.fit(X_train, y_train)
y_pred_lr = LRclassifier.predict(X_test)
LRAcc = accuracy_score(y_pred_lr, y_test)
print('Logistic Regression accuracy: {:.2f}%'.format(LRAcc * 100))

Logistic Regression accuracy: 1.34%


In [27]:
# K-Nearest Neighbors
KNclassifier = KNeighborsClassifier(n_neighbors=20)
KNclassifier.fit(X_train, y_train)
y_pred_knn = KNclassifier.predict(X_test)
KNAcc = accuracy_score(y_pred_knn, y_test)
print('K-Neighbors accuracy: {:.2f}%'.format(KNAcc * 100))

K-Neighbors accuracy: 3.40%


In [33]:
# Plotting accuracy scores for different K values in KNN
scoreListknn = []
for i in range(1, 30):
    KNclassifier = KNeighborsClassifier(n_neighbors=i)
    KNclassifier.fit(X_train, y_train)
    scoreListknn.append(KNclassifier.score(X_test, y_test))

plt.plot(range(1, 30), scoreListknn)
plt.xticks(np.arange(1, 30, 1))
plt.xlabel("K value")
plt.ylabel("Score")
plt.show()
KNAccMax = max(scoreListknn)
print("KNN Acc Max {:.2f}%".format(KNAccMax * 100))

KeyboardInterrupt: 

In [None]:
# SVM
SVCclassifier = SVC(kernel='linear', max_iter=5000)
SVCclassifier.fit(X_train, y_train)
y_pred_svc = SVCclassifier.predict(X_test)
SVCAcc = accuracy_score(y_pred_svc, y_test)
print('SVM accuracy: {:.2f}%'.format(SVCAcc * 100))

In [None]:
# Decision Tree
DTclassifier = DecisionTreeClassifier(max_leaf_nodes=5)
DTclassifier.fit(X_train, y_train)
y_pred_dt = DTclassifier.predict(X_test)
DTAcc = accuracy_score(y_pred_dt, y_test)
print('Decision Tree accuracy: {:.2f}%'.format(DTAcc * 100))

In [None]:
# Plotting accuracy scores for different max_leaf_nodes in Decision Tree
scoreListDT = []
for i in range(2, 50):
    DTclassifier = DecisionTreeClassifier(max_leaf_nodes=i)
    DTclassifier.fit(X_train, y_train)
    scoreListDT.append(DTclassifier.score(X_test, y_test))

plt.plot(range(2, 50), scoreListDT)
plt.xticks(np.arange(2, 50, 5))
plt.xlabel("Leaf Nodes")
plt.ylabel("Score")
plt.show()
DTAccMax = max(scoreListDT)
print("DT Acc Max {:.2f}%".format(DTAccMax * 100))

In [None]:
# Random Forest
RFclassifier = RandomForestClassifier(max_leaf_nodes=5, n_estimators=100, random_state=1)
RFclassifier.fit(X_train, y_train)
y_pred_rf = RFclassifier.predict(X_test)
RFAcc = accuracy_score(y_pred_rf, y_test)
print('Random Forest accuracy: {:.2f}%'.format(RFAcc * 100))

In [None]:
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred_lr)
print(cm)
plt.figure(figsize=(4, 4))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Model Comparison
compare = pd.DataFrame({
    'Model': ['Logistic Regression', 'K-Neighbors', 'SVM', 'Decision Tree', 'Random Forest'],
    'Accuracy': [LRAcc * 100, KNAcc * 100, SVCAcc * 100, DTAcc * 100, RFAcc * 100]
})

compare_sorted = compare.sort_values(by='Accuracy', ascending=False)
print(compare_sorted)

In [None]:
# Save the best model
best_model = RFclassifier if RFAcc >= max(LRAcc, KNAcc, SVCAcc, DTAcc) else DTclassifier
joblib.dump(best_model, 'best_car_price_model.joblib')
print("Best model saved as 'best_car_price_model.joblib'")

![وصف الصورة](C:\Users\USERW\Desktop\السيارة\A..PNG)


from IPython.display import Image, display

# إدراج الصورة من المسار المحلي
display(Image(filenamC:\Users\USERW\Desktop\السيارة\OIP.jpegورة'))

# أو إدراج الصورة من URL
display(Imagehttps://th.bing.com/th/id/OIP.wEZOX2enkx5Q2-A-u1BMHAHaEK?rs=1&pid=ImgDetMainالصورة'))


![وصف ال;;;;صورة](![وصف الJJJJJ](C://Users//USERW//Desktop//السيارة//A..PNG))