In [7]:
import pandas as pd

# قراءة البيانات
df = pd.read_csv("BMW_Car_Sales_Classification.csv")

# نظرة عامة على البيانات
print(df.head())
print("\nشكل البيانات:", df.shape)
print("\nملخص الأعمدة:")
print(df.info())
print("\nالقيم المفقودة:")
print(df.isnull().sum())
print("\nالوصف الإحصائي:")
print(df.describe())


      Model  Year         Region  Color Fuel_Type Transmission  Engine_Size_L  \
0  5 Series  2016           Asia    Red    Petrol       Manual            3.5   
1        i8  2013  North America    Red    Hybrid    Automatic            1.6   
2  5 Series  2022  North America   Blue    Petrol    Automatic            4.5   
3        X3  2024    Middle East   Blue    Petrol    Automatic            1.7   
4  7 Series  2020  South America  Black    Diesel       Manual            2.1   

   Mileage_KM  Price_USD  Sales_Volume Sales_Classification  
0      151748      98740          8300                 High  
1      121671      79219          3428                  Low  
2       10991     113265          6994                  Low  
3       27255      60971          4047                  Low  
4      122131      49898          3080                  Low  

شكل البيانات: (50000, 11)

ملخص الأعمدة:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 

1. فصل الهدف (Target) عن الميزات (Features)

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# فصل المتغير المستهدف
X = df.drop("Sales_Classification", axis=1)
y = df["Sales_Classification"]

# تحويل الهدف إلى أرقام (High=1, Low=0)
le = LabelEncoder()
y = le.fit_transform(y)


2. تحويل الأعمدة النصية إلى أرقام (OneHotEncoding)[link text](https://)

> Add blockquote



In [9]:
X = pd.get_dummies(X, drop_first=True)


3. تقسيم البيانات (Train/Test)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(df['Sales_Classification'])
print(list(zip(df['Sales_Classification'][:10], y_encoded[:10])))


[('High', np.int64(0)), ('Low', np.int64(1)), ('Low', np.int64(1)), ('Low', np.int64(1)), ('Low', np.int64(1)), ('Low', np.int64(1)), ('High', np.int64(0)), ('Low', np.int64(1)), ('High', np.int64(0)), ('Low', np.int64(1))]


In [12]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y_encoded = le.fit_transform(df['Sales_Classification'])

# نوري كل فئة ورقمها
for original, encoded in zip(df['Sales_Classification'][:10], y_encoded[:10]):
    print(original, encoded)


High 0
Low 1
Low 1
Low 1
Low 1
Low 1
High 0
Low 1
High 0
Low 1


In [13]:
y_encoded = 1 - y_encoded


In [14]:
y_encoded = df['Sales_Classification'].map({'Low': 0, 'High': 1})


In [15]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      3032
           1       1.00      1.00      1.00      6968

    accuracy                           1.00     10000
   macro avg       1.00      1.00      1.00     10000
weighted avg       1.00      1.00      1.00     10000



In [16]:
# شجرة بحد أقصى للعمق
dt_shallow = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_shallow.fit(X_train, y_train)

# تقييم
train_acc_shallow = dt_shallow.score(X_train, y_train)
test_acc_shallow = dt_shallow.score(X_test, y_test)

print("Train Accuracy (shallow):", train_acc_shallow)
print("Test Accuracy (shallow):", test_acc_shallow)


Train Accuracy (shallow): 1.0
Test Accuracy (shallow): 1.0


In [18]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
rf.fit(X_train, y_train)

train_acc_rf = rf.score(X_train, y_train)
test_acc_rf = rf.score(X_test, y_test)

print("Random Forest Train Accuracy:", train_acc_rf)
print("Random Forest Test Accuracy:", test_acc_rf)


Random Forest Train Accuracy: 1.0
Random Forest Test Accuracy: 1.0


In [19]:
# مثال بيانات جديدة
import pandas as pd

new_data = pd.DataFrame({
    'Model': ['X5', 'i3'],
    'Year': [2025, 2023],
    'Region': ['Europe', 'Asia'],
    'Color': ['Green', 'White'],
    'Fuel_Type': ['Electric', 'Hybrid'],
    'Transmission': ['Automatic', 'Manual'],
    'Engine_Size_L': [2.0, 1.5],
    'Mileage_KM': [5000, 2000],
    'Price_USD': [90000, 60000],
    'Sales_Volume': [4000, 2500]
})

# لازم نفس خطوات تحويل النصوص إلى أرقام (OneHotEncoding)
new_data_encoded = pd.get_dummies(new_data, drop_first=True)

# ممكن تحتاج تضبط الأعمدة لتكون متطابقة مع X_train
missing_cols = set(X_train.columns) - set(new_data_encoded.columns)
for c in missing_cols:
    new_data_encoded[c] = 0
new_data_encoded = new_data_encoded[X_train.columns]

# توقع
y_new_pred = rf.predict(new_data_encoded)
print("Predictions on new data:", y_new_pred)


Predictions on new data: [1 1]
