In [1]:
import pandas as pd 
import numpy as np 
import plotly.express as px 
from sklearn.feature_selection import SelectKBest, f_classif
# Linear Models
from sklearn.linear_model import LogisticRegression
# Tree-Based
from sklearn.tree import *
from sklearn.ensemble import *
# KNN
from sklearn.neighbors import KNeighborsClassifier

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
# SVM
from sklearn.svm import SVC
# xgboost
from xgboost import *
# cat boost 
from catboost import *
# lightgbm
from lightgbm import *
from sklearn.model_selection import cross_validate
# load data 
df = pd.read_csv("BMW_sales_messy.csv") 

### Understanding Data

| العمود                   | نوعه                        | وصفه                                                        |
| ------------------------ | --------------------------- | ----------------------------------------------------------- |
| **Model**                | Categorical (Nominal)       | اسم موديل السيارة، مثال: 5 Series, X3                       |
| **Year**                 | Numerical (Discrete)        | سنة تصنيع السيارة، مثال: 2018, 2020                         |
| **Region**               | Categorical (Nominal)       | المنطقة أو الدولة اللي تباع فيها السيارة، مثال: Asia, Europe |
| **Color**                | Categorical (Nominal)       | لون السيارة، مثال: Red, Black                               |
| **Fuel_Type**            | Categorical (Nominal)       | نوع الوقود، مثال: Petrol, Diesel, Electric                  |
| **Transmission**         | Categorical (Nominal)       | نوع ناقل الحركة، مثال: Manual, Automatic                    |
| **Engine_Size_L**        | Numerical (Continuous)      | حجم الموتور باللتر، مثال: 1.6, 2.0                          |
| **Mileage_KM**           | Numerical (Continuous)      | عدد الكيلومترات اللي مشت السيارة، مثال: 50000, 120000       |
| **Price_USD**            | Numerical (Continuous)      | سعر السيارة بالدولار، مثال: 20000, 35000                    |
| **Sales_Volume**         | Numerical (Discrete)        | عدد السيارات المباعة، مثال: 150, 300                        |
| **Sales_Classification** | Categorical (Ordinal/Label) | تصنيف المبيعات حسب الحجم أو الأداء، مثال: Low, High         |


In [2]:
df

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748.0,98740.0,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671.0,79219.0,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991.0,113265.0,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255.0,60971.0,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131.0,49898.0,3080,Low
...,...,...,...,...,...,...,...,...,...,...,...
50195,M3,2021,North America,Black,Diesel,Automatic,4.5,,107117.0,9161,High
50196,X1,2014,North America,Grey,Hybrid,Manual,4.2,73437.0,45207.0,2739,Low
50197,X6,2018,Europe,Blue,Diesel,Manual,2.2,92050.0,73600.0,9761,High
50198,7 Series,2024,Asia,Black,Diesel,Automatic,2.4,41711.0,50469.0,8145,High


### Exploratory Data Analysis

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50200 entries, 0 to 50199
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Model                 50200 non-null  object 
 1   Year                  50200 non-null  int64  
 2   Region                50200 non-null  object 
 3   Color                 50200 non-null  object 
 4   Fuel_Type             50200 non-null  object 
 5   Transmission          50200 non-null  object 
 6   Engine_Size_L         48695 non-null  float64
 7   Mileage_KM            48696 non-null  float64
 8   Price_USD             48697 non-null  float64
 9   Sales_Volume          50200 non-null  int64  
 10  Sales_Classification  50200 non-null  object 
dtypes: float64(3), int64(2), object(6)
memory usage: 4.2+ MB


In [4]:
df.head()

Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748.0,98740.0,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671.0,79219.0,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991.0,113265.0,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255.0,60971.0,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131.0,49898.0,3080,Low


In [5]:
# statistics Numerical data 
df.describe()

Unnamed: 0,Year,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume
count,50200.0,48695.0,48696.0,48697.0,50200.0
mean,2017.016335,3.246958,101330.520474,76187.28,5067.17492
std,4.324905,1.008903,64673.462647,44421.6,2856.97781
min,2010.0,1.5,3.0,30000.0,100.0
25%,2013.0,2.4,50318.25,52466.0,2587.0
50%,2017.0,3.2,100713.0,75056.0,5087.0
75%,2021.0,4.1,150914.5,97706.0,7537.0
max,2024.0,5.0,999980.0,1199980.0,9999.0


In [6]:
# statistics Categorical data 
df.describe(include="object")

Unnamed: 0,Model,Region,Color,Fuel_Type,Transmission,Sales_Classification
count,50200,50200,50200,50200,50200,50200
unique,11,6,7,5,2,2
top,7 Series,Asia,Red,Hybrid,Manual,Low
freq,4683,8487,8486,12758,25269,34896


In [7]:
# check duplicated
df.duplicated().sum()

np.int64(200)

In [8]:
# check missing values 
df.isna().sum()

Model                      0
Year                       0
Region                     0
Color                      0
Fuel_Type                  0
Transmission               0
Engine_Size_L           1505
Mileage_KM              1504
Price_USD               1503
Sales_Volume               0
Sales_Classification       0
dtype: int64

### Data Cleaning 

In [9]:
# Convert column names to lowercase letters
df.columns  = df.columns.str.lower()

In [10]:
# Drop the exact duplicate rows counted by df.duplicated().sum() (200 rows).
# The operation is in-place and does not reset the index, so the surviving
# rows keep their original index labels.
df.drop_duplicates(inplace=True)

In [11]:
# check duplicated
df.duplicated().sum()

np.int64(0)

In [12]:
# Derive the vehicle age relative to 2024 (the maximum `Year` reported by
# df.describe()) and retire the raw `year` column.
# The guard keeps this cell idempotent: running it again after `year` has
# already been dropped is a no-op instead of a KeyError.
if "year" in df.columns:
    df["age_car"] = 2024 - df["year"]
    df.drop(columns="year", inplace=True)

In [13]:
# Categorical columns: the five nominal features plus the target label.
cat_col = [
    "model",
    "region",
    "color",
    "fuel_type",
    "transmission",
    "sales_classification",
]
cat_col

['model',
 'region',
 'color',
 'fuel_type',
 'transmission',
 'sales_classification']

In [14]:
# Numeric columns used for the distribution plots.
num_col = [
    "age_car",
    "engine_size_l",
    "mileage_km",
    "price_usd",
    "sales_volume",
]
num_col

['age_car', 'engine_size_l', 'mileage_km', 'price_usd', 'sales_volume']

In [15]:
# Inspect every categorical column: its name, the number of distinct values,
# and the distinct values themselves (handy for spotting inconsistent
# spellings).  A named loop variable is used instead of `_`, which IPython
# reserves for the previous cell's output.
for col in cat_col:
    print(col)
    print(df[col].nunique())
    print(df[col].unique())

model
11
['5 Series' 'i8' 'X3' '7 Series' 'M5' '3 Series' 'X1' 'M3' 'X5' 'i3' 'X6']
region
6
['Asia' 'North America' 'Middle East' 'South America' 'Europe' 'Africa']
color
7
['Red' 'Blue' 'Black' 'Silver' 'White' 'Grey' 'red']
fuel_type
5
['Petrol' 'Hybrid' 'Diesel' 'Electric' 'Gasoline']
transmission
2
['Manual' 'Automatic']
sales_classification
2
['High' 'Low']


In [16]:
# Normalise colour labels to lower case so that "Red" and "red" collapse into
# a single category (7 distinct values before, 6 after).
# NOTE(review): fuel_type still contains both "Petrol" and "Gasoline" —
# possibly the same fuel under two names; confirm whether they should be merged.
df["color"] = df.color.str.lower()

In [17]:
# Re-inspect the categorical columns after cleaning to confirm the distinct
# values.  A named loop variable is used instead of `_`, which IPython
# reserves for the previous cell's output.
for col in cat_col:
    print(col)
    print(df[col].nunique())
    print(df[col].unique())

model
11
['5 Series' 'i8' 'X3' '7 Series' 'M5' '3 Series' 'X1' 'M3' 'X5' 'i3' 'X6']
region
6
['Asia' 'North America' 'Middle East' 'South America' 'Europe' 'Africa']
color
6
['red' 'blue' 'black' 'silver' 'white' 'grey']
fuel_type
5
['Petrol' 'Hybrid' 'Diesel' 'Electric' 'Gasoline']
transmission
2
['Manual' 'Automatic']
sales_classification
2
['High' 'Low']


In [18]:
# Plot the distribution of every numeric column.
# `Figure.show()` renders the chart and returns None, so wrapping it in
# print() only emitted a stray "None" after each figure.
for col in num_col:
    px.histogram(df[col]).show()

None


None


None


None


None


In [19]:
# Replace implausibly large values (used for mileage_km and price_usd) with NaN.
def Clean_outliers(x, upper_limit=500000):
    """Return ``x`` unchanged when it lies below ``upper_limit``, else NaN.

    Parameters
    ----------
    x : float
        A single numeric value, e.g. one cell of a numeric column.
    upper_limit : float, default 500000
        Exclusive upper bound.  Values greater than or equal to it are
        treated as outliers.

    Returns
    -------
    float
        ``x`` if ``x < upper_limit``; otherwise ``np.nan``.  A NaN input
        stays NaN because any comparison with NaN evaluates to False.
    """
    return x if x < upper_limit else np.nan
# Run the outlier filter element-wise over both columns; values rejected by
# Clean_outliers become NaN.
for column in ("mileage_km", "price_usd"):
    df[column] = df[column].apply(Clean_outliers)

In [20]:
# Clean outliers in Mileage_KM
# df[df["Mileage_KM"] < 500000]

In [21]:
px.histogram(df["mileage_km"])

In [22]:
px.histogram(df["price_usd"])

In [23]:
# Check class imbalance in Sales_Classification
px.histogram(df["sales_classification"])

In [24]:
# save data cleaning in csv 
df.to_csv("cleaning_data.csv", index=False)

In [25]:
# Separate the target label (y) from the predictor columns (X).
# NOTE(review): `sales_volume` stays in X.  The tree-based models reach 100%
# F1 in cross-validation and, in the rows displayed above, every "High" label
# coincides with a high sales volume — the label looks like a direct function
# of `sales_volume` (target leakage).  TODO confirm and consider excluding it.
y = df["sales_classification"]
X = df.drop(columns="sales_classification")

### Pipeline

In [26]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers.  LabelEncoder sorts the classes, so
# "High" -> 0 and "Low" -> 1; scorers that use the default pos_label=1
# (e.g. scoring="f1") therefore treat "Low" as the positive class.
le = LabelEncoder().fit(y)
y_encoder = le.transform(y)

In [27]:
y_encoder

array([0, 1, 1, ..., 0, 0, 1])

In [28]:
le.classes_

array(['High', 'Low'], dtype=object)

In [29]:
# Numeric feature columns sent through the impute + scale branch
# (per df.isna().sum(), only engine_size_l, mileage_km and price_usd had
# missing values in the raw data).
num_cols = [
    "sales_volume",
    "mileage_km",
    "price_usd",
    "age_car",
    "engine_size_l",
]

In [30]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  RobustScaler

# Numeric branch: fill missing values with the column median, then scale
# with RobustScaler.
imputer = SimpleImputer(strategy="median")
scaler = RobustScaler()

num_pipeline1 = Pipeline(
    steps=[
        ("Imputer", imputer),
        ("scaler", scaler),
    ]
)
num_pipeline1

In [31]:
# Column handled by the binary encoder: `model` has 11 distinct values, the
# highest cardinality among the categorical features.
model = ["model"] # 11 unique values

In [32]:
# Import BinaryEncoder from category_encoders
from category_encoders import BinaryEncoder
from sklearn.pipeline import Pipeline

# Single-step pipeline that binary-encodes the categorical column(s) it is
# given.
binary = BinaryEncoder()
model_pipeline2 = Pipeline(steps=[("binary_encoder", binary)])
model_pipeline2

In [33]:
all_cat = ['region','color','fuel_type','transmission']

In [34]:
from sklearn.preprocessing import OneHotEncoder

# One-hot branch: dense output; categories unseen during fit are ignored
# (encoded as all zeros) instead of raising.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

all_cat_pipeline3 = Pipeline(steps=[("One Hot Encoder", ohe)])
all_cat_pipeline3

In [35]:
from sklearn.compose import ColumnTransformer

# Route each group of columns to its own preprocessing branch.
preprocessing = ColumnTransformer(
    remainder="passthrough",  # columns not listed below are passed through unchanged
    transformers=[
        ("num_pipeline", num_pipeline1, num_cols),       # impute + scale numeric columns
        ("binary_encoder", model_pipeline2, model),      # binary-encode the `model` column
        ("onehot_encoder", all_cat_pipeline3, all_cat),  # one-hot the remaining categoricals
    ],
)

# Display the assembled transformer
preprocessing


### KNN 

In [36]:
# imblearn's Pipeline (not sklearn's) is needed because it accepts sampler
# steps such as SMOTE.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE

# Pipeline: Preprocessing -> SMOTE -> KNN classifier
module_KNN = Pipeline(steps=[
    ("preprocessing", preprocessing),                  # numeric + categorical transformations
    ("SMOTE", SMOTE(random_state=42)),                 # oversample the minority class; seeded for reproducibility
    ("models", KNeighborsClassifier(n_neighbors=31)),  # k-nearest-neighbours classifier
])
module_KNN

In [37]:
module_KNN.fit(X , y_encoder)


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



In [38]:
import joblib 
joblib.dump(module_KNN, "module_KNN.pkl")

['module_KNN.pkl']

In [39]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     "models__n_neighbors": [ 1, 3, 5, 9, 15 , 23 , 25,27,29 , 31 ]
# }

# gsc = GridSearchCV(
#     estimator=module,
#     param_grid=param_grid,
#     scoring="f1",  
#     cv=5,
#     return_train_score=True,
#     n_jobs=-1
# )

# gsc.fit(X, y)


In [40]:
# gsc.best_params_

In [41]:
# gsc.best_score_

### LogisticRegression

In [42]:
# imblearn's Pipeline (not sklearn's) is needed because it accepts sampler
# steps such as SMOTE.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression

# Pipeline: Preprocessing -> SMOTE -> Logistic regression
module_l_r = Pipeline(steps=[
    ("preprocessing", preprocessing),                # numeric + categorical transformations
    ("SMOTE", SMOTE(random_state=42)),               # oversample the minority class; seeded for reproducibility
    ("models", LogisticRegression(max_iter=1000)),   # logistic regression classifier
])
module_l_r

In [43]:
module_l_r.fit(X , y_encoder)


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



In [44]:
import joblib 
joblib.dump(module_l_r, "module_l_r.pkl")

['module_l_r.pkl']

### SVC

In [45]:
# imblearn's Pipeline (not sklearn's) is needed because it accepts sampler
# steps such as SMOTE.
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression  # not used in this cell

# Pipeline: Preprocessing -> SMOTE -> RBF support-vector classifier
module_SVC = Pipeline(steps=[
    ("preprocessing", preprocessing),      # numeric + categorical transformations
    ("SMOTE", SMOTE(random_state=42)),     # oversample the minority class; seeded for reproducibility
    ("models", SVC(probability=True, kernel='rbf', random_state=42)),  # SVC with probability estimates enabled
])
module_SVC

In [None]:
module_SVC.fit(X , y_encoder)


`BaseEstimator._validate_data` is deprecated in 1.6 and will be removed in 1.7. Use `sklearn.utils.validation.validate_data` instead. This function becomes public and is part of the scikit-learn developer API.



In [None]:
import joblib 
joblib.dump(module_SVC, "module_SVC.pkl")

['module_SVC.pkl']

In [None]:

# Candidate classifiers to compare, as (display name, unfitted estimator)
# pairs.  The cross-validation loop below reads the estimator by position
# (index 1), so each entry must stay a 2-tuple in this order.
models = [
    ("LogisticRegression", LogisticRegression(max_iter=1000)),
    ("DecisionTree", DecisionTreeClassifier(random_state=42)),
    ("RandomForest", RandomForestClassifier(
        n_estimators=300,
        random_state=42
    )),
    # NOTE(review): k=29 here, whereas the saved module_KNN pipeline uses
    # n_neighbors=31 — confirm which value is intended.
    ("KNN", KNeighborsClassifier(n_neighbors=29)),
    ("GaussianNB", GaussianNB()),
    ("SVM", SVC(
        probability=True,
        kernel='rbf',
        random_state=42
    )),
    ("XGBoost", XGBClassifier(
        n_estimators=300,
        max_depth=6,
        learning_rate=0.05,
        eval_metric='logloss',
        random_state=42
    )),
    ("CatBoost", CatBoostClassifier(
        iterations=500,
        depth=6,
        learning_rate=0.05,
        loss_function='Logloss',
        random_state=42,
        verbose=0  # silence per-iteration training output
    )),
    ("LightGBM", LGBMClassifier(
        n_estimators=500,
        learning_rate=0.05,
        random_state=42
    ))
]


# Cross-validate every candidate inside the same preprocessing + SMOTE
# pipeline (5 folds, F1 score) and print test/train scores as percentages.
for _label, estimator in models:
    module = Pipeline(steps=[
        ("preprocessing", preprocessing),
        ("SMOTE", SMOTE(random_state=42)),  # seeded so fold scores are reproducible
        ("models", estimator),
    ])
    r = cross_validate(
        module, X, y_encoder,
        cv=5, scoring="f1", return_train_score=True, n_jobs=-1,
    )
    print(estimator)
    print("test_score f1", r["test_score"] * 100)
    print("train_score f1", r["train_score"] * 100)
    print("-" * 100)  # separator line; previously printed the literal text "-* 100"

LogisticRegression(max_iter=1000)
test_score f1 [99.56647399 99.733141   99.6897323  99.71148298 99.79818365]
train_score f1 [99.72952505 99.71866546 99.71323967 99.70781329 99.74217975]
-* 100
DecisionTreeClassifier(random_state=42)
test_score f1 [100. 100. 100. 100. 100.]
train_score f1 [100. 100. 100. 100. 100.]
-* 100
RandomForestClassifier(n_estimators=300, random_state=42)
test_score f1 [100. 100. 100. 100. 100.]
train_score f1 [100. 100. 100. 100. 100.]
-* 100
KNeighborsClassifier(n_neighbors=29)
test_score f1 [89.34082456 89.09292386 89.67091797 89.49440857 89.36472819]
train_score f1 [90.7229411  90.50316561 90.469746   90.67909454 90.53143126]
-* 100
GaussianNB()
test_score f1 [97.01459367 96.03050011 97.19861039 96.09205709 96.77850356]
train_score f1 [97.30278583 95.94612291 96.98485017 96.40555742 96.53081345]
-* 100
SVC(probability=True, random_state=42)
test_score f1 [99.13194444 99.30765902 99.17653857 99.14752203 99.23476754]
train_score f1 [99.64461602 99.63928868 99.

{
  "model": "5 Series",
  "region": "Asia",
  "color": "red",
  "fuel_type": "Petrol",
  "transmission": "Manual",
  "engine_size_l": 3.5,
  "mileage_km": 151748.0,
  "sales_volume": 8300,
  "age_car": 8,
  "price_usd": 98740.0
}


In [2]:
import joblib
from  sklearn.metrics import classification_report
# Load the object stored in "module_SVC.pkl" (the file name written by
# joblib.dump(module_SVC, ...) earlier in this notebook).
# joblib.load unpickles the file, so only load artifacts from a trusted source.
x= joblib.load("module_SVC.pkl")

In [None]:

classification_report

<function sklearn.metrics._classification.classification_report(y_true, y_pred, *, labels=None, target_names=None, sample_weight=None, digits=2, output_dict=False, zero_division='warn')>