In [12]:
# Import library yang dibutuhkan
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, confusion_matrix

# Set pandas display options
pd.set_option('display.max_columns', None)

# Set random seed untuk reproducibility
np.random.seed(42)

In [13]:
# Download the dataset
!wget https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data

--2025-06-24 19:22:39--  https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘car.data.1’

car.data.1              [ <=>                ]  50.65K  --.-KB/s    in 0.1s    

2025-06-24 19:22:39 (525 KB/s) - ‘car.data.1’ saved [51867]



In [14]:
# ===============================================================
# 1. LOAD DATA DAN EXPLORATORY DATA ANALYSIS (EDA)
# ===============================================================
print("1. Loading data and performing EDA...")

# Load dataset
column_names = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'class']

# Load dataset (adjust path as needed)
df = pd.read_csv('car.data', names=column_names, skipinitialspace=True, na_values="?")

# Dataset Overview
print("Dataset Overview:")
print(f"Shape: {df.shape}")
print(df.info())

# Descriptive Statistics
print("\nDescriptive Statistics:")
print(df.describe())

# Categorical Statistics
print("\nCategorical Statistics:")
print(df.describe(include=['object']))

# Check Missing Values
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

print("\nMissing Values:")
print(missing_values)
print("\nMissing Percentage:")
print(missing_percentage)

# Target Distribution
print("\nTarget Distribution:")
print(df['class'].value_counts(normalize=True))

# The original data visualization is moved to after encoding

1. Loading data and performing EDA...
Dataset Overview:
Shape: (1728, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB
None

Descriptive Statistics:
       buying  maint doors persons lug_boot safety  class
count    1728   1728  1728    1728     1728   1728   1728
unique      4      4     4       3        3      3      4
top     vhigh  vhigh     2       2    small    low  unacc
freq      432    432   432     576      576    576   1210

Categorical Statistics:
       buying  maint doors persons lug_boot safety  class
count    1728   1728  1728    1728 

In [15]:
# ===============================================================
# 2. VISUALISASI DATA
# ===============================================================
print("\n2. Data Visualization...")

# Bar plot for categorical variables (all features are treated as categorical after encoding)
categorical_features = df.columns # All columns are categorical after encoding
for feature in categorical_features:
    plt.figure(figsize=(10, 5))
    df[feature].value_counts().plot(kind='bar')
    plt.title(f'Distribution of {feature}')
    plt.ylabel('Count')
    plt.xlabel(feature)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'distribution_{feature}.png')
    plt.close()

# Correlation matrix is now handled after encoding in step 5


2. Data Visualization...


In [16]:
# ===============================================================
# 3. DATA CLEANING
# ===============================================================
print("\n3. Data Cleaning...")

# Handle missing values
for column in df.columns:
    if df[column].dtype == 'object':
        # For categorical columns, fill with mode
        df[column].fillna(df[column].mode()[0], inplace=True)
    else:
        # For numeric columns, fill with median
        df[column].fillna(df[column].median(), inplace=True)

# Check for duplicates
duplicate_count = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_count}")

# Remove duplicates
df.drop_duplicates(inplace=True)
print(f"Number of rows after removing duplicates: {len(df)}")


3. Data Cleaning...
Number of duplicate rows: 0
Number of rows after removing duplicates: 1728


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[column].fillna(df[column].mode()[0], inplace=True)


In [17]:
# ===============================================================
# 4. FEATURE ENGINEERING & ENCODING
# ===============================================================
print("\n4. Feature Engineering & Encoding...")

# Store encoding maps
encoding_maps = {}

# Encoding for 'buying'
buying = {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}
df['buying'] = df['buying'].map(buying)
encoding_maps['buying'] = buying

# Encoding for 'maint'
maint = {'vhigh': 4, 'high': 3, 'med': 2, 'low': 1}
df['maint'] = df['maint'].map(maint)
encoding_maps['maint'] = maint

# Encoding for 'doors'
doors = {'2': 2, '3': 3, '4': 4, '5more': 5}
df['doors'] = df['doors'].map(doors)
encoding_maps['doors'] = doors

# Encoding for 'persons'
persons = {'2': 2, '4': 4, 'more': 5}
df['persons'] = df['persons'].map(persons)
encoding_maps['persons'] = persons

# Encoding for 'lug_boot'
lug_boot = {'small': 1, 'med': 2, 'big': 3}
df['lug_boot'] = df['lug_boot'].map(lug_boot)
encoding_maps['lug_boot'] = lug_boot

# Encoding for 'safety'
safety = {'low': 1, 'med': 2, 'high': 3}
df['safety'] = df['safety'].map(safety)
encoding_maps['safety'] = safety

# Encoding for 'class' (target)
class_ = {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4}
df['class'] = df['class'].map(class_)
encoding_maps['class'] = class_


4. Feature Engineering & Encoding...


In [18]:
# ===============================================================
# 5. CHECKING CORRELATIONS AFTER ENCODING
# ===============================================================
print("\n5. Checking correlations after encoding...")

# Create correlation heatmap
correlation = df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(correlation.round(2),
           annot=True,
           vmax=1,
           square=True,
           cmap='RdYlGn_r')
plt.title('Correlation Matrix After Encoding')
plt.tight_layout()
plt.savefig('correlation_after_encoding.png')
plt.close()


5. Checking correlations after encoding...


In [19]:
# ===============================================================
# 6. FEATURE SELECTION
# ===============================================================
print("\n6. Feature Selection...")

# Remove constant features
df = df.loc[:, df.apply(pd.Series.nunique) != 1]

# Find and remove highly correlated features
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]
                col_corr.add(colname)
    return col_corr

data_tanpa_fitur = df.drop('class', axis=1)
corr_features = correlation(data_tanpa_fitur, 0.8)
print('Correlated features: ', len(set(corr_features)))
print(corr_features)

# Remove highly correlated features
df.drop(labels=corr_features, axis=1, inplace=True)

# Final correlation check
correlation = df.corr()
plt.figure(figsize=(15, 15))
sns.heatmap(correlation.round(2),
           annot=True,
           vmax=1,
           square=True,
           cmap='RdYlGn_r')
plt.title('Final Correlation Matrix')
plt.tight_layout()
plt.savefig('final_correlation.png')
plt.close()


6. Feature Selection...
Correlated features:  0
set()


In [20]:
# ===============================================================
# 7. MODEL TRAINING
# ===============================================================
print("\n7. Model Training...")

# Split data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# Define parameter grid for GridSearchCV
param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

# Create decision tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(
    estimator=dt_classifier,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Fit the grid search to the data
grid_search.fit(X_train, y_train)

# Get the best parameters and best model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

print(f"Best Parameters: {best_params}")


7. Model Training...
Training set size: 1555 samples
Testing set size: 173 samples
Fitting 3 folds for each of 90 candidates, totalling 270 fits
Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [21]:
# ===============================================================
# 8. MODEL EVALUATION
# ===============================================================
print("\n8. Model Evaluation...")

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")

# Generate classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Generate confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

# Plot confusion matrix
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
           xticklabels=sorted(df['class'].unique()),
           yticklabels=sorted(df['class'].unique()))
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.close()

# Visualization of the Decision Tree
plt.figure(figsize=(40, 30))
plot_tree(best_model,
         feature_names=X.columns,
         class_names=[str(i) for i in sorted(df['class'].unique())],
         filled=True,
         rounded=True,
         max_depth=3)  # Limiting depth for better visualization
plt.title('Decision Tree Visualization (Limited to Depth 3)')
plt.tight_layout()
plt.savefig('decision_tree.png')
plt.close()

# Plot feature importance
feature_importance = pd.DataFrame({
    'Feature': X.columns,
    'Importance': best_model.feature_importances_
}).sort_values('Importance', ascending=False)

plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance)
plt.title('Feature Importance')
plt.tight_layout()
plt.savefig('feature_importance.png')
plt.close()

# Check performance on training data
ori_y_pred_dt_train = best_model.predict(X_train)

ori_accuracy_dt_train = accuracy_score(y_train, ori_y_pred_dt_train)
print('Akurasi pada training set: ', ori_accuracy_dt_train)

ori_precision_dt_train = precision_score(y_train, ori_y_pred_dt_train, average='micro')
print('Precision pada training set: ', ori_precision_dt_train)

ori_recall_dt_train = recall_score(y_train, ori_y_pred_dt_train, average='micro')
print('Recall pada training set: ', ori_recall_dt_train)

# Recheck performance on testing data
ori_accuracy_dt_test = accuracy_score(y_test, y_pred)
print('Akurasi pada test set: ', ori_accuracy_dt_test)

ori_precision_dt_test = precision_score(y_test, y_pred, average='micro')
print('Precision pada test set: ', ori_precision_dt_test)

ori_recall_dt_test = recall_score(y_test, y_pred, average='micro')
print('Recall pada test set: ', ori_recall_dt_test)



8. Model Evaluation...
Accuracy: 0.9827

Classification Report:
              precision    recall  f1-score   support

           1       0.99      1.00      1.00       121
           2       0.97      0.95      0.96        38
           3       0.86      0.86      0.86         7
           4       1.00      1.00      1.00         7

    accuracy                           0.98       173
   macro avg       0.96      0.95      0.95       173
weighted avg       0.98      0.98      0.98       173


Confusion Matrix:
[[121   0   0   0]
 [  1  36   1   0]
 [  0   1   6   0]
 [  0   0   0   7]]
Akurasi pada training set:  0.9961414790996784
Precision pada training set:  0.9961414790996784
Recall pada training set:  0.9961414790996784
Akurasi pada test set:  0.9826589595375722
Precision pada test set:  0.9826589595375722
Recall pada test set:  0.9826589595375722


In [22]:
# ===============================================================
# 9. CREATE MODEL COMPONENTS FOR SAVING
# ===============================================================
print("\n9. Creating and saving model components...")

# Create a dictionary of components
model_components = {
    'model': best_model,
    'feature_names': X.columns.tolist(),
    'encoding_maps': encoding_maps,
    'model_params': best_params,
    'removed_features': list(corr_features) if len(corr_features) > 0 else [],
    'target_map': class_
}

# Save model components
joblib.dump(model_components, 'car_evaluation_prediction_components.joblib')
print("Model components saved successfully as 'car_evaluation_prediction_components.joblib'")

# Function to predict class (for testing)
def predict_class(data, model_components):
    """
    Make class predictions using the trained model

    Parameters:
    -----------
    data : dict or DataFrame
        Data with features for prediction
    model_components : dict
        Dictionary containing model and preprocessing information

    Returns:
    --------
    prediction : int
        Predicted class as integer (1: 'unacc', 2: 'acc', 3: 'good', 4: 'vgood')
    prediction_label : str
        Original class label
    probability : float
        Probability of the predicted class
    """
    import pandas as pd

    # Convert single record to DataFrame if needed
    if isinstance(data, dict):
        data = pd.DataFrame([data])

    # Get components
    model = model_components['model']
    encoding_maps = model_components['encoding_maps']
    feature_names = model_components['feature_names']

    # Apply encoding to categorical features
    for col in data.columns:
        if col in encoding_maps and col != 'class':
            data[col] = data[col].map(encoding_maps[col])

    # Ensure we only use features that the model was trained on
    data_for_pred = data[feature_names].copy()

    # Make prediction
    prediction = model.predict(data_for_pred)[0]
    probabilities = model.predict_proba(data_for_pred)[0]

    # Get inverse mapping for class
    class_map_inverse = {v: k for k, v in encoding_maps['class'].items()}
    prediction_label = class_map_inverse[prediction]

    # Mapping to integer values as specified
    class_mapping = {'unacc': 1, 'acc': 2, 'good': 3, 'vgood': 4}
    prediction_int = class_mapping[prediction_label]

    return {
        'prediction': prediction_int,
        'prediction_label': prediction_label,
        'probability': probabilities[prediction]
    }

# Test the prediction function with a sample
test_sample = X_test.iloc[0].to_dict()
loaded_components = joblib.load('car_evaluation_prediction_components.joblib')
prediction_result = predict_class(test_sample, loaded_components)

print("\nTest prediction result:")
print(prediction_result)
print("Actual value:", y_test.iloc[0])


9. Creating and saving model components...
Model components saved successfully as 'car_evaluation_prediction_components.joblib'

Test prediction result:
{'prediction': 2, 'prediction_label': 'acc', 'probability': np.float64(0.0)}
Actual value: 1
