In [None]:
# Install all required packages if not already installed
!pip install pandas matplotlib seaborn scikit-learn -q

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [None]:
# Read the car dataset from the CSV file into a DataFrame
car_df = pd.read_csv('global_cars_enhanced.csv')

# Preview the top of the frame (header line followed by the rows)
print('First 5 rows:', car_df.head(), sep='\n')

In [None]:
# Count the null entries in every column and print the per-column totals
missing_per_column = car_df.isna().sum()
print('Missing values:', missing_per_column, sep='\n')

In [None]:
# Descriptive statistics of the dataset, as produced by DataFrame.describe()
summary_stats = car_df.describe()
print('Summary statistics:', summary_stats, sep='\n')

In [None]:
# Bar chart of how many rows fall into each Price_Category value
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=car_df, x='Price_Category', ax=ax)
ax.set_title('Distribution of Car Price Category')
plt.show()

In [None]:
# Preprocessing
# Drop columns that must not be used as features:
#   - Car_ID:    not useful for classification
#   - Price_USD: we are predicting Price_Category, so Price would leak the target
# errors='ignore' skips any column that is already absent, so this cell can be
# re-run (or run on a file without one of these columns) without a KeyError.
car_df = car_df.drop(columns=['Car_ID', 'Price_USD'], errors='ignore')

# Encode all remaining categorical (object) columns using LabelEncoder.
# The fitted encoder of every column is stored in `label_encoders` so the
# integer codes can later be mapped back to the original labels.
# NOTE(review): LabelEncoder produces arbitrary integer codes; the models below
# will treat them as ordered numbers -- confirm this is acceptable for the
# nominal columns in this dataset.
label_encoders = {}
for col in car_df.select_dtypes(include=['object']).columns:
    if col == 'Price_Category':
        continue  # Skip target column
    le = LabelEncoder()
    car_df[col] = le.fit_transform(car_df[col])
    label_encoders[col] = le
    print(f'Encoded column: {col}')

print('\nDataFrame after encoding:')
print(car_df.head())

In [None]:
# Separate the target column (y) from the remaining feature columns (X)
target_col = 'Price_Category'
y = car_df[target_col]
X = car_df.drop(columns=[target_col])

In [None]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Report the shape of every split
for split_name, split_data in (
    ('X_train', X_train_scaled),
    ('X_test', X_test_scaled),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{split_name} shape: {split_data.shape}')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Create and train Logistic Regression model
# Using 'lbfgs' solver with increased max_iter for convergence.
#
# `multi_class` is deliberately NOT passed: the parameter has been deprecated
# since scikit-learn 1.5 and is scheduled for removal, so passing it emits a
# FutureWarning or fails on newer releases. With the 'lbfgs' solver a
# multinomial (softmax) model is already what gets fitted for a target with
# three or more classes, so the model is unchanged in that case.
# NOTE(review): assumes Price_Category has 3+ classes -- for a 2-class target
# this fits ordinary binary logistic regression instead; confirm.
log_reg = LogisticRegression(
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)
# Fit the model on training data
log_reg.fit(X_train_scaled, y_train)

In [None]:
# Make predictions on test data
# (class labels predicted by the fitted logistic regression model for the
# scaled test split; compared against y_test in the evaluation cell)
y_pred = log_reg.predict(X_test_scaled)

In [None]:
# Report test-set metrics for the logistic regression model
banner = "=" * 50
print(banner, "LOGISTIC REGRESSION CLASSIFICATION RESULTS", banner, sep="\n")

# Overall accuracy (fraction of test rows predicted correctly)
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")

# Per-class metrics as formatted by classification_report
lr_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(lr_report)

# Confusion matrix of true vs. predicted labels (reused by the heatmap cell)
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(cm)

In [None]:
# Draw the logistic regression confusion matrix as an annotated heatmap,
# labelling both axes with the classes the model was trained on
fig, ax = plt.subplots(figsize=(8, 6))
lr_class_labels = log_reg.classes_
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=lr_class_labels,
    yticklabels=lr_class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Logistic Regression')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Get prediction probabilities (one column per entry of log_reg.classes_)
y_pred_proba = log_reg.predict_proba(X_test_scaled)
print("\nSample prediction probabilities (first 5):")
print(y_pred_proba[:5])

# Feature importance (coefficients)
print("\nFeature Coefficients (Importance):")
feature_names = X.columns.tolist()

# coef_ holds one row per class for a multi-class target, but only a single
# row for a binary target (that row belongs to classes_[1]). Pair each row
# with the class it describes instead of indexing coef_ by class position,
# which would raise an IndexError in the binary case.
if log_reg.coef_.shape[0] == len(log_reg.classes_):
    coef_classes = log_reg.classes_
else:
    coef_classes = log_reg.classes_[1:]

for class_name, class_coefs in zip(coef_classes, log_reg.coef_):
    print(f"\n{str(class_name).upper()} class:")
    # Largest absolute coefficient first
    for feature, coef in sorted(zip(feature_names, class_coefs),
                                key=lambda x: abs(x[1]), reverse=True):
        print(f"  {feature}: {coef:.4f}")

In [None]:
# Import Decision Tree Classifier and evaluation metrics
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Hyperparameters of the Decision Tree Classifier
tree_params = {
    'criterion': 'gini',       # split criterion: 'gini' or 'entropy'
    'max_depth': 10,           # cap on tree depth (prevents overfitting)
    'min_samples_split': 10,   # minimum samples required to split a node
    'min_samples_leaf': 5,     # minimum samples required at a leaf node
    'random_state': 42,
}

# Build the tree and fit it on the training data (fit returns the estimator)
dt_classifier = DecisionTreeClassifier(**tree_params).fit(X_train_scaled, y_train)

# Predict class labels for the test data
y_pred_dt = dt_classifier.predict(X_test_scaled)

In [None]:
# Report test-set metrics for the Decision Tree model
banner = "=" * 50
print(banner, "DECISION TREE CLASSIFICATION RESULTS", banner, sep="\n")

# Overall accuracy (fraction of test rows predicted correctly)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print(f"\nAccuracy: {accuracy_dt:.4f} ({accuracy_dt*100:.2f}%)")

# Per-class metrics as formatted by classification_report
dt_report = classification_report(y_test, y_pred_dt)
print("\nClassification Report:")
print(dt_report)

# Confusion matrix of true vs. predicted labels
cm_dt = confusion_matrix(y_test, y_pred_dt)
print("\nConfusion Matrix:")
print(cm_dt)

# Draw the confusion matrix as an annotated heatmap, labelling both axes
# with the classes the tree was trained on
fig, ax = plt.subplots(figsize=(8, 6))
dt_class_labels = dt_classifier.classes_
sns.heatmap(
    cm_dt,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=dt_class_labels,
    yticklabels=dt_class_labels,
    ax=ax,
)
ax.set_title('Confusion Matrix - Decision Tree')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()

In [None]:
# Visualize the Decision Tree (limited depth for readability)
# Feature names are taken from X directly so this cell does not rely on the
# logistic-regression coefficient cell having been run first.
feature_names = X.columns.tolist()

fig, ax = plt.subplots(figsize=(30, 15))
plot_tree(dt_classifier,
          feature_names=feature_names,
          class_names=[str(c) for c in dt_classifier.classes_],
          filled=True,
          rounded=True,
          max_depth=3,   # only the top levels are drawn; the fitted tree may be deeper
          fontsize=10,
          ax=ax)
ax.set_title('Decision Tree Visualization (max_depth=3)')
fig.tight_layout()
plt.show()

In [None]:
# Feature importance from Decision Tree
# Feature names are taken from X directly so this cell does not rely on the
# logistic-regression coefficient cell having been run first.
feature_names = X.columns.tolist()
importances = dt_classifier.feature_importances_

# Text listing, most important feature first
print("\nFeature Importance (Decision Tree):")
for feature, importance in sorted(zip(feature_names, importances),
                                   key=lambda x: x[1], reverse=True):
    print(f"  {feature}: {importance:.4f}")

# Visualize Feature Importances as a bar chart in descending order
indices = importances.argsort()[::-1]
positions = range(len(feature_names))

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Feature Importances - Decision Tree')
ax.bar(positions, importances[indices], align='center')
ax.set_xticks(positions)
ax.set_xticklabels([feature_names[i] for i in indices], rotation=45, ha='right')
fig.tight_layout()
plt.show()

In [None]:
# Side-by-side accuracy of the two models, followed by a one-line verdict
banner = "=" * 50
print(banner, "MODEL COMPARISON SUMMARY", banner, sep="\n")
print(f"\nLogistic Regression Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Decision Tree Accuracy:       {accuracy_dt:.4f} ({accuracy_dt*100:.2f}%)")

# Pick the message for whichever model has the strictly higher test accuracy
if accuracy > accuracy_dt:
    verdict = "\n=> Logistic Regression performs better on this dataset."
elif accuracy_dt > accuracy:
    verdict = "\n=> Decision Tree performs better on this dataset."
else:
    verdict = "\n=> Both models perform equally on this dataset."
print(verdict)