In [19]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Path to the dataset
car_path = "car.data"

# Load the dataset
data = pd.read_csv(car_path, header=None)

# Assigning column names based on typical Car Evaluation Dataset
data.columns = [
    "buying_price", "maint_cost", "doors", "persons", "lug_boot", "safety", "class_label"
]

# Initialize LabelEncoder
encoder = LabelEncoder()

# Encode all columns in the dataset
encoded_data = data.apply(encoder.fit_transform)

# Features and labels
X = encoded_data.drop("class_label", axis=1)
y = encoded_data["class_label"]

# Define splits
splits = {
    "20/80": (0.2, 0.8),
    "50/50": (0.5, 0.5),
    "80/20": (0.8, 0.2),
}

# Models to evaluate
models = {
    "SVM": SVC(kernel="linear"),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Decision Tree": DecisionTreeClassifier(),
}

# Results storage
results = {}

# Perform train-test splits and evaluations
for split_name, (train_size, test_size) in splits.items():
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_size, test_size=test_size, random_state=42)
    
    results[split_name] = {}
    
    for model_name, model in models.items():
        # Cross-validation scores
        cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        test_accuracy = accuracy_score(y_test, y_pred)
        
        # Calculate Weighted Average Accuracy
        weighted_accuracy = (
            (cv_scores.mean() * len(X_train) + test_accuracy * len(X_test))
            / (len(X_train) + len(X_test))
        )
        
        # Store results
        results[split_name][model_name] = {
            "Cross-validation Mean Accuracy": cv_scores.mean(),
            "Test Accuracy": test_accuracy,
            "Average Weighted Accuracy": weighted_accuracy,
        }

# Print the results
for split, models in results.items():
    print(f"\nResults for {split} split:")
    weighted_accuracies = []
    for model_name, metrics in models.items():
        print(f"  {model_name}:")
        print(f"    Cross-validation Mean Accuracy: {metrics['Cross-validation Mean Accuracy']:.4f}")
        print(f"    Test Accuracy: {metrics['Test Accuracy']:.4f}")
        print(f"    Average Weighted Accuracy: {metrics['Average Weighted Accuracy']:.4f}")
        weighted_accuracies.append(metrics["Average Weighted Accuracy"])
    
    # Calculate and display average weighted accuracy for the partition
    partition_average = sum(weighted_accuracies) / len(weighted_accuracies)
    print(f"  Average Weighted Accuracy for {split} partition: {partition_average:.4f}")


Results for 20/80 split:
  SVM:
    Cross-validation Mean Accuracy: 0.6754
    Test Accuracy: 0.7325
    Average Weighted Accuracy: 0.7211
  Random Forest:
    Cross-validation Mean Accuracy: 0.8667
    Test Accuracy: 0.9147
    Average Weighted Accuracy: 0.9051
  Decision Tree:
    Cross-validation Mean Accuracy: 0.9072
    Test Accuracy: 0.9443
    Average Weighted Accuracy: 0.9369
  Average Weighted Accuracy for 20/80 partition: 0.8544

Results for 50/50 split:
  SVM:
    Cross-validation Mean Accuracy: 0.7199
    Test Accuracy: 0.7350
    Average Weighted Accuracy: 0.7274
  Random Forest:
    Cross-validation Mean Accuracy: 0.9433
    Test Accuracy: 0.9745
    Average Weighted Accuracy: 0.9589
  Decision Tree:
    Cross-validation Mean Accuracy: 0.9548
    Test Accuracy: 0.9734
    Average Weighted Accuracy: 0.9641
  Average Weighted Accuracy for 50/50 partition: 0.8835

Results for 80/20 split:
  SVM:
    Cross-validation Mean Accuracy: 0.7301
    Test Accuracy: 0.6965
    Averag