In [1]:
pip install ucimlrepo

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
raisin = fetch_ucirepo(id=850) 
  
# data (as pandas dataframes) 
X = raisin.data.features 
y = raisin.data.targets 
  
# metadata 
print(raisin.metadata) 
  
# variable information 
print(raisin.variables) 

{'uci_id': 850, 'name': 'Raisin', 'repository_url': 'https://archive.ics.uci.edu/dataset/850/raisin', 'data_url': 'https://archive.ics.uci.edu/static/public/850/data.csv', 'abstract': 'Images of the Kecimen and Besni raisin varieties were obtained with CVS. A total of 900 raisins were used, including 450 from both varieties, and 7 morphological features were extracted.', 'area': 'Biology', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 900, 'num_features': 7, 'feature_types': ['Real', 'Integer'], 'demographics': [], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Fri Jan 05 2024', 'dataset_doi': '10.24432/C5660T', 'creators': ['İ̇lkay Çinar', 'Murat Koklu', 'Sakir Tasdemir'], 'intro_paper': {'ID': 261, 'type': 'NATIVE', 'title': 'Kuru Üzüm Tanelerinin Makine Görüşü ve Yapay Zeka Yöntemleri Kullanılarak Sınıflandırılması', 'authors': 'İ̇lkay Çinar

In [3]:
print(X.head())
print(y.head())

print(X.isnull().sum())

print(y.value_counts())

    Area  MajorAxisLength  MinorAxisLength  Eccentricity  ConvexArea  \
0  87524       442.246011       253.291155      0.819738       90546   
1  75166       406.690687       243.032436      0.801805       78789   
2  90856       442.267048       266.328318      0.798354       93717   
3  45928       286.540559       208.760042      0.684989       47336   
4  79408       352.190770       290.827533      0.564011       81463   

     Extent  Perimeter  
0  0.758651   1184.040  
1  0.684130   1121.786  
2  0.637613   1208.575  
3  0.699599    844.162  
4  0.792772   1073.251  
     Class
0  Kecimen
1  Kecimen
2  Kecimen
3  Kecimen
4  Kecimen
Area               0
MajorAxisLength    0
MinorAxisLength    0
Eccentricity       0
ConvexArea         0
Extent             0
Perimeter          0
dtype: int64
Class  
Besni      450
Kecimen    450
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

y_encoded = LabelEncoder().fit_transform(y.values.ravel())

splits = {
    "20/80": train_test_split(X, y_encoded, train_size=0.2, random_state=42, stratify=y_encoded),
    "50/50": train_test_split(X, y_encoded, train_size=0.5, random_state=42, stratify=y_encoded),
    "80/20": train_test_split(X, y_encoded, train_size=0.8, random_state=42, stratify=y_encoded)
}

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

def tune_hyperparameters(X_train, y_train, X_train_scaled, clf_name):
    if clf_name == "Random Forest":
        param_grid = {"max_depth": [10, 20, 30]}
        model = RandomForestClassifier(random_state=42)
    elif clf_name == "SVM":
        param_grid = {"C": [0.1, 1, 10, 100]}
        model = SVC(kernel="rbf", random_state=42)
    elif clf_name == "Logistic Regression":
        param_grid = {"C": [0.1, 1, 10, 100]}
        model = LogisticRegression(random_state=42, max_iter=1000)

    grid = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train_scaled if clf_name in ["SVM", "Logistic Regression"] else X_train, y_train)
    return grid.best_params_

best_params = {}

for split_name, (X_train, X_test, y_train, y_test) in splits.items():
    X_train_scaled = StandardScaler().fit_transform(X_train)

    best_params[split_name] = {
        "Random Forest": tune_hyperparameters(X_train, y_train, X_train_scaled, "Random Forest"),
        "SVM": tune_hyperparameters(X_train, y_train, X_train_scaled, "SVM"),
        "Logistic Regression": tune_hyperparameters(X_train, y_train, X_train_scaled, "Logistic Regression"),
    }

In [6]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

train_acc = {}
val_acc = {}
test_acc = {}

for split_name, (X_train, X_test, y_train, y_test) in splits.items():
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    rf_params = best_params[split_name]["Random Forest"]
    svm_params = best_params[split_name]["SVM"]
    lr_params = best_params[split_name]["Logistic Regression"]

    rf = RandomForestClassifier(random_state=42, **rf_params)
    svm = SVC(random_state=42, **svm_params)
    lr = LogisticRegression(random_state=42, max_iter=1000, **lr_params)

    rf.fit(X_train, y_train)
    svm.fit(X_train_scaled, y_train)
    lr.fit(X_train_scaled, y_train)

    rf_val_acc = cross_val_score(rf, X_train, y_train, cv=3, scoring="accuracy").mean()
    svm_val_acc = cross_val_score(svm, X_train_scaled, y_train, cv=3, scoring="accuracy").mean()
    lr_val_acc = cross_val_score(lr, X_train_scaled, y_train, cv=3, scoring="accuracy").mean()

    train_acc[split_name] = {
        "Random Forest": accuracy_score(y_train, rf.predict(X_train)),
        "SVM": accuracy_score(y_train, svm.predict(X_train_scaled)),
        "Logistic Regression": accuracy_score(y_train, lr.predict(X_train_scaled)),
    }

    val_acc[split_name] = {
        "Random Forest": rf_val_acc,
        "SVM": svm_val_acc,
        "Logistic Regression": lr_val_acc,
    }

    test_acc[split_name] = {
        "Random Forest": accuracy_score(y_test, rf.predict(X_test)),
        "SVM": accuracy_score(y_test, svm.predict(X_test_scaled)),
        "Logistic Regression": accuracy_score(y_test, lr.predict(X_test_scaled)),
    }

In [7]:
import pandas as pd

results = []

for split_name in splits.keys():
    for clf_name in ["Random Forest", "SVM", "Logistic Regression"]:
        results.append({
            "Classifier": clf_name,
            "Train/Test": split_name,
            "Training Accuracy": train_acc[split_name][clf_name],
            "Validation Accuracy": val_acc[split_name][clf_name],
            "Testing Accuracy": test_acc[split_name][clf_name],
            "Best Hyperparameter": best_params[split_name][clf_name]
        })

results_df = pd.DataFrame(results)

print(results_df)

            Classifier Train/Test  Training Accuracy  Validation Accuracy  \
0        Random Forest      20/80           1.000000             0.838889   
1                  SVM      20/80           0.866667             0.866667   
2  Logistic Regression      20/80           0.877778             0.877778   
3        Random Forest      50/50           0.997778             0.873333   
4                  SVM      50/50           0.886667             0.880000   
5  Logistic Regression      50/50           0.882222             0.871111   
6        Random Forest      80/20           1.000000             0.852778   
7                  SVM      80/20           0.869444             0.865278   
8  Logistic Regression      80/20           0.862500             0.866667   

   Testing Accuracy Best Hyperparameter  
0          0.845833   {'max_depth': 10}  
1          0.856944            {'C': 1}  
2          0.863889          {'C': 100}  
3          0.831111   {'max_depth': 10}  
4          0.848889

In [8]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
early_stage_diabetes_risk_prediction = fetch_ucirepo(id=529) 
  
# data (as pandas dataframes) 
X_1 = early_stage_diabetes_risk_prediction.data.features 
y_1 = early_stage_diabetes_risk_prediction.data.targets 
  
# metadata 
print(early_stage_diabetes_risk_prediction.metadata) 
  
# variable information 
print(early_stage_diabetes_risk_prediction.variables) 

{'uci_id': 529, 'name': 'Early Stage Diabetes Risk Prediction', 'repository_url': 'https://archive.ics.uci.edu/dataset/529/early+stage+diabetes+risk+prediction+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/529/data.csv', 'abstract': 'This dataset contains the sign and symptpom data of newly diabetic or would be diabetic patient. ', 'area': 'Computer Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 520, 'num_features': 16, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Gender'], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2020, 'last_updated': 'Mon Mar 04 2024', 'dataset_doi': '10.24432/C5VG8H', 'creators': [], 'intro_paper': {'ID': 397, 'type': 'NATIVE', 'title': 'Likelihood Prediction of Diabetes at Early Stage Using Data Mining Techniques', 'authors': 'M. M. F. Islam, Rahatara Ferdousi, Sadikur Rahman, Humayra Yas

In [9]:
print(X_1.head())
print(y_1.head())

   age gender polyuria polydipsia sudden_weight_loss weakness polyphagia  \
0   40   Male       No        Yes                 No      Yes         No   
1   58   Male       No         No                 No      Yes         No   
2   41   Male      Yes         No                 No      Yes        Yes   
3   45   Male       No         No                Yes      Yes        Yes   
4   60   Male      Yes        Yes                Yes      Yes        Yes   

  genital_thrush visual_blurring itching irritability delayed_healing  \
0             No              No     Yes           No             Yes   
1             No             Yes      No           No              No   
2             No              No     Yes           No             Yes   
3            Yes              No     Yes           No             Yes   
4             No             Yes     Yes          Yes             Yes   

  partial_paresis muscle_stiffness alopecia obesity  
0              No              Yes      Yes     Ye

In [10]:
y_1_encoded = LabelEncoder().fit_transform(y_1.values.ravel())

X_1_encoded = X_1.copy()
for col in X_1.columns:
    if X_1[col].dtype == 'object':
        X_1_encoded[col] = LabelEncoder().fit_transform(X_1[col])

X_1_scaled = StandardScaler().fit_transform(X_1_encoded)

In [11]:
splits_1 = {
    "20/80": train_test_split(X_1_scaled, y_1_encoded, train_size=0.2, random_state=42, stratify=y_1_encoded),
    "50/50": train_test_split(X_1_scaled, y_1_encoded, train_size=0.5, random_state=42, stratify=y_1_encoded),
    "80/20": train_test_split(X_1_scaled, y_1_encoded, train_size=0.8, random_state=42, stratify=y_1_encoded)
}

In [12]:
def tune_hyperparameters(X_train, y_train, clf_name):
    if clf_name == "Random Forest":
        param_grid = {"max_depth": [10, 20, 30]}
        model = RandomForestClassifier(random_state=42)
    elif clf_name == "SVM":
        param_grid = {"C": [0.1, 1, 10, 100]}
        model = SVC(kernel="rbf", random_state=42)
    elif clf_name == "Logistic Regression":
        param_grid = {"C": [0.1, 1, 10, 100]}
        model = LogisticRegression(random_state=42, max_iter=1000)

    grid = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)
    return grid.best_params_

best_params_1 = {}

for split_name, (X_train, X_test, y_train, y_test) in splits_1.items():
    best_params_1[split_name] = {
        "Random Forest": tune_hyperparameters(X_train, y_train, "Random Forest"),
        "SVM": tune_hyperparameters(X_train, y_train, "SVM"),
        "Logistic Regression": tune_hyperparameters(X_train, y_train, "Logistic Regression"),
    }

In [13]:
train_acc_1 = {}
val_acc_1 = {}
test_acc_1 = {}

for split_name, (X_train, X_test, y_train, y_test) in splits_1.items():
    rf_params = best_params_1[split_name]["Random Forest"]
    svm_params = best_params_1[split_name]["SVM"]
    lr_params = best_params_1[split_name]["Logistic Regression"]

    rf = RandomForestClassifier(random_state=42, **rf_params)
    svm = SVC(random_state=42, **svm_params)
    lr = LogisticRegression(random_state=42, max_iter=1000, **lr_params)

    rf.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    lr.fit(X_train, y_train)

    rf_val_acc = cross_val_score(rf, X_train, y_train, cv=3, scoring="accuracy").mean()
    svm_val_acc = cross_val_score(svm, X_train, y_train, cv=3, scoring="accuracy").mean()
    lr_val_acc = cross_val_score(lr, X_train, y_train, cv=3, scoring="accuracy").mean()

    train_acc_1[split_name] = {
        "Random Forest": accuracy_score(y_train, rf.predict(X_train)),
        "SVM": accuracy_score(y_train, svm.predict(X_train)),
        "Logistic Regression": accuracy_score(y_train, lr.predict(X_train)),
    }

    val_acc_1[split_name] = {
        "Random Forest": rf_val_acc,
        "SVM": svm_val_acc,
        "Logistic Regression": lr_val_acc,
    }

    test_acc_1[split_name] = {
        "Random Forest": accuracy_score(y_test, rf.predict(X_test)),
        "SVM": accuracy_score(y_test, svm.predict(X_test)),
        "Logistic Regression": accuracy_score(y_test, lr.predict(X_test)),
    }

In [14]:
results_1 = []

for split_name in splits_1.keys():
    for clf_name in ["Random Forest", "SVM", "Logistic Regression"]:
        results_1.append({
            "Classifier": clf_name,
            "Train/Test": split_name,
            "Training Accuracy": train_acc_1[split_name][clf_name],
            "Validation Accuracy": val_acc_1[split_name][clf_name],
            "Testing Accuracy": test_acc_1[split_name][clf_name],
            "Best Hyperparameter": best_params_1[split_name][clf_name]
        })

results_df_1 = pd.DataFrame(results_1)

print(results_df_1)

            Classifier Train/Test  Training Accuracy  Validation Accuracy  \
0        Random Forest      20/80           1.000000             0.893557   
1                  SVM      20/80           0.990385             0.913165   
2  Logistic Regression      20/80           0.942308             0.913445   
3        Random Forest      50/50           1.000000             0.957676   
4                  SVM      50/50           0.984615             0.946093   
5  Logistic Regression      50/50           0.950000             0.919139   
6        Random Forest      80/20           1.000000             0.966375   
7                  SVM      80/20           0.995192             0.956748   
8  Logistic Regression      80/20           0.939904             0.918257   

   Testing Accuracy Best Hyperparameter  
0          0.944712   {'max_depth': 10}  
1          0.937500           {'C': 10}  
2          0.889423            {'C': 1}  
3          0.950000   {'max_depth': 10}  
4          0.961538

In [15]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
car_evaluation = fetch_ucirepo(id=19) 
  
# data (as pandas dataframes) 
X_2 = car_evaluation.data.features 
y_2 = car_evaluation.data.targets 
  
# metadata 
print(car_evaluation.metadata) 
  
# variable information 
print(car_evaluation.variables) 

{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'ID': 249, 'type': 'NATIVE', 'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'venue': '8th Intl Workshop on Expert Systems and their Applications, 

In [16]:
print(X_2.head())
print(y_2.head())

  buying  maint doors persons lug_boot safety
0  vhigh  vhigh     2       2    small    low
1  vhigh  vhigh     2       2    small    med
2  vhigh  vhigh     2       2    small   high
3  vhigh  vhigh     2       2      med    low
4  vhigh  vhigh     2       2      med    med
   class
0  unacc
1  unacc
2  unacc
3  unacc
4  unacc


In [17]:
y_2_binary = y_2.copy()
y_2_binary["class"] = y_2_binary["class"].apply(lambda x: 0 if x == "unacc" else 1)

X_2_encoded = X_2.copy()
for col in X_2.columns:
    X_2_encoded[col] = LabelEncoder().fit_transform(X_2[col])

In [18]:
splits_2 = {
    "20/80": train_test_split(X_2_encoded, y_2_binary["class"], train_size=0.2, random_state=42, stratify=y_2_binary["class"]),
    "50/50": train_test_split(X_2_encoded, y_2_binary["class"], train_size=0.5, random_state=42, stratify=y_2_binary["class"]),
    "80/20": train_test_split(X_2_encoded, y_2_binary["class"], train_size=0.8, random_state=42, stratify=y_2_binary["class"])
}

In [19]:
def tune_hyperparameters(X_train, y_train, clf_name):
    if clf_name == "Random Forest":
        param_grid = {"max_depth": [10, 20, 30]}
        model = RandomForestClassifier(random_state=42)
    elif clf_name == "SVM":
        param_grid = {"C": [0.1, 1, 10, 100]}
        model = SVC(kernel="rbf", random_state=42)
    elif clf_name == "Logistic Regression":
        param_grid = {"C": [0.1, 1, 10, 100]}
        model = LogisticRegression(random_state=42, max_iter=1000)

    grid = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)
    return grid.best_params_

best_params_2 = {}

for split_name, (X_train, X_test, y_train, y_test) in splits_2.items():
    best_params_2[split_name] = {
        "Random Forest": tune_hyperparameters(X_train, y_train, "Random Forest"),
        "SVM": tune_hyperparameters(X_train, y_train, "SVM"),
        "Logistic Regression": tune_hyperparameters(X_train, y_train, "Logistic Regression"),
    }

In [20]:
train_acc_2 = {}
val_acc_2 = {}
test_acc_2 = {}

for split_name, (X_train, X_test, y_train, y_test) in splits_2.items():
    rf_params = best_params_2[split_name]["Random Forest"]
    svm_params = best_params_2[split_name]["SVM"]
    lr_params = best_params_2[split_name]["Logistic Regression"]

    rf = RandomForestClassifier(random_state=42, **rf_params)
    svm = SVC(random_state=42, **svm_params)
    lr = LogisticRegression(random_state=42, max_iter=1000, **lr_params)

    rf.fit(X_train, y_train)
    svm.fit(X_train, y_train)
    lr.fit(X_train, y_train)

    rf_val_acc = cross_val_score(rf, X_train, y_train, cv=3, scoring="accuracy").mean()
    svm_val_acc = cross_val_score(svm, X_train, y_train, cv=3, scoring="accuracy").mean()
    lr_val_acc = cross_val_score(lr, X_train, y_train, cv=3, scoring="accuracy").mean()

    train_acc_2[split_name] = {
        "Random Forest": accuracy_score(y_train, rf.predict(X_train)),
        "SVM": accuracy_score(y_train, svm.predict(X_train)),
        "Logistic Regression": accuracy_score(y_train, lr.predict(X_train)),
    }

    val_acc_2[split_name] = {
        "Random Forest": rf_val_acc,
        "SVM": svm_val_acc,
        "Logistic Regression": lr_val_acc,
    }

    test_acc_2[split_name] = {
        "Random Forest": accuracy_score(y_test, rf.predict(X_test)),
        "SVM": accuracy_score(y_test, svm.predict(X_test)),
        "Logistic Regression": accuracy_score(y_test, lr.predict(X_test)),
    }

In [21]:
results_2 = []

for split_name in splits_2.keys():
    for clf_name in ["Random Forest", "SVM", "Logistic Regression"]:
        results_2.append({
            "Classifier": clf_name,
            "Train/Test": split_name,
            "Training Accuracy": train_acc_2[split_name][clf_name],
            "Validation Accuracy": val_acc_2[split_name][clf_name],
            "Testing Accuracy": test_acc_2[split_name][clf_name],
            "Best Hyperparameter": best_params_2[split_name][clf_name]
        })

results_df_2 = pd.DataFrame(results_2)

print(results_df_2)

            Classifier Train/Test  Training Accuracy  Validation Accuracy  \
0        Random Forest      20/80           1.000000             0.942029   
1                  SVM      20/80           0.997101             0.901449   
2  Logistic Regression      20/80           0.753623             0.730435   
3        Random Forest      50/50           1.000000             0.968750   
4                  SVM      50/50           1.000000             0.975694   
5  Logistic Regression      50/50           0.717593             0.712963   
6        Random Forest      80/20           1.000000             0.987692   
7                  SVM      80/20           1.000000             0.992042   
8  Logistic Regression      80/20           0.719247             0.711270   

   Testing Accuracy Best Hyperparameter  
0          0.952278   {'max_depth': 10}  
1          0.962401           {'C': 10}  
2          0.715835            {'C': 1}  
3          0.984954   {'max_depth': 20}  
4          0.988426