In [1]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [5]:
# Load the dataset
df = pd.read_csv(r"C:\Users\Aftab\Downloads\beauty.csv", sep=";")

In [7]:
display(df)

Unnamed: 0,wage,exper,union,goodhlth,black,female,married,service,educ,looks
0,5.73,30,0,1,0,1,1,1,14,4
1,4.28,28,0,1,0,1,1,0,12,3
2,7.96,35,0,1,0,1,0,0,10,4
3,11.57,38,0,1,0,0,1,1,16,3
4,11.42,27,0,1,0,0,1,0,16,3
...,...,...,...,...,...,...,...,...,...,...
1255,1.61,25,0,1,1,1,0,1,12,3
1256,1.68,4,0,1,0,1,1,1,12,2
1257,3.29,35,0,1,1,1,0,1,12,3
1258,2.31,15,0,1,1,1,1,1,10,3


In [23]:
df.isna().sum()

wage        0
exper       0
union       0
goodhlth    0
black       0
female      0
married     0
service     0
educ        0
looks       0
cluster     0
dtype: int64

In [9]:
# Ignore 'wage' column (assuming it's the y column)
X = df.drop(columns=['wage'])

In [11]:
# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [13]:
# Perform clustering using KMeans
kmeans = KMeans(n_clusters=3, random_state=42)
df['cluster'] = kmeans.fit_predict(X_scaled)

In [15]:
# Split dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['cluster']), df['cluster'], test_size=0.2, random_state=42)

In [17]:
# Define classifiers and parameters for GridSearchCV
models = {
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(),
    "KNN": KNeighborsClassifier()
}

params = {
    "RandomForest": {"n_estimators": [50, 100, 150]},
    "SVM": {"C": [0.1, 1, 10], "kernel": ["linear", "rbf"]},
    "KNN": {"n_neighbors": [3, 5, 7]}
}

In [19]:
# Perform classification with hyperparameter tuning
best_models = {}
for model_name in models:
    grid_search = GridSearchCV(models[model_name], params[model_name], cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)
    best_models[model_name] = grid_search.best_estimator_
    print(f"Best parameters for {model_name}: {grid_search.best_params_}")

Best parameters for RandomForest: {'n_estimators': 150}
Best parameters for SVM: {'C': 10, 'kernel': 'linear'}
Best parameters for KNN: {'n_neighbors': 3}


In [20]:
# Evaluate the models
for model_name, model in best_models.items():
    accuracy = model.score(X_test, y_test)
    print(f"{model_name} Accuracy: {accuracy:.2f}")

RandomForest Accuracy: 0.97
SVM Accuracy: 1.00
KNN Accuracy: 0.73
