In [56]:
import pandas as pd

## 1. Loading dataset

In [57]:
music_data = pd.read_csv('music.csv')
music_data

Unnamed: 0,age,gender,genre
0,20.0,1,HipHop
1,23.0,1,HipHop
2,25.0,1,HipHop
3,26.0,1,Jazz
4,29.0,1,Jazz
5,30.0,1,Jazz
6,30.0,1,Jazz
7,31.0,1,Classical
8,33.0,1,Classical
9,37.0,1,Classical


## 2. Clean the data

### Remove duplicates

In [58]:
music_data[music_data.duplicated()]

Unnamed: 0,age,gender,genre
6,30.0,1,Jazz
19,35.0,0,Classical


In [59]:
music_data.shape

(21, 3)

In [60]:
music_data = music_data.drop_duplicates()
music_data.shape

(19, 3)

### Impute missing values

In [61]:
from sklearn.impute import KNNImputer

In [62]:
imputer = KNNImputer(n_neighbors=3)
imputer.fit_transform(music_data)

ValueError: could not convert string to float: 'HipHop'

In [63]:
music_data['genre'].unique()

array(['HipHop', 'Jazz', 'Classical', 'Dance', 'Acoustic'], dtype=object)

The error you're encountering, `ValueError: could not convert string to float: 'HipHop'` occurs because the KNNImputer expects numerical input data, and it cannot handle string values.

## Encode data
Encode the string values in your target column into numerical values. Then apply KNNImputer to fill NaN values.

In [64]:
from sklearn.preprocessing import OneHotEncoder

In [65]:
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform='pandas')

In [66]:
genre_transformed_data = enc.fit_transform(music_data[['genre']])
genre_transformed_data

Unnamed: 0,genre_Acoustic,genre_Classical,genre_Dance,genre_HipHop,genre_Jazz
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,1.0
7,0.0,1.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0
10,0.0,0.0,1.0,0.0,0.0


In [67]:
encoded_music_data = pd.concat([music_data, genre_transformed_data], axis=1).drop(columns=['genre'])
encoded_music_data

Unnamed: 0,age,gender,genre_Acoustic,genre_Classical,genre_Dance,genre_HipHop,genre_Jazz
0,20.0,1,0.0,0.0,0.0,1.0,0.0
1,23.0,1,0.0,0.0,0.0,1.0,0.0
2,25.0,1,0.0,0.0,0.0,1.0,0.0
3,26.0,1,0.0,0.0,0.0,0.0,1.0
4,29.0,1,0.0,0.0,0.0,0.0,1.0
5,30.0,1,0.0,0.0,0.0,0.0,1.0
7,31.0,1,0.0,1.0,0.0,0.0,0.0
8,33.0,1,0.0,1.0,0.0,0.0,0.0
9,37.0,1,0.0,1.0,0.0,0.0,0.0
10,20.0,0,0.0,0.0,1.0,0.0,0.0


## 3. Split dataset into train and test datasets

In [68]:
from sklearn.model_selection import train_test_split

X = encoded_music_data[['age', 'gender']]
X

Unnamed: 0,age,gender
0,20.0,1
1,23.0,1
2,25.0,1
3,26.0,1
4,29.0,1
5,30.0,1
7,31.0,1
8,33.0,1
9,37.0,1
10,20.0,0


In [69]:
y = encoded_music_data.drop(columns=['age', 'gender'])
y

Unnamed: 0,genre_Acoustic,genre_Classical,genre_Dance,genre_HipHop,genre_Jazz
0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,0.0,0.0,1.0
7,0.0,1.0,0.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0
9,0.0,1.0,0.0,0.0,0.0
10,0.0,0.0,1.0,0.0,0.0


In [70]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=10)
X_train

Unnamed: 0,age,gender
14,27.0,0
9,37.0,1
15,30.0,0
17,34.0,0
20,,0
13,26.0,0
12,25.0,0
1,23.0,1
0,20.0,1
16,31.0,0


## 4. Create a model

In [71]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()

## 5. Train models

In [77]:
model.fit(X_train, y_train)

## 6. Make predictions

In [78]:
predictions = model.predict(X_test)
predictions

array([[0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.]])

In [80]:
X_test

Unnamed: 0,age,gender
3,26.0,1
8,33.0,1
18,35.0,0
5,30.0,1
7,31.0,1
2,25.0,1
11,21.0,0


In [81]:
y_test

Unnamed: 0,genre_Acoustic,genre_Classical,genre_Dance,genre_HipHop,genre_Jazz
3,0.0,0.0,0.0,0.0,1.0
8,0.0,1.0,0.0,0.0,0.0
18,0.0,1.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,1.0
7,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0
11,0.0,0.0,1.0,0.0,0.0


## 7. Evaluate

In [83]:
from sklearn.metrics import accuracy_score

accuracy_score  = accuracy_score(y_test, predictions)
accuracy_score

1.0