In [12]:
!pip install --upgrade scikit-learn --user




In [1]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from typing import List

def imput_onehot(datas: pd.DataFrame, column_list: List[str]) -> pd.DataFrame:
    """Impute missing categories with the mode, then one-hot encode the columns.

    For every column named in ``column_list``:

    1. Empty strings are treated as missing values (converted to NaN).
    2. Missing values are filled with the column's most frequent value.
    3. The column is one-hot encoded with the first category dropped, and the
       original column is replaced by the resulting indicator columns, which
       are appended at the end of the frame and named
       ``<column>_<category>``.

    Args:
        datas: Input frame. It is not modified; a copy is transformed.
        column_list: Names of the categorical columns to impute and encode.

    Returns:
        A new DataFrame with the same index as ``datas`` in which each listed
        column has been replaced by its dense one-hot indicator columns.

    Raises:
        KeyError: If a name in ``column_list`` is not a column of the frame.
    """
    # Work on a copy. Aliasing the argument would write the first column's
    # imputed values back into the caller's DataFrame as a hidden side effect.
    data = datas.copy()

    for column_name in column_list:
        # Replace empty strings with NaN. `np.nan` is the canonical spelling;
        # the `np.NaN` alias no longer exists in NumPy 2.0.
        data[column_name] = data[column_name].replace('', np.nan)

        # Fill missing entries with the most frequent value of the column.
        imput = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
        column_2d = data[column_name].to_numpy().reshape(-1, 1)
        data[column_name] = imput.fit_transform(column_2d).ravel()

        # One-hot encode to a dense array; drop='first' omits the first
        # category so the indicator columns are not perfectly collinear.
        encoder = OneHotEncoder(sparse_output=False, drop='first')
        encoded_data = encoder.fit_transform(
            data[column_name].to_numpy().reshape(-1, 1)
        )

        # Reuse the frame's own index: with a default RangeIndex here,
        # pd.concat(axis=1) would align on labels and produce NaN-padded,
        # misaligned rows whenever the input index is not 0..n-1.
        encoded_df = pd.DataFrame(
            encoded_data,
            columns=encoder.get_feature_names_out([column_name]),
            index=data.index,
        )

        # Swap the original column for its encoded columns.
        data = pd.concat([data.drop(columns=[column_name]), encoded_df], axis=1)

    return data

In [2]:
# Example usage: blank out a few entries in three categorical columns so the
# imputation step has something to fill in, then impute and one-hot encode them.
cols = ['lunch', 'race/ethnicity', 'parental level of education']
df = pd.read_csv('stud.csv')
# .loc slices by label and includes the end label, so `:4` covers rows 0 through 4.
df.loc[:4, cols] = ''
df_en = imput_onehot(df, cols)

In [3]:
# Preview the first ten rows of the encoded frame. Rows 0-4 are the ones whose
# categorical values were blanked in the example above, so they show the imputed
# (most frequent) category in the one-hot columns.
df_en.head(10)

Unnamed: 0,gender,test preparation course,math score,reading score,writing score,lunch_standard,race/ethnicity_group B,race/ethnicity_group C,race/ethnicity_group D,race/ethnicity_group E,parental level of education_bachelor's degree,parental level of education_high school,parental level of education_master's degree,parental level of education_some college,parental level of education_some high school
0,female,none,72,72,74,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,female,completed,69,90,88,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,female,none,90,95,93,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,male,none,47,57,44,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,male,none,76,78,75,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5,female,none,71,83,78,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,female,completed,88,95,92,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7,male,none,40,43,39,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
8,male,completed,64,64,67,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
9,female,none,38,60,50,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [4]:
# Report the installed scikit-learn version. This matters because
# OneHotEncoder's `sparse_output` argument is only available from version 1.2.
import sklearn
print(sklearn.__version__)

1.5.0
