In [6]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Toy dataset: a single categorical column holding three distinct fruit names
data = {'Fruit': ['Apple', 'Banana', 'Orange', 'Banana', 'Apple', 'Orange']}
df = pd.DataFrame(data)

# Learn the categories and encode them in one step. The encoder is handed a
# one-column DataFrame (double brackets) rather than a Series.
encoder = OneHotEncoder()
one_hot_encoded = encoder.fit_transform(df[['Fruit']])

# Names of the generated indicator columns, one per category of 'Fruit'
columns = encoder.get_feature_names_out(['Fruit'])

# fit_transform returned a sparse matrix, so densify it before wrapping it
# in a DataFrame labelled with the generated column names
one_hot_encoded_df = pd.DataFrame(
    data=one_hot_encoded.toarray(),
    columns=columns,
)

# Show the encoded table
print(one_hot_encoded_df)


   Fruit_Apple  Fruit_Banana  Fruit_Orange
0          1.0           0.0           0.0
1          0.0           1.0           0.0
2          0.0           0.0           1.0
3          0.0           1.0           0.0
4          1.0           0.0           0.0
5          0.0           0.0           1.0


In [8]:
# Inspect the feature names produced by the encoder (used as the column labels above)
columns

array(['Fruit_Apple', 'Fruit_Banana', 'Fruit_Orange'], dtype=object)

In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset: X holds the features, y the integer class labels
iris = load_iris()
X = iris.data
y = iris.target

# Convert the target labels to a 2D array (required by OneHotEncoder)
y = y.reshape(-1, 1)

# Initialize the OneHotEncoder
encoder = OneHotEncoder()

# Fit and transform the target labels; .toarray() turns the sparse result
# into a dense array with one indicator column per class
y_one_hot = encoder.fit_transform(y).toarray()

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y_one_hot, test_size=0.2, random_state=42)

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(random_state=42)

# Train the classifier on the one-hot encoded training targets
clf.fit(X_train, y_train)

# Predict the (one-hot encoded) targets for the test data
y_pred = clf.predict(X_test)

# Decode BOTH the predictions and the held-out targets back to class labels.
# Scoring the one-hot y_test directly against decoded predictions mixes two
# target formats, which accuracy_score rejects, so the comparison is done in
# label space. ravel() flattens the single-column results to 1D.
# NOTE(review): inverse_transform may reject a predicted row in which no
# indicator column is set -- confirm this cannot occur for your data.
y_pred_labels = encoder.inverse_transform(y_pred).ravel()
y_test_labels = encoder.inverse_transform(y_test).ravel()

# Calculate the accuracy of the model on the decoded labels
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print("Accuracy:", accuracy)


In [None]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Iris features, plus the class labels reshaped into a single column
# because OneHotEncoder requires 2D input
iris = load_iris()
X = iris.data
y = iris.target.reshape(-1, 1)

# One-hot encode the target and densify the sparse result
encoder = OneHotEncoder()
y_one_hot = encoder.fit_transform(y).toarray()

# Hold out 20% of the rows for evaluation, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_one_hot,
    test_size=0.2,
    random_state=42,
)

# Fit a seeded random forest on the one-hot targets and predict the test rows
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Map both the predictions and the held-out targets from one-hot form back
# to the original class labels
y_pred_labels = encoder.inverse_transform(y_pred)
y_test_labels = encoder.inverse_transform(y_test)

# Accuracy is computed on the decoded labels, not on the one-hot arrays
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print("Accuracy:", accuracy)

In [27]:
# Dimensions (rows, columns) of the one-hot encoded target array
y_one_hot.shape

(150, 3)

In [42]:
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the Iris dataset: X holds the features, y the integer class labels
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the column transformer for one-hot encoding.
# The 4th feature (index 3) is one-hot encoded and the other features are
# passed through unchanged.
# handle_unknown='ignore' matters because the encoder only learns the values
# present in the training split: without it, a test row whose value was never
# seen during fit would make transform() raise. With it, such a row simply
# gets all-zero indicator columns.
# NOTE(review): this feature is numeric; one-hot encoding it treats every
# distinct value as its own category -- presumably for demonstration only.
column_transformer = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(handle_unknown='ignore'), [3])],
    remainder='passthrough'
)

# Fit and transform the feature data using the column transformer only on the training set
X_train_encoded = column_transformer.fit_transform(X_train)

# Transform the feature data on the test set (without fitting again, to avoid leakage)
X_test_encoded = column_transformer.transform(X_test)

# Initialize the Random Forest Classifier
clf = RandomForestClassifier(random_state=42)

# Train the classifier on the one-hot encoded training data
clf.fit(X_train_encoded, y_train)

# Predict the target labels on the one-hot encoded test data
y_pred = clf.predict(X_test_encoded)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


In [41]:
# Dimensions (rows, columns) of the encoded training feature matrix
X_train_encoded.shape

(120, 25)