Importing the Mushroom dataset from the UCI Machine Learning Repository

In [1]:
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split

# Download dataset id 73 from the UCI repository via ucimlrepo
mushroom = fetch_ucirepo(id=73)

# Pull the feature table and the target table out of the fetched result
X_data, y_data = mushroom.data.features, mushroom.data.targets



Cleaning the data: one-hot encoding the categorical features and imputing missing values

In [7]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Assemble the features and the target into a single working frame.
df = pd.DataFrame(data=X_data, columns=mushroom.feature_names)
df['class'] = y_data

# Every column except the trailing 'class' target is treated as categorical.
categorical_cols = df.columns[:-1]  # Excluding the target variable

# Impute BEFORE encoding. pd.get_dummies drops NaN by default (dummy_na=False),
# turning a missing value into an all-zero row of dummies, so an imputer applied
# after encoding has nothing left to fill. Only the features are imputed; the
# target column is left untouched.
# NOTE(review): the imputer is fitted on the full dataset before the
# train/validation/test split, so the fill value is learned from all rows.
imputer = SimpleImputer(strategy='most_frequent')
df_filled = pd.DataFrame(
    imputer.fit_transform(df[categorical_cols]),
    columns=categorical_cols,
    index=df.index,
)
df_filled['class'] = df['class']

# One-hot encoding of the (now complete) categorical features
df_encoded = pd.get_dummies(df_filled, columns=categorical_cols)

# Name kept so that any later cell referring to the cleaned frame still works.
df_imputed = df_encoded

# Splitting the data into features (X) and target variable (y)
X = df_imputed.drop('class', axis=1)
y = df_imputed['class']

Splitting the data

In [8]:
from sklearn.model_selection import train_test_split

# First cut: hold out 20% of the rows, keep the remaining 80% for training
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Second cut: divide the held-out rows evenly into validation and test sets
X_valid, X_test, y_valid, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

# (display label, variable suffix, features, target) for each partition
partitions = (
    ("Training", "train", X_train, y_train),
    ("Validation", "valid", X_valid, y_valid),
    ("Test", "test", X_test, y_test),
)

# Report how many rows ended up in each partition (blank line first)
print()
for label, _, features, _ in partitions:
    print(f"{label} set size:", len(features))

# Show the contents of each partition
for label, suffix, features, target in partitions:
    print(f"\n{label} set:")
    print(f"X_{suffix}:", features)
    print(f"y_{suffix}:", target)

# Dataset-level metadata, followed by the per-variable description table
for dataset_info in (mushroom.metadata, mushroom.variables):
    print(dataset_info)


Training set size: 6499
Validation set size: 812
Test set size: 813

Training set:
X_train:      cap-shape_b cap-shape_c cap-shape_f cap-shape_k cap-shape_s cap-shape_x  \
7873           0           0           0           1           0           0   
6515           0           0           0           0           0           1   
6141           0           0           1           0           0           0   
2764           0           0           1           0           0           0   
438            1           0           0           0           0           0   
...          ...         ...         ...         ...         ...         ...   
5226           0           0           0           0           0           1   
5390           0           0           0           1           0           0   
860            0           0           1           0           0           0   
7603           0           0           0           1           0           0   
7270           0           

Applying Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo

# Fit a default-configured logistic regression on the training partition
# (fit() returns the estimator itself, so construction and fitting can be chained)
logistic_model = LogisticRegression().fit(X_train, y_train)

# Predict class labels for the held-out test partition
y_pred = logistic_model.predict(X_test)

# Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy on the testing set:", accuracy)

# Raw predicted labels, for inspection
print("Predictions on the test set:", y_pred, sep="\n")

Accuracy on the testing set: 1.0
Predictions on the test set:
['p' 'e' 'p' 'e' 'p' 'e' 'p' 'p' 'e' 'e' 'p' 'p' 'p' 'p' 'p' 'p' 'p' 'e'
 'e' 'p' 'p' 'e' 'e' 'p' 'e' 'e' 'e' 'e' 'p' 'e' 'p' 'p' 'p' 'p' 'p' 'e'
 'e' 'p' 'e' 'p' 'p' 'p' 'e' 'e' 'e' 'p' 'p' 'p' 'e' 'e' 'e' 'e' 'e' 'e'
 'e' 'e' 'p' 'p' 'p' 'e' 'e' 'e' 'e' 'e' 'p' 'e' 'e' 'p' 'p' 'p' 'p' 'e'
 'e' 'p' 'p' 'e' 'e' 'e' 'e' 'p' 'e' 'p' 'e' 'p' 'e' 'p' 'e' 'e' 'e' 'p'
 'p' 'p' 'e' 'e' 'p' 'p' 'p' 'p' 'e' 'p' 'e' 'p' 'p' 'e' 'e' 'e' 'p' 'p'
 'p' 'e' 'e' 'p' 'p' 'p' 'p' 'e' 'e' 'e' 'e' 'e' 'e' 'p' 'p' 'e' 'p' 'p'
 'p' 'e' 'e' 'e' 'p' 'e' 'e' 'e' 'p' 'p' 'e' 'p' 'p' 'p' 'e' 'e' 'e' 'e'
 'p' 'e' 'e' 'p' 'e' 'p' 'p' 'e' 'e' 'p' 'p' 'p' 'p' 'e' 'e' 'e' 'p' 'p'
 'e' 'p' 'p' 'e' 'e' 'e' 'p' 'e' 'p' 'e' 'e' 'e' 'p' 'e' 'e' 'e' 'p' 'e'
 'p' 'e' 'e' 'p' 'p' 'e' 'e' 'p' 'p' 'e' 'e' 'e' 'e' 'e' 'e' 'p' 'e' 'e'
 'e' 'e' 'p' 'p' 'p' 'p' 'e' 'e' 'p' 'e' 'p' 'p' 'e' 'e' 'p' 'p' 'e' 'e'
 'e' 'p' 'e' 'p' 'p' 'p' 'e' 'e' 'p' 'e' 'e' 'e' 'e' 'e' 'e' '