# Import Required Libraries
Import the necessary libraries, including Pandas and NumPy.

In [None]:
# Import necessary libraries
import pandas as pd # Import Pandas library and alias it as pd
import numpy as np # Import NumPy library and alias it as np

# Import data
Load the US pumpkins dataset into the notebook from the CSV file hosted in the ML-For-Beginners repository.

In [None]:
# Load the raw pumpkins dataset straight from its CSV URL into a DataFrame.
url = "https://raw.githubusercontent.com/microsoft/ML-For-Beginners/main/2-Regression/data/US-pumpkins.csv"
full_pumpkins = pd.read_csv(url)

# Preview the first 5 rows. This must start at column 0: the original line
# carried a stray leading space, which Python rejects as an unexpected indent.
full_pumpkins.head()



# Data cleaning and visualization
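Let's clean the data by keeping only the columns we need and dropping rows with missing values, then visualize how pumpkin color is distributed across varieties.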

In [None]:
# Columns kept for the rest of the notebook: the features that get encoded
# further down, plus 'Color', which is used as the label.
columns_to_select = [
    'City Name',
    'Package',
    'Variety',
    'Origin',
    'Item Size',
    'Color',
]

# Take every row of the selected columns, then discard any row that has a
# missing value in at least one of them.
pumpkins = full_pumpkins.loc[:, columns_to_select].dropna()

In [None]:
import seaborn as sns

# Bar colour for each value of the 'Color' column that we expect to plot
# (NOTE(review): assumes 'ORANGE' and 'WHITE' are the only values present).
palette = dict(ORANGE='orange', WHITE='wheat')

# Count plot with one group of bars per 'Variety' (on the y axis), split by
# 'Color' through the hue.
sns.catplot(
    data=pumpkins,
    kind="count",
    y="Variety",
    hue="Color",
    palette=palette,
)

# Data preparation: features and label encoding
Let's prepare the data for modeling: encode the categorical features, encode the label, and split the result into training and test sets.

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoder configured to return a dense array rather than a sparse
# matrix.
categorical_encoder = OneHotEncoder(sparse_output=False)

# Columns that will be one-hot encoded.
categorical_features = [
    'City Name',
    'Package',
    'Variety',
    'Origin',
]

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order for 'Item Size'; the position of a value in this
# list becomes its integer code (presumably smallest to largest size).
# OrdinalEncoder expects one list of categories per encoded column, hence the
# nested list.
item_sizes_categories = [['sml', 'med', 'med-lge', 'lge', 'xlge', 'jbo', 'exjbo']]

# Column that will be encoded with the ordering above.
ordinal_features = ['Item Size']

# The original passed the undefined name `item_size_categories`, which raised
# a NameError; it now references the list defined above.
ordinal_encoder = OrdinalEncoder(categories=item_sizes_categories)

In [None]:
from sklearn.compose import ColumnTransformer

# Each entry is (step name, encoder, columns that encoder is applied to).
transformer_steps = [
    ('ord', ordinal_encoder, ordinal_features),
    ('cat', categorical_encoder, categorical_features),
]
ct = ColumnTransformer(transformers=transformer_steps)

# Ask for a pandas DataFrame as the transform output.
ct.set_output(transform='pandas')

# Fit both encoders on the cleaned data and encode it in a single pass.
encoded_features = ct.fit_transform(pumpkins)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'Color' label column as integer class ids.
label_encoder = LabelEncoder()
color_column = pumpkins['Color']
encoded_label = label_encoder.fit_transform(color_column)

# Attach the encoded label to the encoded features as a 'Color' column.
encoded_pumpkins = encoded_features.assign(Color=encoded_label)

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the 'Color' label.
feature_columns = encoded_pumpkins.columns.difference(['Color'])
X = encoded_pumpkins[feature_columns]
y = encoded_pumpkins['Color']

# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Model training and evaluation
Let's train a model and evaluate its performance against the test set.

In [None]:
from sklearn.metrics import f1_score, classification_report
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression classifier with default settings on the training
# split, then predict labels for the held-out test split.
model = LogisticRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Compute both metrics first, then print them alongside the raw predictions.
report = classification_report(y_test, predictions)
f1 = f1_score(y_test, predictions)

print(report)
print('Predicted labels: ', predictions)
print('F1-score: ', f1)