In [1]:
import pandas as pd
import numpy as np


In [28]:
data = pd.read_csv('foodservice_customer_data.csv')
data.head()

Unnamed: 0,customer_id,business_name,category,subcategory,scale,location,years_in_business,employees,estimated_daily_customers,avg_order_size,order_frequency,last_order_date,total_orders_ytd,storage_capacity,sustainability_focus,quality_preference,recommended_products,primary_product_categories
0,CUST_0001,Business_1,Institution,Hospital,Large,Urban,11,108,366,10115.75,Weekly,2024-12-24,25,Large,True,Economy,"Dairy, Fruits, Vegetables","Vegetables, Fruits, Dairy, Grains"
1,CUST_0002,Business_2,Catering,Meal Delivery Service,Large,Rural,26,111,999,38455.85,Bi-weekly,2024-12-24,32,Large,True,Standard,"Meat, Vegetables, Beverages","Vegetables, Meat, Seafood, Beverages"
2,CUST_0003,Business_3,Bakery,Pastry Shop,Small,Suburban,11,11,55,856.86,Bi-weekly,2024-12-24,44,Large,False,Economy,"Dairy, Fruits, Grains","Grains, Dairy, Fruits"
3,CUST_0004,Business_4,Hotel,Business Hotel,Small,Rural,12,7,34,506.0,Bi-weekly,2024-12-24,35,Limited,False,Standard,"Meat, Vegetables, Seafood","Vegetables, Fruits, Meat, Seafood, Dairy, Beve..."
4,CUST_0005,Business_5,Institution,Corporate Cafeteria,Large,Suburban,14,146,603,20708.16,Bi-weekly,2024-12-24,18,Large,False,Standard,"Grains, Vegetables, Fruits","Vegetables, Fruits, Dairy, Grains"


In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib

columns_to_drop=['customer_id', 'business_name', 'last_order_date', 'total_orders_ytd', 'order_frequency', 'recommended_products']

data = data.drop(columns=columns_to_drop)

In [30]:
# Split the categories into a list for each row
data['primary_product_categories_split'] = data['primary_product_categories'].str.split(', ')

# Extract unique product categories
unique_categories = set(
    cat for sublist in data['primary_product_categories_split'] for cat in sublist
)

# Multi-hot encode each unique category
for category in unique_categories:
    data[f'product_{category}'] = data['primary_product_categories_split'].apply(
        lambda x: 1 if category in x else 0
)

# Drop the intermediate column
data.drop(columns=['primary_product_categories_split'], inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   category                    100 non-null    object 
 1   subcategory                 100 non-null    object 
 2   scale                       100 non-null    object 
 3   location                    100 non-null    object 
 4   years_in_business           100 non-null    int64  
 5   employees                   100 non-null    int64  
 6   estimated_daily_customers   100 non-null    int64  
 7   avg_order_size              100 non-null    float64
 8   storage_capacity            100 non-null    object 
 9   sustainability_focus        100 non-null    bool   
 10  quality_preference          100 non-null    object 
 11  primary_product_categories  100 non-null    object 
 12  product_Seafood             100 non-null    int64  
 13  product_Grains              100 non-

In [31]:
data=data.drop(columns=['primary_product_categories'])

In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
import pandas as pd

# Define columns for preprocessing
categorical_cols = ['category', 'subcategory', 'location']
ordinal_cols = ['scale', 'storage_capacity', 'quality_preference']
numerical_cols = ['years_in_business', 'employees', 'estimated_daily_customers', 'avg_order_size']
boolean_cols = ['sustainability_focus']

# Define pipelines for feature types
categorical_pipeline = Pipeline(steps=[('onehot', OneHotEncoder(drop='first', sparse_output=False))])
ordinal_pipeline = Pipeline(steps=[('ordinal', OrdinalEncoder())])
numerical_pipeline = Pipeline(steps=[('scaler', StandardScaler())])
boolean_pipeline = Pipeline(steps=[('boolean_to_int', OrdinalEncoder(categories=[[False, True]]))])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_cols),
        ('cat', categorical_pipeline, categorical_cols),
        ('ord', ordinal_pipeline, ordinal_cols),
        ('bool', boolean_pipeline, boolean_cols)
    ]
)

# Assuming 'data' is a pandas DataFrame
X = data.drop(columns=[  # Drop the multi-label target columns from the original dataframe
    'product_Vegetables', 'product_Fruits', 'product_Dairy', 'product_Grains', 'product_Beverages',
    'product_Seafood', 'product_Meat', 'product_Frozen Foods'
])

y = data[['product_Vegetables', 'product_Fruits', 'product_Dairy', 'product_Grains', 'product_Beverages',
        'product_Seafood', 'product_Meat', 'product_Frozen Foods']]  # Define your target labels

# Apply preprocessing to the feature set X
X_transformed = preprocessor.fit_transform(X)

# Now, X_transformed can be used for training the model, and y is your target variable




In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

# Initialize Random Forest model inside MultiOutputClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
multi_output_model = MultiOutputClassifier(rf_model)

# Train the model
multi_output_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = multi_output_model.predict(X_test)

# Evaluate the model
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        17
           3       1.00      1.00      1.00        14
           4       1.00      1.00      1.00         6
           5       1.00      1.00      1.00         6
           6       1.00      1.00      1.00         9
           7       0.00      0.00      0.00         0

   micro avg       0.99      1.00      0.99        81
   macro avg       0.87      0.88      0.87        81
weighted avg       0.99      1.00      0.99        81
 samples avg       0.99      1.00      0.99        81



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [34]:
# Assuming the multi-class labels are encoded as follows:
class_labels = [
    "Vegetables", "Fruits", "Dairy", "Grains",
    "Meat", "Frozen Foods", "Seafood", "Beverages"
    # Add other labels here if needed
]

def convert_predictions_to_string(predictions):
    result = []
    for pred in predictions:
        # For each prediction, check which categories are predicted (value = 1)
        selected_categories = [
            class_labels[i] for i in range(len(pred)) if pred[i] == 1
        ]
        # Join the categories into a string like "Vegetables, Dairy"
        result.append(", ".join(selected_categories))
    return result

# Example predictions from the model (e.g., [1, 0, 1, 0] for "Vegetables" and "Dairy")
y_pred = multi_output_model.predict(X_test)

# Convert the predictions to the string format
string_predictions = convert_predictions_to_string(y_pred)

def convert_predictions_to_string(predictions):
    result = []
    for pred in predictions:
        # For each prediction, check which categories are predicted (value = 1)
        selected_categories = [
            class_labels[i] for i in range(len(pred)) if pred[i] == 1
        ]
        # Join the categories into a string like "Vegetables, Dairy"
        result.append(", ".join(selected_categories))
    return result

# Example predictions from the model (e.g., [1, 0, 1, 0] for "Vegetables" and "Dairy")
y_pred = multi_output_model.predict(X_test)

# Convert the predictions to the string format
string_predictions = convert_predictions_to_string(y_pred)

In [35]:
print(string_predictions)

['Vegetables, Dairy, Grains, Seafood', 'Vegetables, Fruits, Dairy, Grains', 'Vegetables, Meat, Frozen Foods, Seafood', 'Vegetables, Fruits, Dairy, Grains', 'Vegetables, Dairy, Grains, Seafood', 'Vegetables, Dairy, Grains, Seafood', 'Fruits, Dairy, Grains', 'Fruits, Dairy, Grains', 'Vegetables, Meat, Frozen Foods, Seafood', 'Vegetables, Fruits, Dairy, Grains', 'Vegetables, Meat, Frozen Foods, Seafood', 'Vegetables, Fruits, Dairy, Grains', 'Vegetables, Fruits, Dairy, Meat, Frozen Foods, Seafood', 'Vegetables, Fruits, Dairy, Grains', 'Vegetables, Fruits, Dairy, Grains', 'Vegetables, Fruits, Dairy, Grains', 'Vegetables, Fruits, Dairy, Meat, Frozen Foods, Seafood', 'Fruits, Dairy, Grains', 'Fruits, Dairy, Grains', 'Vegetables, Fruits, Dairy, Meat, Frozen Foods, Seafood']
