In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier

In [29]:
# Importing the data
file_path = 'Resources/store_data.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,food_category,food_department,food_family,store_sales_in_millions,store_cost_in_millions,unit_sales_in_millions,promotion_name,brand_name,SRP,...,grocery_sqft,frozen_sqft,meat_sqft,coffee_bar,video_store,salad_bar,prepared_food,florist,cost,media_type
0,0,Breakfast Foods,Frozen Foods,Food,7.36,2.7232,4.0,Bag Stuffers,Carrington,1.84,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,126.62,"Daily Paper, Radio"
1,1,Breakfast Foods,Frozen Foods,Food,5.52,2.5944,3.0,Cash Register Lottery,Carrington,1.84,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,59.86,"Daily Paper, Radio"
2,2,Breakfast Foods,Frozen Foods,Food,3.68,1.3616,2.0,High Roller Savings,Carrington,1.84,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,84.16,"Daily Paper, Radio"
3,3,Breakfast Foods,Frozen Foods,Food,3.68,1.1776,2.0,Cash Register Lottery,Carrington,1.84,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,95.78,In-Store Coupon
4,4,Breakfast Foods,Frozen Foods,Food,4.08,1.428,3.0,Double Down Sale,Golden,1.36,...,18670.0,5415.0,3610.0,1.0,1.0,1.0,1.0,1.0,50.79,Radio


In [30]:
df = df.drop(columns=['id', 'SRP', 'store_sqft', 'grocery_sqft', 'frozen_sqft', 'meat_sqft', 'coffee_bar',
       'video_store', 'salad_bar', 'prepared_food', 'florist', 'cost',
       'media_type'])
df.columns

Index(['food_category', 'food_department', 'food_family',
       'store_sales_in_millions', 'store_cost_in_millions',
       'unit_sales_in_millions', 'promotion_name', 'brand_name',
       'gross_weight', 'net_weight', 'recyclable_package', 'low_fat',
       'units_per_case', 'store_type', 'store_city', 'store_state'],
      dtype='object')

In [31]:
# Predict promo name(dep) based on food category(ind)

# Data prep...
# Encoding labels with scikit
le = LabelEncoder()
prepped_df = df.copy()

columns_to_encode = ['food_category', 'food_department', 'food_family', 'promotion_name', 'brand_name',
                    'store_type','store_city', 'store_state']

for label in columns_to_encode:
    prepped_df[label] = le.fit_transform(prepped_df[label])

prepped_df.head()

Unnamed: 0,food_category,food_department,food_family,store_sales_in_millions,store_cost_in_millions,unit_sales_in_millions,promotion_name,brand_name,gross_weight,net_weight,recyclable_package,low_fat,units_per_case,store_type,store_city,store_state
0,4,12,1,7.36,2.7232,4.0,0,20,19.7,17.7,1.0,0.0,17.0,0,5,1
1,4,12,1,5.52,2.5944,3.0,6,20,19.7,17.7,1.0,0.0,17.0,0,5,1
2,4,12,1,3.68,1.3616,2.0,18,20,19.7,17.7,1.0,0.0,17.0,0,5,1
3,4,12,1,3.68,1.1776,2.0,6,20,19.7,17.7,1.0,0.0,17.0,0,5,1
4,4,12,1,4.08,1.428,3.0,11,48,7.12,5.11,0.0,1.0,29.0,0,5,1


In [32]:
# Defining the features
X = prepped_df.copy()
X = X.drop('food_category', axis=1)

# Defining the target set
y = prepped_df['food_category'].values

print(X.head())
print(y[:5])

   food_department  food_family  store_sales_in_millions  \
0               12            1                     7.36   
1               12            1                     5.52   
2               12            1                     3.68   
3               12            1                     3.68   
4               12            1                     4.08   

   store_cost_in_millions  unit_sales_in_millions  promotion_name  brand_name  \
0                  2.7232                     4.0               0          20   
1                  2.5944                     3.0               6          20   
2                  1.3616                     2.0              18          20   
3                  1.1776                     2.0               6          20   
4                  1.4280                     3.0              11          48   

   gross_weight  net_weight  recyclable_package  low_fat  units_per_case  \
0         19.70       17.70                 1.0      0.0            17.0   


In [33]:
# Splitting the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Checking the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(29169, 15)
(9723, 15)
(29169,)
(9723,)


In [34]:
# Scaling the data
# Creating scaler instance
scaler = StandardScaler()
# Fitrting it to the data
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [35]:
# Fitting the model
from skmultilearn.problem_transform import LabelPowerset

classifier = LabelPowerset(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
)

classifier.fit(X_train_scaled, y_train)

LabelPowerset(classifier=RandomForestClassifier(), require_dense=[False, True])

In [36]:
# Making predictions!
predictions = classifier.predict(X_test_scaled)
predictions

<9723x1 sparse matrix of type '<class 'numpy.int64'>'
	with 9408 stored elements in List of Lists format>

In [37]:
predictions = predictions.toarray()
len(predictions)

9723

In [38]:
lp_f1=metrics.f1_score(y_test, predictions, average='micro')
lp_hamm=metrics.hamming_loss(y_test, predictions)
print('Label Powerset F1-score:',round(lp_f1,3))
print('Label Powerset Hamming Loss:',round(lp_hamm,3))

Label Powerset F1-score: 0.059
Label Powerset Hamming Loss: 0.941
