In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Importing the data
file_path = 'Resources/customer_data.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,id,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at_home_approx,avg_yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name
0,0,M,F,1.0,Partial High School,Normal,Skilled Manual,Y,1.0,$10K - $30K,1.0,Salem,OR,"Daily Paper, Radio",126.62,Bag Stuffers
1,1,M,M,0.0,Bachelors Degree,Silver,Professional,Y,4.0,$50K - $70K,0.0,Salem,OR,"Daily Paper, Radio",59.86,Cash Register Lottery
2,2,S,F,4.0,Partial High School,Normal,Manual,N,1.0,$10K - $30K,0.0,Salem,OR,"Daily Paper, Radio",84.16,High Roller Savings
3,3,M,F,2.0,High School Degree,Bronze,Manual,Y,2.0,$30K - $50K,2.0,Salem,OR,In-Store Coupon,95.78,Cash Register Lottery
4,4,M,M,0.0,Partial High School,Bronze,Skilled Manual,N,2.0,$30K - $50K,0.0,Salem,OR,Radio,50.79,Double Down Sale


In [3]:
df = df.drop(columns=['id'])
df.dtypes

marital_status              object
gender                      object
total_children             float64
education                   object
member_card                 object
occupation                  object
houseowner                  object
avg_cars_at_home_approx    float64
avg_yearly_income           object
num_children_at_home       float64
store_city                  object
store_state                 object
media_type                  object
cost                       float64
promotion_name              object
dtype: object

In [4]:
df.columns

Index(['marital_status', 'gender', 'total_children', 'education',
       'member_card', 'occupation', 'houseowner', 'avg_cars_at_home_approx',
       'avg_yearly_income', 'num_children_at_home', 'store_city',
       'store_state', 'media_type', 'cost', 'promotion_name'],
      dtype='object')

In [5]:
# Predict promo name(dep) based on food category(ind)

# Data prep...
# Encoding labels with scikit
le = LabelEncoder()
prepped_df = df.copy()

columns_to_encode = ['marital_status', 'gender', 'education', 'member_card', 'occupation', 'houseowner',
       'avg_yearly_income', 'store_city', 'store_state', 'media_type', 'promotion_name']

for label in columns_to_encode:
    prepped_df[label] = le.fit_transform(prepped_df[label])

prepped_df.head()

Unnamed: 0,marital_status,gender,total_children,education,member_card,occupation,houseowner,avg_cars_at_home_approx,avg_yearly_income,num_children_at_home,store_city,store_state,media_type,cost,promotion_name
0,0,0,1.0,4,2,4,1,1.0,0,1.0,5,1,3,126.62,0
1,0,1,0.0,0,3,3,1,4.0,5,0.0,5,1,3,59.86,6
2,1,0,4.0,4,2,2,0,1.0,0,0.0,5,1,3,84.16,18
3,0,0,2.0,2,0,2,1,2.0,4,2.0,5,1,5,95.78,6
4,0,1,0.0,4,0,4,0,2.0,4,0.0,5,1,7,50.79,11


In [20]:
# Defining the features
X = prepped_df.copy()
X = X.drop('member_card', axis=1)

# Defining the target set
y = prepped_df['member_card'].values

print(X.head())
print(y[:5])

   marital_status  gender  total_children  education  occupation  houseowner  \
0               0       0             1.0          4           4           1   
1               0       1             0.0          0           3           1   
2               1       0             4.0          4           2           0   
3               0       0             2.0          2           2           1   
4               0       1             0.0          4           4           0   

   avg_cars_at_home_approx  avg_yearly_income  num_children_at_home  \
0                      1.0                  0                   1.0   
1                      4.0                  5                   0.0   
2                      1.0                  0                   0.0   
3                      2.0                  4                   2.0   
4                      2.0                  4                   0.0   

   store_city  store_state  media_type    cost  promotion_name  
0           5            1 

In [21]:
# Splitting the data into test and train
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Checking the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(29169, 14)
(9723, 14)
(29169,)
(9723,)


In [22]:
# Scaling the data
# Creating scaler instance
scaler = StandardScaler()
# Fitrting it to the data
X_scaler = scaler.fit(X_train)

# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [23]:
# Fitting the model
from skmultilearn.problem_transform import LabelPowerset

classifier = LabelPowerset(
    classifier = RandomForestClassifier(),
    require_dense = [False, True]
)

classifier.fit(X_train_scaled, y_train)

LabelPowerset(classifier=RandomForestClassifier(), require_dense=[False, True])

In [24]:
# Making predictions!
predictions = classifier.predict(X_test_scaled)
predictions

<9723x1 sparse matrix of type '<class 'numpy.int64'>'
	with 4333 stored elements in List of Lists format>

In [25]:
predictions = predictions.toarray()
len(predictions)

9723

In [26]:
lp_f1=metrics.f1_score(y_test, predictions, average='micro')
lp_hamm=metrics.hamming_loss(y_test, predictions)
print('Label Powerset F1-score:',round(lp_f1,3))
print('Label Powerset Hamming Loss:',round(lp_hamm,3))

Label Powerset F1-score: 0.666
Label Powerset Hamming Loss: 0.334
