# Product Analysis

- Attempt to predict customisation likelihood


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy

In [None]:
def find_classes(df):
    """Display the distinct values found in each column of ``df``.

    Builds a one-row table with a ``<column>_classes`` cell per column, each
    holding the array returned by ``Series.unique()``, and renders it with
    ``display``. Nothing is returned.

    Columns that cannot be summarised (for example when ``unique()`` raises)
    are reported on stdout and skipped.
    """
    res_dict = {}
    for col in df.columns:
        try:
            # Wrap the array in a list so it occupies a single cell.
            res_dict[col + '_classes'] = [df[str(col)].unique()]
        except Exception:
            # Best-effort summary: report the column and carry on. Catching
            # Exception (not a bare except) lets Ctrl-C still interrupt.
            print(f'Error with {col}')
    res_df = pd.DataFrame(data=res_dict)
    display(res_df)

In [None]:
# Load the survey export; the path is relative to the working directory.
data_path = 'survey_results.csv'
raw_df= pd.read_csv(data_path)

In [None]:
# Show the distinct values of every raw column.
find_classes(raw_df)

In [None]:
# Print column dtypes and non-null counts for the raw data.
raw_df.info()

In [None]:
# List the raw column headers (these are the keys used by rename_dict below).
print(raw_df.columns)

## 1. Selection


In [None]:
# No columns are filtered at this stage; work on a copy so raw_df stays intact.
target_df = raw_df.copy()

## 2. Fusing and Cleansing


In [None]:
# Cleansing operates on its own copy so target_df stays unmodified.
preprocessed_df = target_df.copy()

In [None]:
def check_percentage(df, col, show = False):
    """Print the percentage share of each distinct value in ``df[col]``.

    The breakdown is printed when ``show`` is true, or when at least one value
    accounts for less than 5% of the counted entries. Nothing is printed for a
    column with no counted values. Returns ``None``.
    """
    tally = df[col].value_counts()
    shares = (tally / tally.sum()) * 100

    if shares.empty:
        return
    if not show and not (shares < 5).any():
        return

    rule = '_'*80
    print(rule)
    print(col)
    print(rule)
    print(shares)

### Rename Columns


In [None]:
# Map the verbose survey question headers to short column names.
# Keys are matched literally against the CSV headers; note that several of
# them end with a trailing space.
# NOTE: 'maritial_status' (sic) is the spelling later cells refer to, so it
# must not be corrected here alone.
rename_dict = {'Which age group do you belong to?': 'age_group', 
                'What is your gender?': 'gender', 
                "Which category do you currently belong to?": "car_ownership",
                "Which of the following best describes you?": "maritial_status",
                "Which of these factors are important to you when deciding which car to purchase?": "purchase_factors",
                "How likely are you to opt for customised vehicle if there were no extra charges? ": "customise_likelihood",
                "Which of the following exterior components would you choose to customise (texture, layout, size, etc)? ": "exterior_components",
                "Which of the following interior components would you choose to customise (texture, layout, size, etc)? ": "interior_components",
                "How much are you willing to spend on car customisation if surcharges are applicable?": "customise_spend",
                "Are you interested in designing your own components to personalise your car? ": "personalise_interest",
                "How much are you willing to pay for the personalised design? ": "personalise_spend",
                "Do you have any 3D design experience that would help with the design process? (e.g. AutoCAD, SolidWorks, Blender, etc)": "design_experience",
                "Please give us any design ideas to make the car uniquely Singaporean.": "design_ideas"
                }

# Apply the renaming in place, then show the old -> new header mapping.
preprocessed_df.rename(columns=rename_dict, inplace=True)
df = pd.DataFrame(rename_dict.items(), columns=["Initial", "Renamed"]) 
display(df)

### Dropping null values


In [None]:
# Keep respondents who answered both the exterior-component and
# purchase-factor questions and who did not pick 'Prefer not to say' for
# gender, then renumber the remaining rows from 0.
answered = (
    preprocessed_df['exterior_components'].notna()
    & preprocessed_df['purchase_factors'].notna()
)
disclosed_gender = preprocessed_df['gender'] != 'Prefer not to say'
preprocessed_df = preprocessed_df[answered & disclosed_gender].reset_index(drop=True)

### Creating Lists in Columns


In [None]:
# Multi-select answers are ';'-separated strings; turn each into a list.
for list_col in ('purchase_factors', 'exterior_components', 'interior_components'):
    preprocessed_df[list_col] = preprocessed_df[list_col].str.split(';')

### Merging similar data


In [None]:
# for col in preprocessed_df.columns:
#   check_percentage(preprocessed_df, col)

In [None]:
# preprocessed_df = preprocessed_df[preprocessed_df['customise_likelihood'] != 'Not likely']
# preprocessed_df.reset_index(drop=True, inplace=True)

In [None]:
# Relabel the '100-500' bracket as 'under 500'; every other value is kept.
spend_answers = preprocessed_df['customise_spend']
preprocessed_df['customise_spend'] = spend_answers.mask(spend_answers == '100-500', 'under 500')
# preprocessed_df['customise_spend'] = preprocessed_df.apply(lambda row: 'under 500' if row['customise_spend'] == '0' else row['customise_spend'], axis=1)

In [None]:
# preprocessed_df['personalise_spend'] = preprocessed_df.apply(lambda row: 'under 500' if row['personalise_spend'] == '0' else row['personalise_spend'], axis=1)

### Tersing Data


In [None]:
# Replace the long car-ownership answers with short labels; any value not
# listed here passes through unchanged.
ownership_labels = {
    'Do not own a car, but planning to purchase in future': 'None',
    'Own a car': 'One',
    'Own more than one car': 'More than One',
}
preprocessed_df['car_ownership'] = preprocessed_df['car_ownership'].map(
    lambda answer: ownership_labels.get(answer, answer)
)

In [None]:
# 1 only for respondents who can design on their own; every other answer is 0.
preprocessed_df['design_experience'] = (
    preprocessed_df['design_experience'] == 'Yes, I can design on my own'
).astype(int)

### filling na


In [None]:
# Mark unanswered free-text ideas with the sentinel 'none'; a later cell turns
# this column into a 0/1 flag by testing for that sentinel.
preprocessed_df['design_ideas'] = preprocessed_df['design_ideas'].fillna('none')

In [None]:
# preprocessed_df.head()

## 3. Data Transformation


In [None]:
# Transformation works on its own copy so preprocessed_df keeps the cleaned,
# human-readable values.
transformed_df = preprocessed_df.copy()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [None]:
def expand_list_values(df, col):
    """Append one indicator column per distinct item of a list-valued column.

    Each row's list in ``df[col]`` is exploded, one-hot encoded into columns
    named ``"<col>:<item>"`` and summed back per original index label, so a
    cell holds how many times the item occurs in that row's list.

    ``df`` is not modified and the original ``col`` is kept in the result.

    Returns:
        A new DataFrame: ``df`` followed by the indicator columns.
    """
    exploded = df[col].explode()
    dummy_df = (
        pd.get_dummies(data=exploded, dtype=int, prefix=col, prefix_sep=':')
        # explode() repeats the index label for each list item; summing per
        # label collapses the rows back to one per original row.
        .groupby(level=0)
        .sum()
    )
    return pd.concat([df, dummy_df], axis=1)
def expand_non_binary_values(df, col):
    """Replace a categorical column with one indicator column per category.

    ``df[col]`` is one-hot encoded into columns named ``"<col>:<value>"``;
    the original ``col`` is dropped from the result. ``df`` is not modified.

    Returns:
        A new DataFrame with ``col`` replaced by its indicator columns.
    """
    dummy_df = (
        pd.get_dummies(data=df[col], dtype=int, prefix=col, prefix_sep=':')
        .groupby(level=0)
        .sum()
    )
    return pd.concat([df, dummy_df], axis=1).drop(columns=[col])
def encode_data_by_count(column_name, data_frame):
    """Add ``<column_name>_encoded`` to ``data_frame`` and print the mapping.

    The codes are whatever ``LabelEncoder`` assigns to the column's values;
    despite the function name, no counting is involved. ``data_frame`` is
    modified in place and nothing is returned.
    """
    target_name = column_name+"_encoded"
    print("_"*120)
    data_frame[target_name] = LabelEncoder().fit_transform(data_frame[column_name])
    # One row per distinct (original value, code) pair.
    mapping_table = (
        data_frame[[column_name, target_name]]
        .drop_duplicates()
        .reset_index(drop=True)
    )
    print(mapping_table)
def encode_by_label(df, unique_values, col):
    """Ordinal-encode ``df[col]`` using the order given in ``unique_values``.

    Adds ``<col>_encoded`` to ``df`` in place, where each value is replaced by
    its position in ``unique_values``, then prints the value -> code legend.

    If any entry of ``unique_values`` does not occur in the column, a message
    is printed and the function returns without adding the encoded column.
    Column values missing from ``unique_values`` are encoded as NaN.

    Returns:
        None.
    """
    print("_"*120)
    encoded_column_name = col+"_encoded"

    # Compute the distinct values once instead of on every loop iteration.
    present_values = set(df[col].unique())
    for val in unique_values:
        if val not in present_values:
            print(f"'{val}' not in {col}")
            return

    label_mapping = {val: i for i, val in enumerate(unique_values)}
    df[encoded_column_name] = df[col].map(label_mapping)

    legend = (
        df[[col, encoded_column_name]]
        .drop_duplicates()
        .sort_values(by=encoded_column_name)
        .reset_index(drop=True)
    )
    print(legend)

### Expanding Lists


In [None]:
# Add "<column>:<item>" indicator columns for each multi-select question.
for list_col in ('purchase_factors', 'exterior_components', 'interior_components'):
    transformed_df = expand_list_values(transformed_df, list_col)

In [None]:
# 1 when the respondent wrote a design idea, 0 for the 'none' sentinel.
transformed_df['design_ideas'] = (preprocessed_df['design_ideas'] != 'none').astype(int)

In [None]:
# Show the distinct values of every column after list expansion.
find_classes(transformed_df)

### Encoding Data


In [None]:
# Unordered categories: encode with LabelEncoder via encode_data_by_count.
for nominal_col in ('gender', 'maritial_status'):
    encode_data_by_count(nominal_col, transformed_df)

In [None]:
# Ordered categories: each list gives the levels from lowest to highest code.
spend_levels = ['0', 'under 500', '500-1000', 'over 1000']
ordinal_levels = [
    ('customise_spend', spend_levels),
    ('personalise_spend', spend_levels),
    ('car_ownership', ['None', 'One', 'More than One']),
    ('customise_likelihood', ['Not likely', 'Likely', 'Very likely']),
    ('personalise_interest', ['No', 'Only with professional help', 'Yes']),
    ('age_group', ['20-30', '31-40', '41-50', '51-60']),
]
for ordinal_col, levels in ordinal_levels:
    encode_by_label(transformed_df, levels, ordinal_col)

### Purchase DF

In [None]:
# Demographic codes plus the purchase-factor indicator columns.
demographic_cols = [
    'gender_encoded',
    'maritial_status_encoded',
    'car_ownership_encoded',
    'age_group_encoded',
]
factor_names = [
    'Aesthetics',
    'Brand name',
    'Customisable options',
    'Functionality',
    'Price',
    'Size',
    'Sustainability/environment considerations',
    'Technological features',
]
purchase_col_sel = demographic_cols + ['purchase_factors:' + name for name in factor_names]
purchase_df = transformed_df[purchase_col_sel]

# Indicator column names for every purchase factor present in the data.
purchase_factors = [
    'purchase_factors:' + val
    for val in transformed_df['purchase_factors'].explode().unique().tolist()
]

# Rescale every selected column to the [0, 1] range.
scaler = MinMaxScaler()
purchase_df = pd.DataFrame(data=scaler.fit_transform(purchase_df), columns=purchase_df.columns)

In [None]:
# reduced
# Scree plot of the scaled purchase features, then a 10-component projection.
reduced_purchase_df = purchase_df.copy()
pca = PCA()
pca.fit(reduced_purchase_df)

eigenvalues = pca.explained_variance_
prop_var = eigenvalues / np.sum(eigenvalues)
component_numbers = np.arange(1, len(prop_var)+1)

plt.figure(figsize=(14,10))
plt.plot(component_numbers, prop_var, marker='o')
plt.xlabel('Principal Component', size=20)
plt.ylabel('Proportion of Variance Explained', size=20)
plt.title('Figure 1: Scree Plot for Proportion of Variance Explained for Purchase Data', size=20)
plt.grid(True)

purchase_scores = PCA(n_components=10).fit_transform(reduced_purchase_df)
reduced_purchase_df = pd.DataFrame(purchase_scores)

#### Customise DF

In [None]:
# Data Selection
customise_col_sel = [
  'exterior_components:Brakes',
  'exterior_components:Bumpers',
  'exterior_components:Grilles',
  'exterior_components:Headlights',
  'exterior_components:Side mirrors',
  'exterior_components:Wheels',
  'exterior_components:add body kit and change the exhaust and tune the engine',
  'exterior_components:doors',
  'interior_components:Centre compartment',
  'interior_components:Dashboard',
  'interior_components:Door handles',
  'interior_components:Steering wheel',
  'interior_components:Sun blocker for front passengers',
  'interior_components:air vent', 
  'customise_spend_encoded',
  'personalise_spend_encoded'
]
customise_df = transformed_df[customise_col_sel]

interior_components = transformed_df['interior_components'].explode().unique().tolist()
for i, val in enumerate(interior_components):
  interior_components[i] = 'interior_components:' + val
exterior_components = transformed_df['exterior_components'].explode().unique().tolist()
for i, val in enumerate(exterior_components):
  exterior_components[i] = 'exterior_components:' + val

# Scaling Min MAx
scaler = MinMaxScaler()
customise_df = pd.DataFrame(data = scaler.fit_transform(customise_df), columns = customise_df.columns)

#

In [None]:
# reduced
# Scree plot of the scaled component indicators (spend codes excluded), then a
# 10-component projection with the two spend codes re-attached in front.
spend_cols = ['customise_spend_encoded', 'personalise_spend_encoded']
reduced_customise_df = customise_df.drop(columns=spend_cols)
pca = PCA()
pca.fit(reduced_customise_df)

eigenvalues = pca.explained_variance_
prop_var = eigenvalues / np.sum(eigenvalues)
component_numbers = np.arange(1, len(prop_var)+1)

plt.figure(figsize=(14,10))
plt.plot(component_numbers, prop_var, marker='o')
plt.xlabel('Principal Component', size=20)
plt.ylabel('Proportion of Variance Explained', size=20)
plt.title('Figure 1: Scree Plot for Proportion of Variance Explained for Customised Components Data', size=20)
plt.grid(True)

component_scores = PCA(n_components=10).fit_transform(reduced_customise_df)
reduced_customise_df = pd.concat(
    [customise_df.loc[:, spend_cols], pd.DataFrame(component_scores)],
    axis=1,
)

In [None]:
# Intentionally empty: a stray undefined name in this cell used to raise
# NameError whenever the cell was executed.

### Dimension Reduction

In [None]:
# Names of every encoded column in transformed_df (filtered directly instead
# of inserting None placeholders and stripping them with `!= None`).
select_columns = [col for col in transformed_df.columns if 'encoded' in col]

In [None]:
# Build reduced_df: the encoded columns plus all component indicators,
# projected onto 12 principal components with the spend target re-attached.
reduced_df = transformed_df.copy()
# Non-encoded columns contribute None placeholders here; they are harmless
# because the list is only used for the membership test below.
select_columns = [col if 'encoded'in col else None for col in transformed_df.columns]
select_columns += interior_components+exterior_components

# Drop every column that is neither encoded nor a component indicator.
for col in reduced_df.columns:
  if col not in select_columns:
    reduced_df.drop(col, axis=1, inplace=True)

# Min-max scaled copy, used only for the scree plot below.
df = reduced_df.copy()
scaler = MinMaxScaler()
df = pd.DataFrame(data = scaler.fit_transform(df), columns = df.columns)

# Scree-plot PCA: fitted on the scaled data with four columns excluded.
pca = PCA()
pca.fit(df.drop(['customise_spend_encoded', 'personalise_spend_encoded', 'personalise_interest_encoded', 'customise_likelihood_encoded'], axis=1))

eigenvalues = pca.explained_variance_
prop_var = eigenvalues / np.sum(eigenvalues)

plt.figure(figsize=(14,10))
plt.plot(np.arange(1, len(prop_var)+1), 
                   prop_var, marker='o')
plt.xlabel('Principal Component',
           size = 20)
plt.ylabel('Proportion of Variance Explained',
           size = 20)
plt.title('Figure 1: Scree Plot for Proportion of Variance Explained for Customer',
          size = 20)
plt.grid(True)

# NOTE(review): the projection below is fitted on the UNSCALED reduced_df and
# excludes only customise_spend_encoded, whereas the scree plot above uses the
# scaled frame with four columns excluded. The plot therefore does not
# describe this transform -- confirm which input is intended.
reduced_df = PCA(n_components=12).fit_transform(reduced_df.drop('customise_spend_encoded', axis=1))
# Re-attach the target in front of the component scores; the concat aligns on
# the row index of both frames.
reduced_df = pd.concat([transformed_df['customise_spend_encoded'], pd.DataFrame(reduced_df)], axis=1)

display(reduced_df.head())

In [None]:
# Build reduced_components_df: the component indicators projected onto 10
# principal components, with the customise-spend target re-attached in front.
select_columns = ['customise_spend_encoded']+interior_components+exterior_components
reduced_components_df = transformed_df.copy()
unwanted_cols = [col for col in reduced_components_df.columns if col not in select_columns]
reduced_components_df.drop(columns=unwanted_cols, inplace=True)

# Min-max scaled copy, used only for the scree plot.
scaler = MinMaxScaler()
df = reduced_components_df.copy()
df = pd.DataFrame(data=scaler.fit_transform(df), columns=df.columns)

pca = PCA()
pca.fit(df.drop('customise_spend_encoded', axis=1))

eigenvalues = pca.explained_variance_
prop_var = eigenvalues / np.sum(eigenvalues)
component_numbers = np.arange(1, len(prop_var)+1)

plt.figure(figsize=(14,10))
plt.plot(component_numbers, prop_var, marker='o')
plt.xlabel('Principal Component', size=20)
plt.ylabel('Proportion of Variance Explained', size=20)
plt.title('Figure 1: Scree Plot for Proportion of Variance Explained for customised components', size=20)
plt.grid(True)

# The projection itself is fitted on the unscaled indicator columns.
component_features = reduced_components_df.drop('customise_spend_encoded', axis=1)
reduced_components_df = PCA(n_components=10).fit_transform(component_features)
reduced_components_df = pd.concat(
    [transformed_df['customise_spend_encoded'], pd.DataFrame(reduced_components_df)],
    axis=1,
)

display(reduced_components_df.head())

## 4. Data Mining


In [None]:
# Working copies for the mining stage: the full transformed table and the
# PCA-reduced table.
data=transformed_df.copy()
reduced_data = reduced_df.copy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score,f1_score,precision_score,recall_score,roc_curve,roc_auc_score,confusion_matrix,classification_report
from arulespy.arules import Transactions, apriori, parameters, concat

In [None]:
def model_evaluation(model, X_train, X_test, y_train, y_test, print_results=True):
    """Fit ``model`` on the training split and score it on the test split.

    Accuracy plus weighted F1, precision and recall are computed once from the
    test predictions. When ``print_results`` is true they are printed together
    with the confusion matrix and the classification report.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in
            place.
        X_train, X_test: Feature matrices for the two splits.
        y_train, y_test: Matching target vectors.
        print_results: Whether to print the detailed report.

    Returns:
        ``(model, model_df)`` where ``model_df`` is a one-row DataFrame with
        the columns Model, Accuracy, F1_Score, Precision and Recall, each
        metric rounded to 3 decimals.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    model_name = model.__class__.__name__
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    if print_results:
        print(f"Model: {model_name}")
        print(f"Accuracy: {accuracy}")
        print(f"F1 Score: {f1}")
        print(f"Precision Score: \n{precision}")
        print(f"Recall Score: {recall}")
        print(f"Confusion Matrix: \n {confusion_matrix(y_test, y_pred)}")
        print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

    model_dict = {
        'Model': [model_name],
        'Accuracy': [round(accuracy, 3)],
        'F1_Score': [round(f1, 3)],
        'Precision': [round(precision, 3)],
        'Recall': [round(recall, 3)],
    }
    # Always return the (model, frame) pair: callers unpack two values, so
    # swallowing an error and returning None would only fail later, and less
    # clearly, at the call site.
    model_df = pd.DataFrame(data=model_dict)
    return model, model_df

In [None]:
# Candidate classifiers, all with default hyper-parameters.
# NOTE: the same estimator instances are re-fitted by every evaluation loop
# below, so a stored reference always reflects the most recent fit.
models = [LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(), ExtraTreesClassifier(), KNeighborsClassifier(), GaussianNB()]

### Data Visualisation


In [None]:
# Count plots for every single-valued survey column, laid out on a 5x2 grid.
fig, ax = plt.subplots(figsize=(30, 30))
# Hide the axes created by subplots(); the grid cells below draw their own.
plt.axis('off')
select_columns = ['age_group', 'gender',
                  'car_ownership', 'maritial_status', 
                  'customise_likelihood', 'customise_spend', 
                  'personalise_interest', 'personalise_spend',
                  'design_experience', 'design_ideas']
for i, col in enumerate(select_columns):
  plt.subplot(5, 2, i+1)
  sns.countplot(data=data, y=col, orient='h')#, hue='customise_spend_encoded')
  plt.title(col)
  # NOTE(review): fixed x-range; bars longer than 50 would be clipped.
  plt.xlim(0,50)
plt.show()

In [None]:
# Bar plots of how often each option was picked in the multi-select columns.
fig, ax = plt.subplots(figsize=(20, 20))
# Hide the axes created by subplots(); the grid cells below draw their own.
plt.axis('off')
select_columns = ['purchase_factors', 
                  'interior_components', 
                  'exterior_components']
for i, col in enumerate(select_columns):
  plt.subplot(3, 1, i+1)
  # explode() yields one entry per selected option before counting.
  sns.barplot(data=data[col].explode().value_counts(),orient='h')
  plt.title(col)
  # NOTE(review): fixed x-range; bars longer than 50 would be clipped.
  plt.xlim(0,50)
plt.show()

### Correlation Analysis


In [None]:
# Correlation matrix over the encoded and indicator columns only: the
# original survey columns listed here are removed first.
dropped_columns = [
    'age_group', 'gender',
    'car_ownership', 'maritial_status',
    'customise_likelihood', 'customise_spend',
    'personalise_interest', 'personalise_spend',
    'design_experience', 'design_ideas',
    'purchase_factors', 'interior_components',
    'exterior_components',
]
test_data = data.drop(columns=dropped_columns)
corr = test_data.corr()

In [None]:
# Collect every column whose correlation with customise_spend_encoded exceeds
# `factor` in magnitude. Iterating (name, value) pairs keeps the name attached
# to its own value; looking the name up by value picks the wrong column when
# two columns share the same correlation.
# NOTE: the target column itself normally passes the threshold, since it is
# correlated with itself.
factor = 0.2
features_dict = {}
customise_features = []
for feature, val in corr['customise_spend_encoded'].items():
  if val > factor or val < -factor:
    features_dict[feature] = [val]
    customise_features.append(feature)
display(pd.DataFrame(data=features_dict).transpose().rename(columns={0:'pearson_correlation to customise_spend'}).sort_values(by='pearson_correlation to customise_spend', ascending=False))

In [None]:
# Collect every column whose correlation with personalise_spend_encoded
# exceeds `factor` in magnitude. Iterating (name, value) pairs keeps the name
# attached to its own value; looking the name up by value picks the wrong
# column when two columns share the same correlation.
# NOTE: the target column itself normally passes the threshold, since it is
# correlated with itself.
factor = 0.2
features_dict = {}
personalise_features = []
for feature, val in corr['personalise_spend_encoded'].items():
  if val > factor or val < -factor:
    features_dict[feature] = [val]
    personalise_features.append(feature)
display(pd.DataFrame(data=features_dict).transpose().rename(columns={0:'pearson_correlation to personalise_spend'}).sort_values(by='pearson_correlation to personalise_spend', ascending=False))

From the correlation matrix, we can conclude that there is a strong positive correlation between the amount a respondent is willing to spend  
on customisation and:

- exterior_components:Wheels
- gender_encoded
- interior_components:Steering wheel
- purchase_factors:Sustainability/environment considerations

<br>
Therefore, focusing on these aspects may allow for more revenue.


### Classification of data


In [None]:
# Keep the target plus every feature that passed the correlation threshold.
kept_columns = [
  col for col in data.columns
  if col in customise_features or col == 'customise_spend_encoded'
]
modeled_data = data[kept_columns].copy()
# personalise_spend_encoded is never used as a predictor here. It is only
# present when it passed the threshold, so ignore its absence instead of
# raising KeyError.
modeled_data.drop('personalise_spend_encoded', axis=1, inplace=True, errors='ignore')
# modeled_data.drop('customise_likelihood_encoded', axis=1, inplace=True)
# modeled_data.drop('personalise_interest_encoded', axis=1, inplace=True)

In [None]:
# Predict the customise-spend level from the correlated features and rank the
# candidate models by test accuracy.
y = modeled_data['customise_spend_encoded']
X = modeled_data.drop(['customise_spend_encoded'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

customised_vehicle_likelihood_model = {}
model_frames = []
for model in models:
  model_res, model_df = model_evaluation(model, X_train, X_test, y_train, y_test, False)
  model_frames.append(model_df)
  customised_vehicle_likelihood_model[model.__class__.__name__] = model_res

# Concatenate once at the end: growing an initially empty frame relies on
# pandas' deprecated handling of empty entries in concat.
res = pd.concat(model_frames, ignore_index=True)
res = res.sort_values(by='Accuracy', ascending=False)
res = res.reset_index(drop=True)
display(res)

### Clustering reduced_df


In [None]:
# Features: min-max scaled principal components; target: customise-spend code.
reduced_df_y = reduced_df['customise_spend_encoded']
component_scores = reduced_df.drop('customise_spend_encoded', axis=1)

scaler = MinMaxScaler()
df = pd.DataFrame(data=scaler.fit_transform(component_scores), columns=component_scores.columns)
reduced_df_X = df

X_train, X_test, y_train, y_test = train_test_split(
    reduced_df_X, reduced_df_y, test_size=0.3, random_state=42
)

In [None]:
# Cluster the scaled components into 3 groups and plot the first two
# components, coloured by cluster.
kmeans = KMeans(n_clusters=3, random_state=42)
points = reduced_df_X.to_numpy()  # converted once, reused for fit and plot
clusters = kmeans.fit_predict(points)

u_labels = np.unique(clusters)

fig = plt.figure(figsize=(10,10))
for i in u_labels:
    members = clusters == i
    plt.scatter(points[members, 0], points[members, 1], label = i)
# The scatter labels are only shown once a legend is drawn.
plt.legend(title='Cluster')
plt.title('KMeans Clustering on Reduced Data')
plt.show()

### Classification of reduced_df


In [None]:
# Evaluate every candidate model on the current train/test split and rank
# them by test accuracy.
customised_vehicle_likelihood_model = {}
model_frames = []
for model in models:
  model_res, model_df = model_evaluation(model, X_train, X_test, y_train, y_test, False)
  model_frames.append(model_df)
  customised_vehicle_likelihood_model[model.__class__.__name__] = model_res

# Concatenate once at the end: growing an initially empty frame relies on
# pandas' deprecated handling of empty entries in concat.
res = pd.concat(model_frames, ignore_index=True)
res = res.sort_values(by='Accuracy', ascending=False)
res = res.reset_index(drop=True)
display(res)

### Clustering on reduced_components


In [None]:
# Features: min-max scaled component projections; target: customise-spend
# code.
scaler = MinMaxScaler()
df = reduced_components_df.copy()
df.drop('customise_spend_encoded', axis=1, inplace=True)
df = pd.DataFrame(data = scaler.fit_transform(df), columns = df.columns)

reduced_components_df_X = df
# Take the target from the same frame as the features rather than from
# reduced_df, so the two can never drift apart.
reduced_components_df_y = reduced_components_df['customise_spend_encoded']

X_train, X_test, y_train, y_test = train_test_split(reduced_components_df_X, reduced_components_df_y, test_size=0.3, random_state=42)

In [None]:
# Cluster the scaled component projections into 3 groups and plot the first
# two components, coloured by cluster.
kmeans = KMeans(n_clusters=3, random_state=42)
points = reduced_components_df_X.to_numpy()  # converted once, reused below
clusters = kmeans.fit_predict(points)

u_labels = np.unique(clusters)
fig = plt.figure(figsize=(10,10))
for i in u_labels:
    members = clusters == i
    plt.scatter(points[members, 0], points[members, 1], label = i)
# The scatter labels are only shown once a legend is drawn.
plt.legend(title='Cluster')
plt.title('KMeans Clustering on Reduced Customised Components')
plt.show()

### Classification on reduced_components


In [None]:
# Evaluate every candidate model on the current train/test split and rank
# them by test accuracy.
customised_vehicle_likelihood_model = {}
model_frames = []
for model in models:
  model_res, model_df = model_evaluation(model, X_train, X_test, y_train, y_test, False)
  model_frames.append(model_df)
  customised_vehicle_likelihood_model[model.__class__.__name__] = model_res

# Concatenate once at the end: growing an initially empty frame relies on
# pandas' deprecated handling of empty entries in concat.
res = pd.concat(model_frames, ignore_index=True)
res = res.sort_values(by='Accuracy', ascending=False)
res = res.reset_index(drop=True)
display(res)

## Association Rules on Products


In [None]:
# Association rules between purchase factors. Each indicator becomes a
# boolean item: True exactly where the indicator equals 1.
purchase_factors_df = data.loc[:, purchase_factors] == 1

trans = Transactions.from_df(purchase_factors_df)

rule_thresholds = parameters({"supp": 0.1, "conf": 0.8})
rules = apriori(
    trans,
    parameter=rule_thresholds,
    control=parameters({"verbose": False}),
)
rules_df = rules.as_df().sort_values(by='lift', ascending=False)
rules_df.to_csv('purchase_factors_rules.csv')
display(rules_df)

In [None]:
# Association rules between exterior components. Each indicator becomes a
# boolean item: True exactly where the indicator equals 1.
exterior_components_df = data.loc[:, exterior_components] == 1

trans = Transactions.from_df(exterior_components_df)

rule_thresholds = parameters({"supp": 0.1, "conf": 0.8})
rules = apriori(
    trans,
    parameter=rule_thresholds,
    control=parameters({"verbose": False}),
)
rules_df = rules.as_df().sort_values(by='lift', ascending=False)
rules_df.to_csv('exterior_components_rules.csv')
# Show how often each consequent (RHS) appears among the rules.
display(rules_df['RHS'].value_counts().explode())

In [None]:
# Association rules between interior components. Each indicator becomes a
# boolean item: True exactly where the indicator equals 1.
interior_components_df = data.loc[:, interior_components] == 1

trans = Transactions.from_df(interior_components_df)

rule_thresholds = parameters({"supp": 0.1, "conf": 0.8})
rules = apriori(
    trans,
    parameter=rule_thresholds,
    control=parameters({"verbose": False}),
)
rules_df = rules.as_df().sort_values(by='lift', ascending=False)
rules_df.to_csv('interior_components_rules.csv')
# Show how often each consequent (RHS) appears among the rules.
display(rules_df['RHS'].value_counts().explode())

In [None]:
# Association rules across interior and exterior components together. Each
# indicator becomes a boolean item: True exactly where the indicator equals 1.
components_df = data.loc[:, interior_components+exterior_components] == 1

trans = Transactions.from_df(components_df)

rule_thresholds = parameters({"supp": 0.1, "conf": 0.8})
rules = apriori(
    trans,
    parameter=rule_thresholds,
    control=parameters({"verbose": False}),
)
rules_df = rules.as_df().sort_values(by='lift', ascending=False)
rules_df.to_csv('components_df_rules.csv')
# Show how often each consequent (RHS) appears among the rules.
display(rules_df['RHS'].value_counts().explode())

In [None]:
# Association rules linking component choices to the customise-spend code.
select_columns = ['customise_spend_encoded']+interior_components+exterior_components
unwanted_cols = [col for col in data.columns if col not in select_columns]
arules_df = data.drop(columns=unwanted_cols)

trans = Transactions.from_df(arules_df)

rule_thresholds = parameters({"supp": 0.1, "conf": 0.8})
rules = apriori(
    trans,
    parameter=rule_thresholds,
    control=parameters({"verbose": False}),
)
rules_df = rules.sort(by = 'lift').as_df()
# Keep only the rules whose consequent is this exact spend-interval item.
is_spend_rule = rules_df['RHS'] == '{customise_spend_encoded=[2,3]}'
customise_spend_df = rules_df.loc[is_spend_rule]
customise_spend_df.to_csv('customise_spend_rules.csv')
display(customise_spend_df)