In [26]:
import pandas as pd

In [27]:
file = "Megastore Dataset.csv"
df = pd.read_csv(file)

In [28]:
df.tail()


Unnamed: 0,OrderID,ProductName,Quantity,InvoiceDate,UnitPrice,TotalCost,Country,DiscountApplied,OrderPriority,Region,Segment,ExpeditedShipping,PaymentMethod,CustomerOrderSatisfaction
8229,580263,DANISH ROSE DELUXE COASTER,12,12/2/2011 12:43,$0.85,$10.20,United States,No,High,Southeast,Corporate,Yes,PayPal,Dissatisfied
8230,580263,CACTI TLIGHT CANDLES,16,12/2/2011 12:43,$0.42,$6.72,United States,No,High,Southeast,Corporate,Yes,PayPal,Dissatisfied
8231,581316,RED RETROSPOT SUGAR JAM BOWL,1,12/8/2011 11:46,$2.55,$2.55,United States,No,High,Southeast,Consumer,No,Credit Card,Very Satisfied
8232,581316,GLASS SONGBIRD STORAGE JAR,1,12/8/2011 11:46,$12.50,$12.50,United States,No,High,Southeast,Consumer,No,Credit Card,Very Satisfied
8233,581316,REGENCY SUGAR BOWL GREEN,1,12/8/2011 11:46,$4.15,$4.15,United States,No,High,Southeast,Consumer,No,Credit Card,Very Satisfied


In [29]:
import numpy as np



df.columns = [c.strip() for c in df.columns] #removes white space so can avoid column names issue

# Ordinal encoding: OrderPriority (Medium < High)
priority_map = {"Medium": 1, "High": 2} #create a map dictionary for order priority
df["OrderPriority_ord"] = df["OrderPriority"].map(priority_map) # using priority map to map into new column in dataframe called OrderPriority_ord

# Ordinal encoding: CustomerOrderSatisfaction (increasing satisfaction)
satisfaction_map = { #create a map dictionary for CustomerOrderSatisfaction
    "Very Dissatisfied": 1,
    "Dissatisfied": 2,
    "Satisfied": 3,
    "Very Satisfied": 4,
    "Prefer not to answer": np.nan  #leaves the field blank for ordinal encoding
}
df["CustomerSatisfaction_ord"] = df["CustomerOrderSatisfaction"].map(satisfaction_map) #using satisfaction_map to map into new columnn in df called CustomerSatisfaction_ord

#one hot encoding for PaymentMethod and Segment, creates prefix to store either 1 or 0 for each option that will get a column
df_encoded = pd.get_dummies(
    df,
    columns=["PaymentMethod", "Segment"],
    prefix=["Pay", "Seg"],
    drop_first=False
)

df_encoded.to_csv("Megastore_Encoded.csv", index=False) #export dataframe to csv file

print("Saved: Megastore_Encoded.csv")
print(df_encoded[["OrderPriority", "OrderPriority_ord","CustomerOrderSatisfaction", "CustomerSatisfaction_ord"]].head()) #prints columns from df_encoded dataframe
print(df_encoded.filter(regex="^Pay_|^Seg_").head()) #prints all coulms in dataframe 


Saved: Megastore_Encoded.csv
  OrderPriority  OrderPriority_ord CustomerOrderSatisfaction  \
0          High                  2                 Satisfied   
1          High                  2                 Satisfied   
2          High                  2                 Satisfied   
3          High                  2                 Satisfied   
4          High                  2                 Satisfied   

   CustomerSatisfaction_ord  
0                       4.0  
1                       4.0  
2                       4.0  
3                       4.0  
4                       4.0  
   Pay_Credit Card  Pay_PayPal  Seg_Consumer  Seg_Corporate
0             True       False         False           True
1             True       False         False           True
2             True       False         False           True
3             True       False         False           True
4             True       False         False           True


In [30]:
df_encoded

Unnamed: 0,OrderID,ProductName,Quantity,InvoiceDate,UnitPrice,TotalCost,Country,DiscountApplied,OrderPriority,Region,ExpeditedShipping,CustomerOrderSatisfaction,OrderPriority_ord,CustomerSatisfaction_ord,Pay_Credit Card,Pay_PayPal,Seg_Consumer,Seg_Corporate
0,536370,INFLATABLE POLITICAL GLOBE,48,12/1/2010 8:45,$0.85,$40.80,United States,Yes,High,Northeast,Yes,Satisfied,2,4.0,True,False,False,True
1,536370,SET2 RED RETROSPOT TEA TOWELS,18,12/1/2010 8:45,$2.95,$53.10,United States,Yes,High,Northeast,Yes,Satisfied,2,4.0,True,False,False,True
2,536370,PANDA AND BUNNIES STICKER SHEET,12,12/1/2010 8:45,$0.85,$10.20,United States,Yes,High,Northeast,Yes,Satisfied,2,4.0,True,False,False,True
3,536370,RED TOADSTOOL LED NIGHT LIGHT,24,12/1/2010 8:45,$1.65,$39.60,United States,Yes,High,Northeast,Yes,Satisfied,2,4.0,True,False,False,True
4,536370,VINTAGE HEADS AND TAILS CARD GAME,24,12/1/2010 8:45,$1.25,$30.00,United States,Yes,High,Northeast,Yes,Satisfied,2,4.0,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8229,580263,DANISH ROSE DELUXE COASTER,12,12/2/2011 12:43,$0.85,$10.20,United States,No,High,Southeast,Yes,Dissatisfied,2,2.0,False,True,False,True
8230,580263,CACTI TLIGHT CANDLES,16,12/2/2011 12:43,$0.42,$6.72,United States,No,High,Southeast,Yes,Dissatisfied,2,2.0,False,True,False,True
8231,581316,RED RETROSPOT SUGAR JAM BOWL,1,12/8/2011 11:46,$2.55,$2.55,United States,No,High,Southeast,No,Very Satisfied,2,5.0,True,False,True,False
8232,581316,GLASS SONGBIRD STORAGE JAR,1,12/8/2011 11:46,$12.50,$12.50,United States,No,High,Southeast,No,Very Satisfied,2,5.0,True,False,True,False


In [31]:
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules
df = pd.read_csv("Megastore Dataset.csv")#import dataset into dataframe
basket = df.groupby(['OrderID'])['ProductName'].apply(lambda x:list(set(x))).reset_index() #create a basket with all products for an orderid by grouping
#removes duplicate products, converts to list format, resets OrderID back into normal column
transactions = basket['ProductName'].tolist() #converts Product name in basket into a list for transactions
#print(transactions[0])
#basket.to_csv("Transactional_Basket.csv", index=False)
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder() #Preparing for encoder binary matrix
te_array = te.fit(transactions).transform(transactions) #creates unique set list of Products like columns that it uses to add into array for binary
df_encoded = pd.DataFrame(te_array, columns=te.columns_).astype(bool) # converts array into Dataframe with Product names as column names, also creates into boolean types
df_encoded.to_csv("Transactional_Encoded.csv", index=False) #exports df_encoded dataframe into csv
frequent_itemsets = apriori(df_encoded, min_support=0.01, use_colnames=True) #runs apriori algorithm with minimum support threshold to return product names
print("Total Frequent Itemsets:", frequent_itemsets.shape[0])
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.1) # generates association rules for frequent_itemsets using confidence metric of at least 10% level
rules = rules[rules['antecedents'].apply(lambda x: len(x) >= 1) & rules['consequents'].apply(lambda x: len(x) >=1)] # makes sure consequents and antecedents are not empty
print("Association Rules:", rules.shape[0])
#rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(5)
#rules.to_csv("New_Market_Dataset.csv", index = False), 
sorted_rule = rules.sort_values(by='lift', ascending=False) #Sorting the rules by lift metric
top_3 = sorted_rule[['antecedents', 'consequents', 'support', 'confidence', 'lift']].head(3) #prints top 3 rules sorted by lift metric
top_3



Total Frequent Itemsets: 8340
Association Rules: 78532


Unnamed: 0,antecedents,consequents,support,confidence,lift
76397,"(ALARM CLOCK BAKELIKE RED , SPACEBOY BIRTHDAY ...","(CARD DOLLY GIRL , ROUND SNACK BOXES SET OF4 W...",0.011338,1.0,88.2
76649,"(SET6 RED SPOTTY PAPER PLATES, ALARM CLOCK BAK...","(SET6 RED SPOTTY PAPER CUPS, ROUND SNACK BOXES...",0.011338,1.0,88.2
4333,(SMALL DOLLY MIX DESIGN ORANGE BOWL),(SMALL MARSHMALLOWS PINK BOWL),0.011338,1.0,88.2
