In [4]:
import pandas as pd

# Read the Boston housing table from the working directory.
# NOTE(review): the printed column list includes 'Unnamed: 0', which looks like
# a saved row index; it currently takes part in the correlation matrix below.
# Confirm whether it should be loaded with index_col=0 instead.
df = pd.read_csv('boston.csv')

# Quick look at the table's dimensions and column labels.
print(df.shape)
print(list(df.columns))

# Pairwise correlation between every pair of columns.
corr_matrix = df.corr()
print("Correlation matrix:")
print(corr_matrix)

# RM vs. LSTAT: correlation, then covariance under both normalizations.
rm = df['RM']
lstat = df['LSTAT']
n_rows = len(df)
sample_cov = rm.cov(lstat)  # pandas normalizes by n-1 by default

print("correlation coefficient between RM and LSTAT:", rm.corr(lstat))
print("covariance normalized by n-1 between RM and LSTAT:", sample_cov)
# Rescale the n-1 estimate to the n-normalized one.
print("covariance normalized by n between RM and LSTAT:", sample_cov * (n_rows - 1) / n_rows)

# List every pair of distinct columns whose absolute correlation is at least
# 0.8, strongest first.
print("\nHighly correlated features (absolute correlation coefficient at least 0.8):")
highly_corr = corr_matrix.abs().stack().sort_values(ascending=False)
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# ((row_label, column_label), value) pairs and works on older versions too.
for (feature1, feature2), corr in highly_corr.items():
    # Skip the diagonal (a column against itself). The matrix is symmetric, so
    # each qualifying pair is reported twice: once as A/B and once as B/A.
    if feature1 != feature2 and corr >= 0.8:
        print(f"{feature1} and {feature2}: {corr:.2f}")


(506, 15)
['Unnamed: 0', 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'Price']
Correlation matrix:
            Unnamed: 0      CRIM        ZN     INDUS      CHAS       NOX  \
Unnamed: 0    1.000000  0.407407 -0.103393  0.399439 -0.003759  0.398736   
CRIM          0.407407  1.000000 -0.200469  0.406583 -0.055892  0.420972   
ZN           -0.103393 -0.200469  1.000000 -0.533828 -0.042697 -0.516604   
INDUS         0.399439  0.406583 -0.533828  1.000000  0.062938  0.763651   
CHAS         -0.003759 -0.055892 -0.042697  0.062938  1.000000  0.091203   
NOX           0.398736  0.420972 -0.516604  0.763651  0.091203  1.000000   
RM           -0.079971 -0.219247  0.311991 -0.391676  0.091251 -0.302188   
AGE           0.203784  0.352734 -0.569537  0.644779  0.086518  0.731470   
DIS          -0.302211 -0.379670  0.664408 -0.708027 -0.099176 -0.769230   
RAD           0.686002  0.625505 -0.311948  0.595129 -0.007368  0.611441   
TAX          

In [15]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# Load the datasets
order_items = pd.read_csv('olist_order_items_dataset.csv')
products = pd.read_csv('olist_products_dataset.csv')
translations = pd.read_csv('product_category_name_translation.csv')

# Attach the English category name to each product, then to each order line.
# Both merges are left joins, so rows without a match are kept with NaN in
# 'product_category_name_english' and removed in the next step.
products_merged = pd.merge(products, translations, on='product_category_name', how='left')
order_items_merged = pd.merge(
    order_items,
    products_merged[['product_id', 'product_category_name_english']],
    on='product_id',
    how='left',
)

# Drop order lines that have no English category name. Reassigning (rather
# than inplace=True) keeps the step explicit and chain-friendly.
order_items_merged = order_items_merged.dropna(subset=['product_category_name_english'])

# One row per order, holding the distinct categories bought in that order.
transactions = (
    order_items_merged
    .groupby('order_id')['product_category_name_english']
    .agg(set)
    .reset_index()
)
transactions['product_category_name_english'] = transactions['product_category_name_english'].apply(list)

# One-hot encode the baskets: one boolean column per category, one row per order.
# explode() emits one row per (order, category) while repeating the order's
# index label, so grouping on that label folds the dummies back to one row per
# order. This replaces .apply(pd.Series).stack(), which builds a Series per
# order and is very slow on large order tables. Every basket holds at least one
# category (groups only exist for non-null rows), so no order is lost or added.
one_encoded = (
    pd.get_dummies(transactions['product_category_name_english'].explode())
    .groupby(level=0)
    .sum()
    .astype(bool)
)

# Apply Apriori algorithm 
def get_rules(support, confidence):
    """Mine association rules from ``one_encoded`` and return the top ones by lift.

    Reads the module-level ``one_encoded`` frame defined earlier in this cell.

    Parameters
    ----------
    support
        Passed to ``apriori`` as ``min_support``.
    confidence
        Passed to ``association_rules`` as ``min_threshold`` with
        ``metric="confidence"``.

    Returns
    -------
    DataFrame
        Columns ``antecedents``, ``consequents``, ``support``, ``confidence``
        and ``lift``, sorted by ``lift`` in descending order and truncated
        with ``head()`` (at most five rows).
    """
    itemsets = apriori(one_encoded, min_support=support, use_colnames=True)
    candidate_rules = association_rules(itemsets, metric="confidence", min_threshold=confidence)

    report_columns = ['antecedents', 'consequents', 'support', 'confidence', 'lift']
    ranked = candidate_rules[report_columns].sort_values('lift', ascending=False)
    return ranked.head()

# Example thresholds
# Each entry: (heading to print, minimum support, minimum confidence).
# Headings are kept as literal strings so the printed text stays exactly as
# written (formatting the floats would render 0.00001 as 1e-05).
threshold_runs = [
    ("Rules for support=0.00001 and confidence=0.05", 0.00001, 0.05),
    ("\nRules for support=0.00002 and confidence=0.1", 0.00002, 0.1),
    ("\nRules for support=0.0000005 and confidence=0.5", 0.0000005, 0.5),
]
for heading, min_support, min_confidence in threshold_runs:
    print(heading)
    print(get_rules(min_support, min_confidence))

Rules for support=0.00001 and confidence=0.05
                         antecedents                  consequents  support  \
21         (bed_bath_table, perfume)               (market_place)  0.00001   
24           (telephony, cool_stuff)                 (cine_photo)  0.00001   
8   (auto, fashion_bags_accessories)        (musical_instruments)  0.00001   
17    (bed_bath_table, garden_tools)  (construction_tools_lights)  0.00001   
7        (auto, musical_instruments)   (fashion_bags_accessories)  0.00001   

    confidence        lift  
21    1.000000  347.342857  
24    0.166667  249.374359  
8     1.000000  154.866242  
17    0.250000   99.647541  
7     1.000000   52.175966  

Rules for support=0.00002 and confidence=0.1
                   antecedents                 consequents   support  \
1  (fashion_childrens_clothes)  (fashion_bags_accessories)  0.000021   
0               (home_comfort)            (bed_bath_table)  0.000442   

   confidence       lift  
1    0.250000  13.043