In [2]:
import pandas as pd
import numpy as np
from pct.tree.heuristic.Heuristic import Heuristic5
from pct.tree.heuristic.NumericHeuristic import NumericHeuristic5
from pct.tree.splitter.splitter import Splitter
from pct.tree.tree import Tree
from pct.tree.ftest.ftest import FTest




## From YXH's data process

In [3]:

from sklearn.preprocessing import LabelEncoder

# Load the u.data dataset
u_data = pd.read_csv('ml-100k/u.data', sep='\t', header=None, names=['user_id', 'item_id', 'rating', 'timestamp'])

# Perform label encoding on user_id and item_id
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()

u_data['user_id'] = user_encoder.fit_transform(u_data['user_id'])
u_data['item_id'] = item_encoder.fit_transform(u_data['item_id'])

# Create the rating matrix
n_users = u_data['user_id'].nunique()
n_items = u_data['item_id'].nunique()

# Define the threshold
THRESHOLD = 3  # Ratings above this are "Lovers", below are "Haters"

# Create a user-item matrix with users as rows and items as columns, and fill missing values with 0
rating_matrix = u_data.pivot(index='user_id', columns='item_id', values='rating')

# Apply threshold classification to the rating matrix (after thresholding)
rating_matrix_thresholded = np.where(rating_matrix > THRESHOLD, 1, np.where(rating_matrix > 0, -1, 0))

# Convert numpy array to pandas DataFrame for easier inspection
rating_matrix_thresholded_df = pd.DataFrame(rating_matrix_thresholded)


# Create rI and rU indexes for training set (R)
# rI: item to user ratings (dictionary of items with lists of user ratings)
rI = {}
for _, row in u_data.iterrows():
    item_id = row['item_id']
    user_id = row['user_id']
    rating = row['rating']
    if item_id not in rI:
        rI[item_id] = []
    rI[item_id].append((user_id, rating))

# rU: user to item ratings (dictionary of users with lists of item ratings)
rU = {}
for _, row in u_data.iterrows():
    user_id = row['user_id']
    item_id = row['item_id']
    rating = row['rating']
    if user_id not in rU:
        rU[user_id] = []
    rU[user_id].append((item_id, rating))

In [None]:
rating_matrix

item_id,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
1,4.0,,,,,,,,,2.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,,,,,,,,,5.0,,...,,,,,,,,,,
939,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
940,5.0,,,,,,4.0,,,,...,,,,,,,,,,
941,,,,,,,,,,,...,,,,,,,,,,


In [None]:
rating_matrix_thresholded_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
0,1,-1,1,-1,-1,1,1,-1,1,-1,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,-1,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,-1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Optionally, print rI and rU to check the data structure for training and testing
print("\nSample rI (item to user ratings) for Training Set:")
for item in list(rI.keys())[:2]:  # Display sample item indices
    print(f"Item {item}: {rI[item]}")

print("\nSample rU (user to item ratings) for Training Set:")
for user in list(rU.keys())[:2]:  # Display sample user indices
    print(f"User {user}: {rU[user]}")


Sample rI (item to user ratings) for Training Set:
Item 241: [(np.int64(195), np.int64(3)), (np.int64(62), np.int64(3)), (np.int64(225), np.int64(5)), (np.int64(153), np.int64(3)), (np.int64(305), np.int64(5)), (np.int64(295), np.int64(4)), (np.int64(33), np.int64(5)), (np.int64(270), np.int64(4)), (np.int64(200), np.int64(4)), (np.int64(208), np.int64(4)), (np.int64(34), np.int64(2)), (np.int64(353), np.int64(5)), (np.int64(198), np.int64(5)), (np.int64(112), np.int64(2)), (np.int64(0), np.int64(5)), (np.int64(172), np.int64(5)), (np.int64(359), np.int64(4)), (np.int64(233), np.int64(4)), (np.int64(13), np.int64(4)), (np.int64(308), np.int64(4)), (np.int64(330), np.int64(4)), (np.int64(20), np.int64(3)), (np.int64(110), np.int64(4)), (np.int64(438), np.int64(5)), (np.int64(354), np.int64(4)), (np.int64(203), np.int64(5)), (np.int64(144), np.int64(5)), (np.int64(29), np.int64(5)), (np.int64(462), np.int64(2)), (np.int64(143), np.int64(4)), (np.int64(416), np.int64(3)), (np.int64(1), n

## initiate tree and splitter

In [6]:
tree = Tree(min_instances=5)
tree.fit(rating_matrix, u_data['rating'], target_weights=None, rI=rI, rU=rU)




✅ Converted x and y to DataFrame
✅ Assigned x and y
✅ Creating target weights...
✅ Identifying numerical and categorical attributes...
✅ Creating Splitter...
Initializing Splitter...
✅ Calling build()...
🌲 Building predictive clustering tree...
🔍 Finding best split item...


  return reduction(axis=axis, out=out, **passkwargs)


🍃 Creating leaf node...
🔍 Retrieving users who rated the best item...
🔍 Classifying users into three groups...
🔍 Ensuring 'unknowns' ...
🔍 Retrieving data subsets for each group...
🔍 Retrieving instance weights for each group...
🔍 Recursively building the tree for each group...
🌲 Building predictive clustering tree...
🔍 Finding best split item...
🍃 Creating leaf node...
🌲 Building predictive clustering tree...
🔍 Finding best split item...
🍃 Creating leaf node...
🌲 Building predictive clustering tree...
🔍 Finding best split item...
🍃 Creating leaf node...
✅ Tree built successfully!


  prototype /= np.sum(


<pct.tree.tree.Tree at 0x7f888818a670>

In [13]:
tree.print_tree_structure()

Node: 246_1
Attribute: 246
Value: [0.]
Criterion: None
Leaf Node: No
Children:
    Node: leaf_2=[5.] (1.0)
    Attribute: None
    Value: None
    Criterion: None
    Leaf Node: Yes
    Prediction: [5.]
    Node: leaf_3=[3.75] (4.0)
    Attribute: None
    Value: None
    Criterion: None
    Leaf Node: Yes
    Prediction: [3.75]
    Node: leaf_4=[nan] (0.0)
    Attribute: None
    Value: None
    Criterion: None
    Leaf Node: Yes
    Prediction: [nan]
