In [4]:
import random
from collections import defaultdict
from csv import reader
from itertools import chain, combinations
from random import shuffle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [54]:
class Node:
    """One node of a prefix tree of items.

    Attributes (all assigned in ``__init__``):
        itemName: label of the item stored at this node.
        count:    frequency value passed in by the creator of the node.
        parent:   the node this one hangs under.
        children: dict mapping an item name to the child ``Node`` for it.
        next:     initialised to ``None``; not used by the code in this class.
    """

    def __init__(self, itemName, frequency, parentNode):
        self.itemName = itemName
        self.count = frequency
        self.parent = parentNode
        self.children = {}
        self.next = None

    def display(self, ind=1):
        """Print this node and its whole subtree, one node per line.

        Args:
            ind: indentation level of this node; each level adds two spaces.
        """
        print('  ' * ind, self.itemName, ' ', self.count)
        for child in self.children.values():
            # Children go one level deeper so the printed output shows the
            # parent/child structure instead of a flat list.
            child.display(ind + 1)


class Trie:
    """Prefix tree of transactions with a per-item header table.

    Attributes:
        root:        sentinel ``Node`` with no item name and no count.
        minSup:      support threshold used by ``getMinSupItemSet``.
        headerTable: dict mapping an item to the list of tree nodes that
                     carry that item (each node listed once).
    """

    def __init__(self, minSup):
        self.root = Node(None, None, None)
        self.minSup = minSup
        self.headerTable = {}

    def addTransaction(self, transaction):
        """Insert one transaction as a path starting at the root.

        Existing nodes along the path get their count incremented; missing
        nodes are created with count 1 and registered in ``headerTable``.

        Args:
            transaction: iterable of hashable items, in insertion order.
        """
        currentNode = self.root
        for item in transaction:
            child = currentNode.children.get(item)
            if child is None:
                child = Node(item, 1, currentNode)
                currentNode.children[item] = child
                # Register the node only when it is created, so the header
                # table holds each node exactly once.
                self.headerTable.setdefault(item, []).append(child)
            else:
                child.count += 1
            currentNode = child

    def getMinSupItemSet(self):
        """Return the single-item sets whose support reaches ``minSup``.

        An item's support is the sum of the counts of every node carrying it:
        the same item can sit on several branches, and each branch holds only
        part of its occurrences.
        NOTE(review): this equals the number of transactions containing the
        item only if an item appears at most once per transaction -- confirm.

        Returns:
            set of ``frozenset([item])`` for each qualifying item.
        """
        minSupItemSet = set()
        for item, nodes in self.headerTable.items():
            if sum(node.count for node in nodes) >= self.minSup:
                minSupItemSet.add(frozenset([item]))
        return minSupItemSet

    def printTree(self):
        """Print the whole tree starting from the root node."""
        self.root.display()




In [19]:

# Fixed seed so the per-user shuffle below -- and any split derived from it --
# is reproducible across kernel restarts.
SHUFFLE_SEED = 42

movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

# Users with more than 10 ratings; counted on all ratings, before the
# rating filter below is applied.
user_ratings_counts = ratings_df['userId'].value_counts()
active_users = user_ratings_counts[user_ratings_counts > 10].index

# Keep only ratings above 2, then only the rows belonging to active users.
filtered_ratings_df = ratings_df[ratings_df['rating'] > 2]
filtered_ratings_df = filtered_ratings_df[filtered_ratings_df['userId'].isin(active_users)]

# One row per user, holding the list of movieIds kept for that user.
transactional_data = filtered_ratings_df.groupby('userId')['movieId'].apply(list).reset_index()

transactions = transactional_data['movieId'].tolist()

# Dedicated seeded generator: shuffles each transaction in place without
# touching the global random state.
shuffle_rng = random.Random(SHUFFLE_SEED)
for transaction in transactions:
    shuffle_rng.shuffle(transaction)

transactions[0]

[596,
 2899,
 1644,
 2450,
 2353,
 1278,
 3273,
 1256,
 316,
 333,
 3034,
 2406,
 423,
 4006,
 1517,
 260,
 157,
 216,
 235,
 1049,
 1617,
 1029,
 2094,
 3062,
 3247,
 2427,
 3702,
 2616,
 2161,
 1030,
 1620,
 3386,
 804,
 2329,
 362,
 736,
 1127,
 3253,
 592,
 151,
 223,
 2648,
 1954,
 1224,
 1291,
 2761,
 2470,
 1031,
 101,
 1927,
 590,
 3729,
 1198,
 553,
 543,
 3441,
 2273,
 349,
 2366,
 1222,
 2947,
 2692,
 923,
 3489,
 1258,
 1097,
 457,
 1270,
 1408,
 1089,
 2797,
 2291,
 2993,
 2459,
 2078,
 2985,
 2268,
 2414,
 47,
 2005,
 3168,
 2193,
 1136,
 500,
 50,
 2105,
 231,
 480,
 2596,
 356,
 3243,
 1625,
 943,
 527,
 2174,
 296,
 2096,
 1967,
 2991,
 2948,
 1,
 1092,
 3639,
 2716,
 3033,
 2090,
 3671,
 1213,
 1298,
 2478,
 3578,
 1025,
 1445,
 1920,
 1220,
 2492,
 2640,
 780,
 2997,
 163,
 1080,
 2018,
 367,
 2657,
 3450,
 1023,
 1377,
 3448,
 2058,
 1396,
 2987,
 2641,
 3439,
 1009,
 1206,
 2654,
 1073,
 1265,
 673,
 3809,
 2012,
 2143,
 3527,
 2502,
 2700,
 2046,
 1348,
 2644,
 18

In [60]:
# Fraction of each transaction (taken from its front) written to train.csv;
# the remainder goes to test.csv.
div = 0.8

# One line per transaction in each file, items comma-separated. Line i of
# train.csv and line i of test.csv come from the same transaction.
# `with` guarantees both files are closed even if a write fails.
with open("train.csv", "w") as f1, open("test.csv", "w") as f2:
    for transaction in transactions:
        length = int(len(transaction) * div)
        # join places separators by position, so a repeated item value can
        # never cause a comma to be dropped in the middle of a line.
        f1.write(",".join(str(item) for item in transaction[:length]) + "\n")
        f2.write(",".join(str(item) for item in transaction[length:]) + "\n")

In [125]:
# Items occurring fewer than this many times in the training data are dropped.
MIN_ITEM_COUNT = 50
# Support threshold handed to the Trie.
TRIE_MIN_SUPPORT = 500

# Read one transaction per line. Empty fields are skipped so a blank line
# becomes an empty transaction rather than a transaction holding the item ''.
# Blank lines are kept (as []) so row positions still match the file.
with open("train.csv", "r") as f1:
    train_data = [
        [item for item in line.strip().split(",") if item]
        for line in f1
    ]

print(len(train_data[0]))
print(train_data[0][0] if train_data[0] else None)

# get counts of items
itemCounts = defaultdict(int)
for transaction in train_data:
    for item in transaction:
        itemCounts[item] += 1


def sort_transaction(transaction):
    """Return the items ordered by descending count in ``itemCounts``.

    Ties are broken by the item itself, so every transaction uses the same
    global item order regardless of the order the items arrived in.
    """
    return sorted(transaction, key=lambda x: (-itemCounts[x], x))


# Order every transaction consistently and drop infrequent items. Filtering
# checks every position, including the first one.
train_data = [
    [item for item in sort_transaction(transaction)
     if itemCounts[item] >= MIN_ITEM_COUNT]
    for transaction in train_data
]

trie = Trie(TRIE_MIN_SUPPORT)
for transaction in train_data:
    trie.addTransaction(transaction)

root = trie.root
print(len(root.children))





180
596
62
