In [1]:
import pandas as pd 
import numpy as np 
import math as math
import joblib
from joblib import dump
import os

## LOAD DATA

In [2]:
def load_Dataset(baseFile, fold_Number): 
    rnmColData = ['user_id', 'item_id', 'rating', 'timestamp']
    base_File = f"{baseFile}/u{fold_Number}.base"
    test_File = f"{baseFile}/u{fold_Number}.test"
    base_Data = pd.read_csv(base_File, sep="\t", header=None, names=rnmColData)
    test_Data = pd.read_csv(test_File, sep="\t", header=None, names=rnmColData)
    base_Data = base_Data.drop(columns=["timestamp"])
    test_Data = test_Data.drop(columns=["timestamp"])
    return base_Data, test_Data

In [3]:
call_base = "ml-100k"
basedata, testdata = load_Dataset(call_base, 1)
basedata

Unnamed: 0,user_id,item_id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3
...,...,...,...
79995,943,1067,2
79996,943,1074,4
79997,943,1188,3
79998,943,1228,3


## CONVERT RATING MATRIKS

In [4]:
def ConvertRatingMatriks(ratingData):
    # membuat container untuk rating matriks dengan ukuran user x item
    matriks_rating = pd.DataFrame(np.zeros((943, 1682)), columns = list(range(1, 1683)), index = list(range(1, 944)))
    # merubah data frame ke dalam bentuk matriks rating pivot
    convertMatriksRating = ratingData.pivot_table(index = 'user_id', columns = 'item_id', values='rating')
    # mengisi matriks rating NaN dengan 0
    matriks_rating = convertMatriksRating.fillna(0)
    # updating matriks rating dengan matriks rating yang sudah di pivot
    matriks_rating.update(convertMatriksRating)
    # mengembalikan matriks rating
    return matriks_rating

## BASE MODEL

In [5]:
rating_Matriks = ConvertRatingMatriks(basedata)
rating_Matriks

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### MEAN 

In [6]:
def meanRating(RatingMatriks, jenis="user-based"):
    if jenis == "user-based":
        axis = 1
        index = RatingMatriks.index
    elif jenis == "item-based":
        axis = 0
        index = RatingMatriks.columns

    pembilang = RatingMatriks.sum(axis=axis)
    penyebut = np.count_nonzero(RatingMatriks, axis=axis)
    calculateMeanRating = np.where(penyebut == 0, 0, pembilang / penyebut)

    meanRatingNew = pd.DataFrame(calculateMeanRating, index=index, columns=["meanRating"])
    return meanRatingNew


#### USER

In [7]:
cal_MeanRatingUser = meanRating(rating_Matriks, jenis="user-based")
cal_MeanRatingUser

Unnamed: 0_level_0,meanRating
user_id,Unnamed: 1_level_1
1,3.681481
2,3.800000
3,3.000000
4,4.357143
5,2.956044
...,...
939,4.265306
940,3.457944
941,4.045455
942,4.265823


#### ITEM

In [8]:
cal_MeanRatingItem = meanRating(rating_Matriks, jenis="item-based")
cal_MeanRatingItem

Unnamed: 0_level_0,meanRating
item_id,Unnamed: 1_level_1
1,3.892950
2,3.180952
3,3.000000
4,3.526316
5,3.304348
...,...
1678,1.000000
1679,3.000000
1680,2.000000
1681,3.000000


### MEAN-CENTERED

In [9]:
def meanCenteredRating(RatingMatriks, meanRating, jenis="user-based"):
    npRatingMatriks = np.array(RatingMatriks)
    if jenis == "user-based":
        # Reshape meanRating menjadi (jumlah_user, 1)
        npMeanRating = np.array(meanRating).reshape(-1, 1)  # (943, 1)
    elif jenis == "item-based":
        # Reshape meanRating menjadi (1, jumlah_item)
        npMeanRating = np.array(meanRating).reshape(1, -1)  # (1, 1650)
    # Menghitung mean centered
    meanCentered = np.where(npRatingMatriks != 0, npRatingMatriks - npMeanRating, 0)
    # Mengembalikan dalam bentuk DataFrame
    dfMeanCentered = pd.DataFrame(meanCentered, index=RatingMatriks.index, columns=RatingMatriks.columns)
    return dfMeanCentered

#### USER

In [10]:
cal_MeanCenteredUser = meanCenteredRating(rating_Matriks, cal_MeanRatingUser, jenis="user-based")
cal_MeanCenteredUser

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.318519,-0.681481,0.318519,-0.681481,-0.681481,0.0,0.318519,-2.681481,1.318519,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.200000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,-1.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.734694,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.000000,0.000000,0.000000,-1.457944,0.000000,0.0,0.542056,1.542056,-0.457944,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,0.954545,0.000000,0.000000,0.000000,0.000000,0.0,-0.045455,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### ITEM

In [11]:
cal_MeanCenteredItem = meanCenteredRating(rating_Matriks, cal_MeanRatingItem, jenis="item-based")
cal_MeanCenteredItem

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.10705,-0.180952,1.0,-0.526316,-0.304348,0.0,0.201954,-2.99422,1.166667,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.10705,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,-1.876712,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,1.166667,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.00000,0.000000,0.0,-1.526316,0.000000,0.0,0.201954,1.00578,-0.833333,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,1.10705,0.000000,0.0,0.000000,0.000000,0.0,0.201954,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.00000,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.00000,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### SIMILARITY

In [None]:
def simRJ(RatingMatriks, index1, index2, jenis="user-based"):
    if jenis == "user-based":
        itemRated1 = set(np.where(RatingMatriks.loc[index1, :] != 0)[0])
        itemRated2 = set(np.where(RatingMatriks.loc[index2, :] != 0)[0])
    elif jenis == "item-based":
        itemRated1 = set(np.where(RatingMatriks.loc[:, index1] != 0)[0])
        itemRated2 = set(np.where(RatingMatriks.loc[:, index2] != 0)[0])
    
    intersection = len(itemRated1.intersection(itemRated2))
    iItemU = len(itemRated1)
    iItemV = len(itemRated2)

    # menghitung similarity
    if intersection != 0:
        rumusSimRJItem = 1/(1+(1/intersection)+(iItemU/1+iItemU)+(iItemV/1+iItemV))
    elif intersection == 0:
        rumusSimRJItem = 0
    return rumusSimRJItem
    

#### ITEM

##### RJ

In [17]:
def simRJItem(rating, itemU, itemV):
    # mendapatkan pengguna yang memberi rating pada itemU dan itemV
    userRatedItemU = set(np.where(rating.loc[:, itemU] != 0)[0])
    userRatedItemV = set(np.where(rating.loc[:, itemV] != 0)[0])
    
    # mencari irisan dari pengguna yang memberi rating pada itemU dan itemV
    intersection = len(userRatedItemU.intersection(userRatedItemV))
    iItemU = len(userRatedItemU)
    iItemV = len(userRatedItemV)

    # menghitung similarity
    if intersection != 0:
        rumusSimRJItem = 1/(1+(1/intersection)+(iItemU/1+iItemU)+(iItemV/1+iItemV))
    elif intersection == 0:
        rumusSimRJItem = 0
    
    return rumusSimRJItem

In [19]:
# fungi untuk menghitung semua user

def similarityRJItem2(rating):
    jumlahItem = rating.shape[1]

    # membuat matriks kosong untuk menyimpan hasil similarity
    simMatriks = np.zeros((jumlahItem, jumlahItem))
    # menghitung similarity antar pengguna
    for i in range(jumlahItem):
        for j in range(jumlahItem):
            if i == j:
                simMatriks[i][j] = simRJItem(rating, rating.columns[i], rating.columns[j])
            elif i != j :
                simMatriks[i][j] = simRJItem(rating, rating.columns[i], rating.columns[j])

    
    # mengubah matriks similarity menjadi dataframe
    simMatriksDf = pd.DataFrame(simMatriks, index=rating.columns, columns=rating.columns)

    return simMatriksDf

In [20]:
calRJALLItem2 = similarityRJItem2(rating_Matriks)
calRJALLItem2

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.000652,0.001024,0.001090,0.000902,0.001105,0.001239,0.000724,0.000898,0.000794,0.001095,...,0.001299,0.0,0.00000,0.00000,0.001299,0.000000,0.000000,0.000000,0.001299,0.001299
2,0.001024,0.002375,0.002770,0.001808,0.002865,0.003981,0.001212,0.001795,0.001422,0.002800,...,0.000000,0.0,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.004673,0.004673
3,0.001090,0.002770,0.003322,0.002028,0.003459,0.005229,0.001307,0.002012,0.001555,0.003366,...,0.000000,0.0,0.00000,0.00000,0.006494,0.000000,0.000000,0.000000,0.000000,0.006494
4,0.000902,0.001808,0.002028,0.001460,0.002079,0.002610,0.001045,0.001451,0.001198,0.002045,...,0.000000,0.0,0.00289,0.00289,0.002890,0.000000,0.000000,0.000000,0.002890,0.002890
5,0.001105,0.002865,0.003459,0.002079,0.003610,0.005556,0.001328,0.002062,0.001585,0.003506,...,0.000000,0.0,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.007042
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,0.166667,0.166667,0.166667,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,0.166667,0.166667,0.166667,0.000000,0.000000
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,0.166667,0.166667,0.166667,0.000000,0.000000
1681,0.001299,0.004673,0.000000,0.002890,0.000000,0.000000,0.001618,0.002857,0.002016,0.000000,...,0.000000,0.0,0.00000,0.00000,0.000000,0.000000,0.000000,0.000000,0.166667,0.000000


#### save modal sim

In [21]:
saveSimilarityItem = calRJALLItem2.to_csv("folds/simi/RJ/item/similarityRJItemFolds1.csv", index=True, header=True)
# saveSimilarityItem

# Save the similarity DataFrame to a file using joblib
joblib.dump(calRJALLItem2, 'folds/simi/RJ/item/similarityJRJItemFolds1.pkl')
# Load the similarity DataFrame from the file

['folds/simi/RJ/item/similarityJRJItemFolds1.pkl']

### TOP-K

In [28]:
#use .pkl to find topK

similarityPkl = joblib.load('folds/simi/jaccard/user/similarityJaccardFolds1.pkl')
similarityPkl

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.035503,0.018750,0.013605,0.102439,0.155660,0.101266,0.031250,0.013793,0.106280,...,0.136029,0.041916,0.099206,0.060606,0.080000,0.039548,0.115207,0.046667,0.070000,0.130597
2,0.035503,1.000000,0.046154,0.038462,0.007692,0.094891,0.032653,0.014493,0.061224,0.038760,...,0.033816,0.161765,0.144654,0.230769,0.138462,0.098765,0.088889,0.087719,0.062500,0.034826
3,0.018750,0.046154,1.000000,0.076923,0.008475,0.045455,0.008368,0.054545,0.000000,0.025210,...,0.015075,0.015152,0.055901,0.062500,0.038168,0.026667,0.071429,0.041667,0.038835,0.005128
4,0.013605,0.038462,0.076923,1.000000,0.009615,0.000000,0.013393,0.047619,0.000000,0.009346,...,0.005348,0.019231,0.040000,0.058824,0.033898,0.016129,0.043103,0.090909,0.056818,0.011111
5,0.102439,0.007692,0.008475,0.009615,1.000000,0.086486,0.105455,0.052174,0.019802,0.051136,...,0.147186,0.031746,0.040179,0.015504,0.064171,0.029412,0.093923,0.036697,0.075949,0.146018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.039548,0.098765,0.026667,0.016129,0.029412,0.052980,0.031496,0.012821,0.000000,0.014184,...,0.032407,0.257143,0.136905,0.126582,0.226562,1.000000,0.033113,0.092308,0.024000,0.058537
940,0.115207,0.088889,0.071429,0.043103,0.093923,0.160428,0.134752,0.045802,0.025862,0.161850,...,0.190678,0.050360,0.116592,0.097015,0.091371,0.033113,1.000000,0.057377,0.141104,0.113360
941,0.046667,0.087719,0.041667,0.090909,0.036697,0.039370,0.008584,0.083333,0.062500,0.008696,...,0.015544,0.109091,0.108108,0.127273,0.120690,0.092308,0.057377,1.000000,0.041237,0.027027
942,0.070000,0.062500,0.038835,0.056818,0.075949,0.098837,0.073529,0.028302,0.022472,0.074534,...,0.119469,0.035088,0.057416,0.053097,0.056497,0.024000,0.141104,0.041237,1.000000,0.083333


In [27]:
def TopKTetangga(similarity, k):
    # Mengambil k tetangga terdekat 
    # Mengurutkan similarity dalam urutan desending
    return np.argsort(-similarity)[:k]

#### JACCARD

### PREDIKSI

In [29]:
def prediksiCF(RatingMatriks, similarityFunction, mean, meanCen, user="user-2", item="item-2", k=2, jenis="userBased"):
    if jenis == "userBased":
        # mendapatkan item yang diberi rating oleh pengguna
        ratingMatriks = RatingMatriks.loc[:, item].to_numpy()
        # mendapatan mean centered
        meanCentered = meanCen.loc[:, item].to_numpy()
        # mendapatkan similarity
        similarity = similarityFunction.loc[user, :].to_numpy()
    elif jenis == "itemBased":
        # mendapatkan item yang diberi rating oleh pengguna
        ratingMatriks = RatingMatriks.loc[user, :].to_numpy()
        # mendapatan mean centered
        meanCentered = meanCen.loc[user, :].to_numpy()
        # mendapatkan similarity
        similarity = similarityFunction.loc[item, :].to_numpy()
    # cek index data user/item yang tidak sama dengan 0
    cek_Index = np.where(ratingMatriks != 0)
    # filter user/item rating, mean centered, similarity, sesuai index data yang tidak bernilai 0
    nilai_meanCentered = np.array(meanCentered)[cek_Index]
    nilai_similarity = similarity[cek_Index]
    # tetangga terdekat
    indexTetanggaSim = TopKTetangga(nilai_similarity, k)
    # pembilang 
    pembilang = np.sum(nilai_meanCentered[indexTetanggaSim] * nilai_similarity[indexTetanggaSim])
    # penyebut
    penyebut = np.abs(nilai_similarity[indexTetanggaSim]).sum()
    # cek apakah penyebut tidak sama dengan 0
    if penyebut != 0:
        if jenis == "userBased":
            # prediksi rating
            rumusPrediksi = mean.loc[user] + (pembilang / penyebut)
        elif jenis == "itemBased":
            # rumusPrediksi rating
            rumusPrediksi = mean.loc[item] + (pembilang / penyebut)
    else:
        if jenis == "userBased":
            rumusPrediksi = mean.loc[user] + 0
        elif jenis == "itemBased":
            rumusPrediksi = mean.loc[item] + 0

    return [item, float(rumusPrediksi)] 

In [30]:
def hitungKeseluruhanPrediksi(RatingMatriks, similarityFunction, mean, meanCen, k=2, jenis="userBased"):
    # Membuat DataFrame kosong untuk menyimpan prediksi
    prediksiMatriks = pd.DataFrame(index=RatingMatriks.index, columns=RatingMatriks.columns)
    # Iterasi untuk setiap user dan item dalam matriks rating
    for user in RatingMatriks.index:
        for item in RatingMatriks.columns:
            # Cek apakah rating untuk (user, item) adalah 0 atau tidak ada rating
            if RatingMatriks.loc[user, item] == 0:
                # Jika tidak ada rating, prediksi nilai rating menggunakan Collaborative Filtering
                prediksiMatriks.loc[user, item] = prediksiCF(RatingMatriks, similarityFunction, mean, meanCen, user, item, k, jenis)[1]
            else:
                # Jika sudah ada rating, simpan rating asli dari RatingMatriks
                prediksiMatriks.loc[user, item] = RatingMatriks.loc[user, item]
    # Mengembalikan DataFrame prediksi
    return prediksiMatriks

#### JACCARD

##### USER

In [None]:
variasiParameterKUser = [5, 10, 15, 18, 20, 25, 30, 40, 50, 100, 200]

In [31]:
def VariasiParameterKuser(rating_Matriks, similarityPkl, cal_MeanRatingUser, cal_MeanCenteredUser, variasiParameterKUser, folder="folds/base/prediksiJaccard"):
    for k in variasiParameterKUser:
        # prediksi rating user
        prediksiRatingUser = hitungKeseluruhanPrediksi(rating_Matriks, similarityPkl, cal_MeanRatingUser, cal_MeanCenteredUser, k=k, jenis="userBased")


        # simpan hasil prediksi
        k_Folder = os.path.join(folder, str(k))
        fileName = os.path.join(k_Folder, "prediksiRatingUser.pkl")
        dump(prediksiRatingUser, fileName)
        print(f"Prediksi Rating User dengan K = {k} : \n", prediksiRatingUser)


In [32]:
variasiParameterKUser2= [10, 15, 18, 20, 25, 30, 40, 50, 100, 200]

In [None]:
variasParameterTest = VariasiParameterKuser(rating_Matriks, similarityPkl, cal_MeanRatingUser, cal_MeanCenteredUser, variasiParameterKUser2)
variasParameterTest

Prediksi Rating User dengan K = 10 : 
 item_id      1         2         3         4         5         6         7     \
user_id                                                                         
1             5.0       3.0       4.0       3.0       3.0  3.428574       4.0   
2             4.0  3.082026   2.70023  2.943387  3.406015  4.071789  3.731388   
3        3.685022  2.189235   2.40361   2.39746  2.775452  2.585162  3.486864   
4        4.866876  3.415451  4.118955  3.855727  3.874346  4.182323  4.749067   
5        3.813181  2.830755  2.238277  3.026852  2.916303  2.577688  3.340588   
...           ...       ...       ...       ...       ...       ...       ...   
939      4.961745  4.043474  3.844867  3.905294  4.258798  4.652536  4.194443   
940      3.840441  3.245106  3.124006       2.0  2.971567  3.152146       4.0   
941           5.0  3.556297  3.828685  3.680798  3.540007   4.14252       4.0   
942      4.791495  3.994308  3.824021  4.110618  4.101831  4.013029   

##### ITEM

In [36]:
similarityPklItem = joblib.load('folds/simi/jaccard/item/similarityJaccardItemFolds1.pkl')
similarityPklItem

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.178744,0.150754,0.236607,0.091787,0.022843,0.416838,0.246637,0.302277,0.106796,...,0.002611,0.0,0.000000,0.000000,0.002611,0.0,0.0,0.0,0.002611,0.002611
2,0.178744,1.000000,0.139241,0.301887,0.175676,0.041667,0.187320,0.168067,0.103774,0.072289,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.009524,0.009524
3,0.150754,0.139241,1.000000,0.160377,0.090909,0.043956,0.147147,0.127273,0.150538,0.112782,...,0.000000,0.0,0.000000,0.000000,0.013333,0.0,0.0,0.0,0.000000,0.013333
4,0.236607,0.301887,0.160377,1.000000,0.170732,0.043716,0.257895,0.269373,0.208696,0.104072,...,0.000000,0.0,0.005848,0.005848,0.005848,0.0,0.0,0.0,0.005848,0.005848
5,0.091787,0.175676,0.090909,0.170732,1.000000,0.011364,0.139394,0.100000,0.097561,0.028986,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.014493
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,1.0,1.0,1.0,0.000000,0.000000
1681,0.002611,0.009524,0.000000,0.005848,0.000000,0.000000,0.003257,0.005780,0.004065,0.000000,...,0.000000,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.000000,0.000000


In [30]:
variasiParameterKItem2= [20]

In [33]:
def VariasiParameterKitem(rating_Matriks, similarityPkl, cal_MeanRatiingItem, cal_MeanCenterdItem, variasiParameterKItem, folder="folds/base/prediksiJaccard"):
    for k in variasiParameterKItem:
        # prediksi rating item
        prediksiRatingItem = hitungKeseluruhanPrediksi(rating_Matriks, similarityPkl, cal_MeanRatiingItem, cal_MeanCenterdItem, k=k, jenis="itemBased")


        # simpan hasil prediksi
        k_Folder = os.path.join(folder, str(k))
        fileName = os.path.join(k_Folder, "prediksiRatingItem.pkl")
        dump(prediksiRatingItem, fileName)
        print(f"Prediksi Rating Item dengan K = {k} : \n", prediksiRatingItem)

In [37]:
variasiParameterItemTest = VariasiParameterKitem(rating_Matriks, similarityPklItem, cal_MeanRatiingItem, cal_MeanCenterdItem, variasiParameterKItem2)
variasiParameterItemTest

Prediksi Rating Item dengan K = 20 : 
 item_id      1         2         3         4         5         6         7     \
user_id                                                                         
1             5.0       3.0       4.0       3.0       3.0  3.734238       4.0   
2             4.0  3.384031  3.141674  3.782772  3.509136  3.601089  4.026222   
3        3.314828  2.730854  2.453912  2.889695  2.900274  3.056189  3.194754   
4         4.43134  3.633199  3.641014  3.982584  3.928643  4.498459  4.365417   
5        3.951217  3.268206  2.962717  3.251767  2.796162  2.577517  3.805576   
...           ...       ...       ...       ...       ...       ...       ...   
939      4.983637  4.315029  4.255486  4.702212  4.528999  4.377967  4.884502   
940      3.768573  2.821651  2.827741       2.0   3.09104  2.859981       4.0   
941           5.0  3.557531  3.373307  3.912848   3.68891  3.836517       4.0   
942      4.445554  3.850113  3.734396  4.048385  3.942916  3.868599  4

### HYBRID

In [2]:
def HybridFiltering(PrediksiUser, PrediksiItem, gamma=0.7):
    # Mengonversi ke numpy array untuk perhitungan usser dan item
    prediksiUser = np.array(PrediksiUser)
    prediksiItem = np.array(PrediksiItem)
    # Menghitung prediksi hybrid
    rumusHybrid = gamma * prediksiUser + (1 - gamma) * prediksiItem
    # mengembalikan hasil prediksi hybrid
    return rumusHybrid

In [4]:
# Loop untuk variasi parameter K user dan k item pada hybrid
OpenPrediksiUser5 = joblib.load('folds/base/prediksiJaccard/5/prediksiRatingUser.pkl')
OpenPrediksiItem20 = joblib.load('folds/base/prediksiJaccard/20/prediksiRatingItem.pkl')

variasai1Parameter = HybridFiltering(OpenPrediksiUser5, OpenPrediksiItem20, gamma=0.7)
variasai1Parameter



array([[5.0, 3.0, 4.0, ..., 2.586356679094288, 3.573784973328384,
        3.272636506658987],
       [4.0, 3.271114381949305, 3.017188545874051, ...,
        2.5159514226980955, 3.6532653576315854, 3.4541274577083176],
       [3.664764474397259, 2.068780254200603, 2.2635875939642958, ...,
        1.913608354349484, 3.034814918450624, 2.404760957233546],
       ...,
       [5.0, 3.5708434023281406, 3.7950026221879813, ...,
        2.781592890147225, 3.816859639117496, 3.6048592541440287],
       [4.7483932291840345, 3.824876227398864, 3.554956798872428, ...,
        2.896636199179987, 4.06628247214439, 3.8304793952229206],
       [4.001593643100607, 5.0, 4.0547812466601565, ..., 2.9875,
        3.3270630004963033, 3.0306808624420793]], dtype=object)

In [18]:
# menjadi df

cal_HybridJaccard = pd.DataFrame(HybridFiltering(OpenPrediksiUser5, OpenPrediksiItem20, gamma=0.7), index=OpenPrediksiUser5.index, columns=OpenPrediksiUser5.columns)
cal_HybridJaccard

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,3.373355,4.0,1.0,5.0,3.918952,...,2.644312,3.79339,3.067821,2.067821,3.478335,1.586357,3.586357,2.586357,3.573785,3.272637
2,4.0,3.271114,3.017189,3.408163,3.777133,4.461393,3.930252,4.45463,4.343244,2.0,...,2.778854,3.934706,3.278433,2.278433,3.477556,1.515951,3.515951,2.515951,3.653265,3.454127
3,3.664764,2.06878,2.263588,2.986487,3.029622,2.577108,3.194165,3.221092,3.617479,3.378328,...,1.73329,3.03242,2.525192,1.525192,2.802551,0.913608,2.913608,1.913608,3.034815,2.404761
4,4.86221,3.236948,3.893285,3.885992,3.900799,4.449481,4.756084,4.564617,4.768798,4.522657,...,2.866985,4.18327,3.900589,2.900589,3.969186,2.359911,4.359911,3.359911,4.069952,3.730588
5,3.81046,2.951473,2.514216,2.909989,2.988051,2.271007,3.708853,3.785686,3.033168,3.265721,...,1.833384,3.153706,2.496755,1.496755,2.711136,2.369231,2.969231,2.669231,2.830023,2.611517
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,4.838311,4.065116,4.111478,3.96061,4.285808,4.837541,4.00287,4.92219,5.0,4.603543,...,3.399594,4.420466,3.832092,2.832092,4.097852,2.180533,4.180533,3.180533,4.248364,3.888925
940,3.512784,3.123083,2.589892,2.0,2.638109,2.849822,4.0,5.0,3.0,3.586465,...,2.319451,3.507866,2.757821,1.757821,3.112226,1.147886,3.147886,2.147886,3.260358,2.762873
941,5.0,3.570843,3.795003,3.240989,3.72602,4.037658,4.0,4.808478,4.043456,3.981155,...,3.079816,4.110704,3.5553,2.5553,3.702632,1.781593,3.781593,2.781593,3.81686,3.604859
942,4.748393,3.824876,3.554957,4.062903,4.020377,4.010546,4.23417,4.997227,4.429399,4.561315,...,3.232626,4.235541,3.808089,2.808089,3.847601,1.896636,3.896636,2.896636,4.066282,3.830479


### LOOP VARIASI PERHITUNGAN HYBRID

In [None]:
# Loop dari 1 sampai 10 untuk variasi file user

for i in range(variasiParameterKUser2[0], 11):
    user_path = f'folds/base/prediksiJaccard/{i}/prediksiRatingUser.pkl'
    item_path = 'folds/base/prediksiJaccard/20/prediksiRatingItem.pkl'  # Tetap pakai item ke-20
    try:
        prediksi_user = joblib.load(user_path)
        prediksi_item = joblib.load(item_path)

        hybrid_model = HybridFiltering(prediksi_user, prediksi_item, gamma=0.7)
        
        # Lakukan sesuatu dengan hybrid_model
        print(f'Variasi {i}: Hybrid model berhasil dibuat')
    except Exception as e:
        print(f'Gagal memproses variasi {i}: {e}')

### TOP-N REKOMENDASI

In [None]:
def TopNRecommendationPerUser(hybridPredictions, N=5):
    # Membuat dictionary untuk menyimpan rekomendasi teratas per user
    topNPerUser = {}
    # Iterasi setiap user dalam DataFrame untuk mendapatkan N rekomendasi teratas
    for user in hybridPredictions.index:
        # Ambil prediksi untuk user tertentu
        userPredictions = hybridPredictions.loc[user]
        # Menggunakan argsort untuk mengurutkan prediksi rating dari yang tertinggi
        # argsort mengembalikan indeks yang diurutkan dari array
        topNIndices = userPredictions.argsort()[::-1][:N] 
        # Ambil item terkait dengan indeks teratas
        topNItems = userPredictions.index[topNIndices].tolist()
        # Simpan hasil rekomendasi untuk user tersebut
        topNPerUser[user] = topNItems
    # mengembalikan hasil rekomendasi
    return topNPerUser

In [1]:
def FullRecommendation(prediksi_matrix):
    ranking_per_user = np.argsort(-prediksi_matrix, axis=1)+1
    return ranking_per_user


In [None]:
cal_HybirdVariasi1 = FullRecommendation(variasai1Parameter)
cal_HybirdVariasi1

array([[1450, 1567, 1480, ..., 1629, 1589, 1627],
       [1567, 1621, 1285, ...,  309, 1586, 1627],
       [ 321,  340,  320, ..., 1589, 1627, 1586],
       ...,
       [1450, 1567, 1480, ..., 1586, 1589, 1627],
       [1450, 1567, 1480, ..., 1589, 1586, 1627],
       [1450,  100,   56, ..., 1629, 1589, 1627]], dtype=int64)

## TEST MODEL

### GROUND TRUTH

In [19]:
#to list
ground_truth = ConvertRatingMatriks(testdata).values.tolist()
ground_truth


[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  5.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  5.0,
  0.0,
  5.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  4.0,
  0.0,
  0.0,
  4.0,
  3.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  4.0,
  0.0,
  0.0,
  2.0,
  0.0,
  0.0,
  4.0,
  0.0,
  0.0,
  0.0,
  0.0,
  5.0,
  0.0,
  0.0,
  4.0,
  0.0,
  3.0,
  0.0,
  4.0,
  0.0,
  3.0,
  3.0,
  0.0,
  4.0,
  0.0,
  0.0,
  0.0,
  5.0,
  4.0,
  3.0,
  0.0,
  5.0,
  4.0,
  0.0,
  3.0,
  0.0,
  3.0,
  3.0,
  0.0,
  4.0,
  3.0,
  1.0,
  0.0,
  4.0,
  0.0,
  1.0,
  0.0,
  4.0,
  5.0,
  5.0,
  0.0,
  4.0,
  3.0,
  5.0,
  0.0,
  0.0,
  0.0,
  4.0,
  5.0,
  3.0,
  0.0,
  0.0,
  0.0,
  5.0,
  3.0,
  4.0,
  0.0,
  5.0,
  0.0,
  2.0,
  1.0,
  1.0,
  0.0,
  0.0,
  4.0,
  5.0,
  0.0,
  0.0,
  0.0,
  1.0,
  5.0,
  5.0,
  0.0,
  0.0,
  3.0,
  3.0,
  0.0,
  1.0,
  4.0,
  0.0,
  0.0,
  0.0,
  3.0,
  0.0,
  0.0,
  4.0,
  5.0,
  3.0,
  0.0,
  4.0,
  0.0,
  4.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  1.0,
  0.0,
  0.0,
  1.0,

### NDCG

In [20]:
def EvaluasiDCG(GroundTruth, TopNRekomendasi, N):
    dcg_score = 0.0
    for i in range(min(N, len(TopNRekomendasi))):
        if TopNRekomendasi[i] in GroundTruth:
            dcg_score += 1 / np.log2(i + 2)  
    return dcg_score

In [21]:
cal_DCG = EvaluasiDCG(ground_truth[0], cal_HybirdVariasi1[0], 5)
cal_DCG

0.43067655807339306

In [22]:
def EvaluasiIDCG(N):
    # Inisialisasi IDCG
    idcg = 0.0
    # Hitung IDCG untuk N item teratas
    for n in range(1, N + 1):
        # Menghitung nilai relevansi ideal
        relevansi_ideal = 1  # Asumsikan rating tertinggi adalah 1
        # Menambahkan ke IDCG dengan formula relevansi/log2 posisi
        idcg += relevansi_ideal / math.log2(n + 1)
    # Mengembalikan nilai IDCG
    return idcg

In [23]:
def EvaluasiNDCG(GroundTruth, TopNRekomendasi, N):
    # Hitung DCG
    dcg = EvaluasiDCG(GroundTruth, TopNRekomendasi, N)
    # Hitung IDCG
    idcg = EvaluasiIDCG(N)
    # Menghitung NDCG
    rumusNDCG = dcg / idcg if idcg != 0 else 0

    return rumusNDCG


In [26]:
variasiNDCG = list(range(1, 100))  # dari 1 sampai 20
ndcg_scores = {}

for N in variasiNDCG:
    score = EvaluasiNDCG(ground_truth[0], cal_HybirdVariasi1[0], N=N)
    ndcg_scores[N] = score
    print(f"NDCG@{N} = {score:.4f}")

NDCG@1 = 0.0000
NDCG@2 = 0.0000
NDCG@3 = 0.0000
NDCG@4 = 0.1681
NDCG@5 = 0.1461
NDCG@6 = 0.1303
NDCG@7 = 0.1184
NDCG@8 = 0.1089
NDCG@9 = 0.1012
NDCG@10 = 0.0948
NDCG@11 = 0.0893
NDCG@12 = 0.0846
NDCG@13 = 0.0804
NDCG@14 = 0.0768
NDCG@15 = 0.0735
NDCG@16 = 0.0705
NDCG@17 = 0.0679
NDCG@18 = 0.0654
NDCG@19 = 0.0632
NDCG@20 = 0.0612
NDCG@21 = 0.0593
NDCG@22 = 0.0575
NDCG@23 = 0.0559
NDCG@24 = 0.0544
NDCG@25 = 0.0530
NDCG@26 = 0.0516
NDCG@27 = 0.0504
NDCG@28 = 0.0492
NDCG@29 = 0.0481
NDCG@30 = 0.0470
NDCG@31 = 0.0460
NDCG@32 = 0.0451
NDCG@33 = 0.0441
NDCG@34 = 0.0433
NDCG@35 = 0.0425
NDCG@36 = 0.0417
NDCG@37 = 0.0409
NDCG@38 = 0.0402
NDCG@39 = 0.0395
NDCG@40 = 0.0388
NDCG@41 = 0.0382
NDCG@42 = 0.0376
NDCG@43 = 0.0370
NDCG@44 = 0.0364
NDCG@45 = 0.0359
NDCG@46 = 0.0353
NDCG@47 = 0.0348
NDCG@48 = 0.0343
NDCG@49 = 0.0339
NDCG@50 = 0.0334
NDCG@51 = 0.0329
NDCG@52 = 0.0325
NDCG@53 = 0.0321
NDCG@54 = 0.0317
NDCG@55 = 0.0313
NDCG@56 = 0.0309
NDCG@57 = 0.0305
NDCG@58 = 0.0302
NDCG@59 = 0.0298
NDCG@6

## SAVE MODEL

### SAVE PATH FOLDER 

### SAVE MODEL #1