In [5]:
import pandas as pd
import os
import numpy as np
import random

base_path : str = os.path.dirname(os.getcwd())
CSV_PATH  : str = base_path + '\\csv'
SRC_PATH  : str = base_path + '\\src'
JSON_PATH : str = base_path + '\\json'

In [6]:
user_rating = pd.read_json(JSON_PATH + r'\USER_RATING_DATA.json')
user_rating

Unnamed: 0,USER_ID,RATED,NAN_RATED
0,5,"{'1': 2.0, '4': 2.0, '6': 3.0, '8': 4.0, '12':...","{'0': -1.0, '2': -1.0, '3': -1.0, '5': -1.0, '..."
1,12,"{'2': 2.0, '3': 5.0, '5': 4.0, '9': 1.0, '10':...","{'0': -1.0, '1': -1.0, '4': -1.0, '6': -1.0, '..."
2,19,"{'0': 5.0, '1': 2.0, '5': 2.0, '6': 5.0, '10':...","{'2': -1.0, '3': -1.0, '4': -1.0, '7': -1.0, '..."
3,24,"{'1': 5.0, '4': 5.0, '5': 5.0, '6': 5.0, '8': ...","{'0': -1.0, '2': -1.0, '3': -1.0, '7': -1.0, '..."
4,31,"{'1': 0.0, '6': 3.0, '7': 1.0, '10': 0.0, '11'...","{'0': -1.0, '2': -1.0, '3': -1.0, '4': -1.0, '..."
...,...,...,...
353,1796,"{'0': 3.0, '1': 1.0, '3': 3.0, '6': 0.0, '7': ...","{'2': -1.0, '4': -1.0, '5': -1.0, '8': -1.0, '..."
354,1804,"{'1': 3.0, '2': 2.0, '3': 4.0, '9': 3.0, '10':...","{'0': -1.0, '4': -1.0, '5': -1.0, '6': -1.0, '..."
355,1813,"{'0': 3.0, '1': 5.0, '2': 5.0, '4': 3.0, '5': ...","{'3': -1.0, '7': -1.0, '8': -1.0, '9': -1.0, '..."
356,1815,"{'2': 4.0, '3': 3.0, '4': 2.0, '5': 1.0, '48':...","{'0': -1.0, '1': -1.0, '6': -1.0, '7': -1.0, '..."


In [7]:
movie_distance = pd.read_json(JSON_PATH + r'\movie_distance_graph.json')

In [8]:
def generate_weight_samples(num_samples=10000, weight_count=5, min_val=0.5, max_val=1.1, tailored_tholds: bool = True):
    import random
    samples = []   
    
    if tailored_tholds:
        t1 = [0.2, 0.4, 0.6, 0.8, 1.01]
        t2 = t1
        t3 = [0.33, 0.66, 1.01]
        t4 = [0.5, 0.9, 1.01]
        t5 = t1
        for p1 in t1:        
            for p2 in t2:        
                for p3 in t3:       
                    for p4 in t4:       
                        for p5 in t5:    
                            samples.append([p1,p2,p3,p4,p5])   

    else:
        for _ in range(num_samples):
            # Generate weights freely within the range
            weights = [random.uniform(min_val, max_val) for _ in range(weight_count)]
            samples.append(weights)
    
    return samples

PARAMS_WEIGHTS = generate_weight_samples()
len(PARAMS_WEIGHTS)

1125

In [9]:
def decision_tree(movie_id, all_distances, training_ids, rating_data, threshold) -> int:

    """
        Dla movie_id wyszukujemyt filmy, które oddalone są od movie_id o mniej niż THRESHOLD.
        Jeśli taki jest, to zapamiętujemy jego ocenę, a potem ze wszystkich wyciątgamy średnią.
    """
    
    decisions: list = []

    for rated_movie in training_ids:

        feature_counter: int = 0

        for feat_dist_i, feature_distance in enumerate(all_distances[movie_id][int(rated_movie)]):

            if feature_distance < threshold[feat_dist_i]:

                feature_counter += 1

        if feature_counter == 5:

            decisions.append(rating_data[str(rated_movie)])


    return np.ceil(np.average(decisions)) if len(decisions) > 0 else None

def optimize_user(user, validate_ids, training_ids) -> tuple[float, list]:

    max_accuracy: float = -1.0
    best_weights: list = []

    for weights in PARAMS_WEIGHTS:

        accuracy: float = 0.0

        for validate_movie in validate_ids:

            # output = decision_tree(movie_id=validate_movie, all_distances=movie_distance, training_ids=training_ids, rating_data={k: v for k, v in user_rating["RATED"][user].items() if int(k) in [training_ids+validate_ids]}, threshold=weights)
            output = decision_tree(movie_id=validate_movie, all_distances=movie_distance, training_ids=training_ids, rating_data=user_rating["RATED"][user], threshold=weights)

            # print(f"Output: {output}, True value: {user_rating['RATED'][user][str(validate_movie)]}")

            if output is not None and output == user_rating['RATED'][user][str(validate_movie)]:
                accuracy += 1.0
        
        accuracy = accuracy/len(validate_ids)

        if accuracy > max_accuracy:

            max_accuracy = accuracy
            best_weights = weights
    
    return (max_accuracy, best_weights)

def test_user(user_id, best_weights, test_ids, train_ids) -> float:

    accuracy: float = 0.0

    for test_movie in test_ids:

        # output = decision_tree(movie_id=validate_movie, all_distances=movie_distance, training_ids=training_ids, rating_data={k: v for k, v in user_rating["RATED"][user].items() if int(k) in [training_ids+validate_ids]}, threshold=weights)
        output = decision_tree(movie_id=test_movie, all_distances=movie_distance, training_ids=train_ids, rating_data=user_rating["RATED"][user_id], threshold=best_weights)

        if output is not None and output == user_rating['RATED'][user_id][str(test_movie)]:
            accuracy += 1.0
    
    accuracy = accuracy/len(test_ids)

    return accuracy


In [10]:
NUM_OF_CROSS_VALIDATION = 3
NUM_OF_TRAINED_USERS = len(user_rating)

list_best_weights_out: list = []

for user in range(NUM_OF_TRAINED_USERS):

    accuracy = 0
    best_k_out = 0
    best_weights_out = [1.1,1.1,1.1,1.1,1.1]

    for validation_id in range(NUM_OF_CROSS_VALIDATION):

        keys = list(user_rating['RATED'][user].keys())
        random.shuffle(keys)
        split_index = int(len(keys) * 0.9)
        train_valid_keys = keys[:split_index]
        test_keys = keys[split_index:]

        keys = list(train_valid_keys)
        random.shuffle(keys)
        split_index = int(len(keys) * 0.85)
        train_keys = keys[:split_index]
        valid_keys = keys[split_index:]


        test_ids = [int(key) for key in test_keys]
        validate_ids = [int(key) for key in valid_keys]
        training_ids = [int(key) for key in train_keys]

        # print(f'Train keys: {train_keys}\nValid keys: {valid_keys}\nTest keys: {test_keys}')

        _, temp_best_weights_out = optimize_user(user, validate_ids, training_ids)

        temp_accuracy = test_user(user_id=user, best_weights=temp_best_weights_out, test_ids=test_ids, train_ids=train_valid_keys)

        if temp_accuracy > accuracy:
            accuracy = temp_accuracy
            best_weights_out = temp_best_weights_out
        
    print(f"{user}. User: {user_rating['USER_ID'][user]} weights are: {best_weights_out} with appx accuracy of {accuracy:.2f}") if accuracy is not 0 else print(f"{user}. {user_rating['USER_ID'][user]} has accuracy of 0!")
    list_best_weights_out.append(best_weights_out)



  print(f"{user}. User: {user_rating['USER_ID'][user]} weights are: {best_weights_out} with appx accuracy of {accuracy:.2f}") if accuracy is not 0 else print(f"{user}. {user_rating['USER_ID'][user]} has accuracy of 0!")


0. User: 5 weights are: [0.2, 0.4, 1.01, 1.01, 1.01] with appx accuracy of 0.89
1. User: 12 weights are: [0.4, 0.4, 1.01, 1.01, 0.4] with appx accuracy of 0.44
2. User: 19 weights are: [0.4, 0.6, 1.01, 1.01, 0.8] with appx accuracy of 0.33
3. User: 24 weights are: [0.2, 0.2, 1.01, 1.01, 1.01] with appx accuracy of 0.89
4. User: 31 weights are: [0.4, 0.2, 1.01, 1.01, 0.4] with appx accuracy of 0.22
5. User: 52 weights are: [0.4, 0.6, 1.01, 1.01, 0.8] with appx accuracy of 0.33
6. User: 62 weights are: [0.2, 0.2, 1.01, 1.01, 0.2] with appx accuracy of 0.11
7. User: 63 weights are: [0.4, 0.4, 1.01, 1.01, 0.4] with appx accuracy of 0.22
8. User: 68 weights are: [0.2, 0.4, 1.01, 1.01, 1.01] with appx accuracy of 0.56
9. User: 69 weights are: [0.4, 0.2, 1.01, 1.01, 0.6] with appx accuracy of 0.33
10. User: 70 weights are: [0.4, 0.6, 1.01, 1.01, 1.01] with appx accuracy of 0.22
11. User: 71 weights are: [0.6, 0.2, 1.01, 1.01, 0.8] with appx accuracy of 0.56
12. User: 78 weights are: [0.2, 0.2

In [11]:
user_test_data = {
    user_rating['USER_ID'][i]: list_best_weights_out[i] for i in range(NUM_OF_TRAINED_USERS)
}
user_test_data

{5: [0.2, 0.4, 1.01, 1.01, 1.01],
 12: [0.4, 0.4, 1.01, 1.01, 0.4],
 19: [0.4, 0.6, 1.01, 1.01, 0.8],
 24: [0.2, 0.2, 1.01, 1.01, 1.01],
 31: [0.4, 0.2, 1.01, 1.01, 0.4],
 52: [0.4, 0.6, 1.01, 1.01, 0.8],
 62: [0.2, 0.2, 1.01, 1.01, 0.2],
 63: [0.4, 0.4, 1.01, 1.01, 0.4],
 68: [0.2, 0.4, 1.01, 1.01, 1.01],
 69: [0.4, 0.2, 1.01, 1.01, 0.6],
 70: [0.4, 0.6, 1.01, 1.01, 1.01],
 71: [0.6, 0.2, 1.01, 1.01, 0.8],
 78: [0.2, 0.2, 1.01, 1.01, 0.8],
 80: [1.1, 1.1, 1.1, 1.1, 1.1],
 90: [0.4, 0.6, 1.01, 1.01, 0.4],
 92: [0.2, 0.2, 1.01, 1.01, 1.01],
 93: [0.2, 0.4, 1.01, 1.01, 1.01],
 105: [0.2, 0.2, 1.01, 1.01, 0.8],
 106: [0.2, 0.2, 1.01, 1.01, 1.01],
 109: [0.2, 0.2, 1.01, 1.01, 0.8],
 118: [0.4, 0.6, 1.01, 1.01, 0.8],
 125: [0.2, 0.4, 1.01, 1.01, 0.8],
 131: [0.8, 0.4, 1.01, 1.01, 0.4],
 139: [0.4, 0.6, 1.01, 1.01, 0.4],
 140: [0.2, 0.4, 1.01, 1.01, 0.6],
 148: [0.2, 0.4, 1.01, 1.01, 0.8],
 149: [0.8, 0.8, 1.01, 1.01, 0.6],
 151: [0.2, 0.2, 1.01, 1.01, 1.01],
 154: [0.2, 0.4, 1.01, 1.01, 0.6

In [12]:
user_test_data_df = pd.DataFrame(user_test_data)
user_test_data_df.to_json(JSON_PATH + '\\USER_HYPER_PARAMS_old.json', indent=4)

In [13]:
user_test_data = {
    str(user_rating['USER_ID'][i]): list_best_weights_out[i]
    for i in range(NUM_OF_TRAINED_USERS)
}

import json

with open(JSON_PATH + r'\USER_HYPER_PARAMS.json', 'w') as f:
    json.dump(user_test_data, f, indent=4)

output = decision_tree(movie_id=2, all_distances=movie_distance, rating_data=user_rating.loc[user_rating["USER_ID"] == 5, "RATED"][0], threshold=[0.7, 0.7, 2, 2, 0.7])
output