# Case Study 1

In [1]:
import pandas as pd      
import numpy as np   
import pickle
file_path = "C:/Users/bodya/Downloads/Statistical_Analysis/statistical_analysis.pkl"
with open(file_path, 'rb') as f:
    data = pickle.load(f)
dataset = data['dataset']
n_u = data['number_of_ratings_for_each_user_(n_u)']
n_i = data['number_of_ratings_for_each_item_(n_i)']

r_u = data['average_rating_per_user_(r_u)']
r_i = data['average_rating_per_item_(r_i)']

ordered_items_counts = data['ordered_items_counts']

groups = data['item_rating_groups_(groups)']
group_count = data['group_count_by_index_(group_count)']
ratings_per_group = data['ratings_per_group']

targets_users = data['targets_users']
targets_items = data['targets_items']

number_of_common_users = data['number_of_common_users']
number_of_common_items = data['number_of_common_items']

I1_id = targets_items[0]
I2_id = targets_items[1]

In [6]:
centered_dataset = dataset.merge(r_u, on='user-id', how='left')
centered_dataset['rating_centered'] = centered_dataset['rating_x'] - centered_dataset['rating_y']
centered_dataset = centered_dataset.drop(columns=["rating_x","rating_y"])
centered_dataset

Unnamed: 0,user-id,item-id,rating_centered
0,A0040548BPHKXMHH3NTI,6303998240,-1.083333
1,A0040548BPHKXMHH3NTI,B000LC4ZI0,0.916667
2,A0040548BPHKXMHH3NTI,B0013J30YU,-3.083333
3,A0040548BPHKXMHH3NTI,B003GAMOLY,-0.083333
4,A0040548BPHKXMHH3NTI,B004ISM6E8,-0.083333
...,...,...,...
1523622,AZZZ159U3Q5OO,B00005JO28,-2.000000
1523623,AZZZ159U3Q5OO,B0002WYTWG,1.000000
1523624,AZZZ159U3Q5OO,B000A8NZ0O,0.000000
1523625,AZZZ159U3Q5OO,B00Z9YZRVE,0.000000


# Apply Item-Based CF using Cosine Similarity with Mean-Centering

In [10]:
item_users = dataset.groupby('item-id')['user-id'].apply(set).to_dict()
item_user_centered_ratings = centered_dataset.groupby(['item-id', 'user-id'])['rating_centered'].mean().reset_index()

def Item_Based_CF_MCS(target_item_id, item_user_ratings_df, item_users_dict):
    """
    Calculates how similar a 'target' item is to all other items.
    It uses 'mean-centered' ratings (adjusted for user optimism/pessimism).
    """
    target_users = item_users_dict.get(target_item_id, set())
    
    if len(target_users) == 0:
        return pd.DataFrame()
    
    candidate_items = set()
    for item_id, users in item_users_dict.items():
        if item_id != target_item_id and len(target_users & users) > 0:
            candidate_items.add(item_id)
    
    print(f"{target_item_id} has {len(target_users)} raters")
    print(f"{len(candidate_items)} items with common raters")
    
    if len(candidate_items) == 0:
        return pd.DataFrame()
    
    target_ratings = item_user_ratings_df[item_user_ratings_df['item-id'] == target_item_id].set_index('user-id')['rating_centered']
    
    similarities = []
    
    for candidate_id in candidate_items:
        candidate_ratings = item_user_ratings_df[item_user_ratings_df['item-id'] == candidate_id].set_index('user-id')['rating_centered']
        
        common_users = list(set(target_ratings.index) & set(candidate_ratings.index))
        
        if len(common_users) < 1:
            continue
        
        target_vec = target_ratings.loc[common_users].values
        candidate_vec = candidate_ratings.loc[common_users].values
        
        dot_product = 0.0
        for i in range(len(target_vec)):
            dot_product += target_vec[i] * candidate_vec[i]
        
        sum_of_squares_target = 0.0
        for val in target_vec:
            sum_of_squares_target += val ** 2
        norm_target = sum_of_squares_target ** 0.5 
        
        sum_of_squares_candidate = 0.0
        for val in candidate_vec:
            sum_of_squares_candidate += val ** 2
        norm_candidate = sum_of_squares_candidate ** 0.5
        
        if norm_target > 0 and norm_candidate > 0:
            similarity = dot_product / (norm_target * norm_candidate)
        else:
            similarity = 0.0
            
        similarities.append({
            'item-id': candidate_id,
            'similarity': round(similarity, 4),
            'common_users': len(common_users)
        })
    
    return pd.DataFrame(similarities).sort_values('similarity', ascending=False)

Item_1_Similarities = Item_Based_CF_MCS(I1_id, item_user_centered_ratings, item_users)
Item_2_Similarities = Item_Based_CF_MCS(I2_id, item_user_centered_ratings, item_users)

B00PCSVODW has 31 raters
526 items with common raters
B005GISDXW has 28 raters
480 items with common raters


# Identify Top 20% of Similar Items for Each Target Item

In [15]:
def Top_similar_20_percent_similar_items(Item_similar_MCS, top_percent=0.20):
    if len(Item_similar_MCS) == 0:
        return pd.DataFrame()
    n_items = max(1, int(len(Item_similar_MCS) * top_percent))
    return Item_similar_MCS.head(n_items)

In [16]:
I1_top20 = Top_similar_20_percent_similar_items(Item_1_Similarities, 0.20)
print(f"Top 20% similar items for I1 ({I1_id})")
print(f"Total similar items: {len(Item_1_Similarities)}")
print(f"Top 20% count: {len(I1_top20)}")
I1_top20

Top 20% similar items for I1 (B00PCSVODW)
Total similar items: 526
Top 20% count: 105


Unnamed: 0,item-id,similarity,common_users
0,B00005JOQB,1.0,1
193,B00AZKY0OW,1.0,1
170,B00GSTHD02,1.0,1
430,B00EDM2RN4,1.0,1
174,B001BMN35K,1.0,1
...,...,...,...
260,B005LAII94,1.0,1
1,B01CRIWR0S,1.0,1
378,B0053O8A50,1.0,1
266,B004919RHI,1.0,1


In [17]:
I2_top20 = Top_similar_20_percent_similar_items(Item_2_Similarities, 0.20)
print(f"Top 20% similar items for I1 ({I2_id})")
print(f"Total similar items: {len(Item_2_Similarities)}")
print(f"Top 20% count: {len(I2_top20)}")
I2_top20

Top 20% similar items for I1 (B005GISDXW)
Total similar items: 480
Top 20% count: 96


Unnamed: 0,item-id,similarity,common_users
144,6305006571,1.0,1
95,B000059PSE,1.0,1
349,0967501032,1.0,1
351,B00U2YNO7U,1.0,1
424,6303507689,1.0,1
...,...,...,...
276,078885996X,1.0,1
279,B00UOB45SS,1.0,1
473,078060718X,1.0,1
10,B00KDF2NL6,1.0,1


# Predict Missing Ratings Using Similar Items

In [18]:
user_item_ratings = centered_dataset.set_index(['user-id', 'item-id'])['rating_centered'].to_dict()

item_means_dict = r_i.to_dict()

def predict_rating_item_based(user_id, target_item_id, similar_items_df, item_means_dict, user_item_ratings):
    if len(similar_items_df) == 0:
        return item_means_dict.get(target_item_id, 3.0)
    
    target_mean = item_means_dict.get(target_item_id, 3.0)
    
    numerator = 0
    denominator = 0
    
    for _, row in similar_items_df.iterrows():
        similar_item_id = row['item-id']
        similarity = row['similarity']
        
        rating = user_item_ratings.get((user_id, similar_item_id), None)
        
        if rating is not None:
            similar_item_mean = item_means_dict.get(similar_item_id, 3.0)
            numerator += similarity * (rating - similar_item_mean)
            denominator += abs(similarity)
    
    if denominator > 0:
        prediction = target_mean + (numerator / denominator)
        prediction = max(1, min(5, prediction))
    else:
        prediction = target_mean
    
    return round(prediction, 2)


In [19]:
def get_users_to_predict(target_item_id, similar_items_df, ratings_df):
    users_rated_target = set(ratings_df[ratings_df['item-id'] == target_item_id]['user-id'])
    
    if len(similar_items_df) == 0:
        return []
    
    if 'item-id' in similar_items_df.columns:
        similar_item_ids = similar_items_df['item-id'].tolist()
    elif 'item_id' in similar_items_df.columns:
         similar_item_ids = similar_items_df['item_id'].tolist()
    else:
        similar_item_ids = similar_items_df.index.tolist()

    try:
        similar_item_ids = [int(x) for x in similar_item_ids]
    except:
        pass
        
    users_rated_similar = set(ratings_df[ratings_df['item-id'].isin(similar_item_ids)]['user-id'])
    
    users_to_predict = users_rated_similar - users_rated_target
    return list(users_to_predict)

I1_users_to_predict = get_users_to_predict(I1_id, I1_top20, centered_dataset)
I2_users_to_predict = get_users_to_predict(I2_id, I2_top20, centered_dataset)

print(f"Users to predict for I1: {len(I1_users_to_predict)}")
print(f"Users to predict for I2: {len(I2_users_to_predict)}")

Users to predict for I1: 22935
Users to predict for I2: 21841


In [23]:
I1_predictions = []
for user_id in I1_users_to_predict[:50]:
    pred = predict_rating_item_based(user_id, I1_id, I1_top20, item_means_dict, user_item_ratings)
    I1_predictions.append({
        'user-id': user_id,
        'item-id': I1_id,
        'predicted_rating': pred
    })
I1_predictions_df = pd.DataFrame(I1_predictions)

print(I1_id)
print(f"Number of predictions: {len(I1_predictions_df)}")
print(f"Mean predicted rating: {I1_predictions_df['predicted_rating'].mean():.2f}")
print(f"Std of predictions: {I1_predictions_df['predicted_rating'].std():.2f}")
I1_predictions_df.head(20)

B00PCSVODW
Number of predictions: 50
Mean predicted rating: 1.00
Std of predictions: 0.00


Unnamed: 0,user-id,item-id,predicted_rating
0,A13H94ACZ45D07,B00PCSVODW,1
1,AUWPA25RWPK07,B00PCSVODW,1
2,A31S41WISP6QAH,B00PCSVODW,1
3,A3BKODF178BPCV,B00PCSVODW,1
4,A2SGZTPTETK9TA,B00PCSVODW,1
5,AE67JM1D32DRZ,B00PCSVODW,1
6,A1CFHSU2FQU8RC,B00PCSVODW,1
7,A3VDCZ2Z4SZCOT,B00PCSVODW,1
8,A2908FSWRQRFEY,B00PCSVODW,1
9,A15LUHWU8SIN0E,B00PCSVODW,1


In [25]:
I2_predictions = []
for user_id in I2_users_to_predict[:50]:
    pred = predict_rating_item_based(user_id, I2_id, I2_top20, item_means_dict, user_item_ratings)
    I2_predictions.append({
        'user-id': user_id,
        'item-id': I2_id,
        'predicted_rating': pred
    })

I2_predictions_df = pd.DataFrame(I2_predictions)
print(I2_id)
print(f"Number of predictions: {len(I2_predictions_df)}")
print(f"Mean predicted rating: {I2_predictions_df['predicted_rating'].mean():.2f}")
print(f"Std of predictions: {I2_predictions_df['predicted_rating'].std():.2f}")
I2_predictions_df.head(20)

B005GISDXW
Number of predictions: 50
Mean predicted rating: 1.00
Std of predictions: 0.00


Unnamed: 0,user-id,item-id,predicted_rating
0,A35ZJY01RQSY5O,B005GISDXW,1
1,A2X1KK7IM7BA8A,B005GISDXW,1
2,AV59T7O044ZAW,B005GISDXW,1
3,A2B6RPPFW0PC7T,B005GISDXW,1
4,A268A6S63M70ZM,B005GISDXW,1
5,A2908FSWRQRFEY,B005GISDXW,1
6,A3LO76VZJYTE60,B005GISDXW,1
7,A1I87KEHTUIW7W,B005GISDXW,1
8,A38INFPI2DAUE8,B005GISDXW,1
9,A1P6RQKZ880ZQD,B005GISDXW,1


# Compute DF and DS 

In [28]:
def compute_df_ds(similar_items_df, item_means_dict):
    if len(similar_items_df) == 0:
        return pd.DataFrame()
    
    results = []
    
    for _, row in similar_items_df.iterrows():
        item_id = row['item-id']
        similarity = row['similarity']
        common_users = row['common_users']
        
        avg_rating = item_means_dict.get(item_id, 3.0)
        
        decision_function = similarity
        decision_score = similarity * avg_rating
        
        results.append({
            'item-id': item_id,
            'similarity': similarity,
            'common_users': common_users,
            'avg_rating': round(avg_rating, 2),
            'DF': round(decision_function, 4),
            'DS': round(decision_score, 4)
        })
    
    return pd.DataFrame(results).sort_values('DS', ascending=False)

In [29]:
I1_df_ds = compute_df_ds(Item_1_Similarities, item_means_dict)

print(f"DF and DS FOR 1st Target Item Similar Items({I1_id})")
I1_df_ds.head(20)

DF and DS FOR 1st Target Item Similar Items(B00PCSVODW)


Unnamed: 0,item-id,similarity,common_users,avg_rating,DF,DS
155,B00466HN86,1.0,1,4.86,1.0,4.8567
88,B00005MKOL,1.0,1,4.76,1.0,4.7595
46,B000CCW2VQ,1.0,1,4.76,1.0,4.7586
65,B000UY6OU6,1.0,1,4.74,1.0,4.7444
161,B00068NVMK,1.0,1,4.73,1.0,4.7262
102,B0053O8A50,1.0,1,4.73,1.0,4.7259
111,B002N5N5M0,1.0,1,4.69,1.0,4.6939
124,B00004RYKP,1.0,1,4.69,1.0,4.6852
115,B00465I1BA,1.0,1,4.68,1.0,4.6792
6,6300184269,1.0,1,4.66,1.0,4.6633


In [30]:
I2_df_ds = compute_df_ds(Item_2_Similarities, item_means_dict)
print(f"DF and DS FOR 2nd Target Item Similar Items({I2_id})")
I2_df_ds.head(20)


DF and DS FOR 2nd Target Item Similar Items(B005GISDXW)


Unnamed: 0,item-id,similarity,common_users,avg_rating,DF,DS
29,6304500769,1.0,1,4.8,1.0,4.8039
113,B00440FO6M,1.0,1,4.77,1.0,4.7717
119,B005LOBWA2,1.0,1,4.77,1.0,4.7671
37,B00TPL8DXQ,1.0,1,4.73,1.0,4.7313
116,6301175239,1.0,1,4.72,1.0,4.7182
7,6304162200,1.0,1,4.71,1.0,4.7123
21,B004FUYSV8,1.0,1,4.69,1.0,4.6885
111,B000UAE7IQ,1.0,1,4.68,1.0,4.681
97,0790732181,1.0,1,4.66,1.0,4.6644
107,B00006FDCP,1.0,1,4.66,1.0,4.6629


# Select Top 20% Items Using DS

In [31]:
def get_top_percent_by_ds(df_ds_results, top_percent=0.20):
    if len(df_ds_results) == 0:
        return pd.DataFrame()
    
    n_items = max(1, int(len(df_ds_results) * top_percent))
    return df_ds_results.head(n_items)

In [32]:
I1_top20_ds = get_top_percent_by_ds(I1_df_ds, 0.20)

print(f"Top 20% items using DS for 1st target item ({I1_id})")
print(f"Total items: {len(I1_df_ds)}")
print(f"Top 20% counts: {len(I1_top20_ds)}")
I1_top20_ds

Top 20% items using DS for 1st target item (B00PCSVODW)
Total items: 526
Top 20% counts: 105


Unnamed: 0,item-id,similarity,common_users,avg_rating,DF,DS
155,B00466HN86,1.0,1,4.86,1.0,4.8567
88,B00005MKOL,1.0,1,4.76,1.0,4.7595
46,B000CCW2VQ,1.0,1,4.76,1.0,4.7586
65,B000UY6OU6,1.0,1,4.74,1.0,4.7444
161,B00068NVMK,1.0,1,4.73,1.0,4.7262
...,...,...,...,...,...,...
72,B014I67SGU,1.0,1,3.69,1.0,3.6929
100,B005LAII94,1.0,1,3.69,1.0,3.6875
110,B000BBOUDG,1.0,1,3.68,1.0,3.6829
75,B00O20UHDO,1.0,1,3.67,1.0,3.6667


In [33]:
I2_top20_ds = get_top_percent_by_ds(I2_df_ds, 0.20)

print(f"Top 20% items using DS for 2nd target item({I2_id})")
print(f"Total items: {len(I2_df_ds)}")
print(f"Top 20% counts: {len(I2_top20_ds)}")
I2_top20_ds

Top 20% items using DS for 2nd target item(B005GISDXW)
Total items: 480
Top 20% counts: 96


Unnamed: 0,item-id,similarity,common_users,avg_rating,DF,DS
29,6304500769,1.0,1,4.80,1.0,4.8039
113,B00440FO6M,1.0,1,4.77,1.0,4.7717
119,B005LOBWA2,1.0,1,4.77,1.0,4.7671
37,B00TPL8DXQ,1.0,1,4.73,1.0,4.7313
116,6301175239,1.0,1,4.72,1.0,4.7182
...,...,...,...,...,...,...
59,B00G6SO7A4,1.0,1,3.52,1.0,3.5195
35,B00005JM7T,1.0,1,3.52,1.0,3.5177
83,B003XFN1Z0,1.0,1,3.48,1.0,3.4750
1,B000059PSE,1.0,1,3.43,1.0,3.4342


# Use Top 20% Items by DS for Updated Rating Predictions

In [39]:
I1_predictions_ds = []
for user_id in I1_users_to_predict_ds[:20]:
    pred = predict_rating_item_based(user_id, I1_id, I1_top20_ds, item_means_dict, user_item_ratings)
    I1_predictions_ds.append({
        'user-id': user_id,
        'item-id': I1_id,
        'predicted_rating': pred
    })

I1_predictions_ds_df = pd.DataFrame(I1_predictions_ds)

print(f"predicted ratings for 1st target item selected by DS({I1_id})")
print(f"Number of predictions: {len(I1_predictions_ds_df)}")
print(f"Mean predicted rating: {I1_predictions_ds_df['predicted_rating'].mean():.2f}")
print(f"Std of predictions: {I1_predictions_ds_df['predicted_rating'].std():.2f}")
I1_predictions_ds_df.head(10)

predicted ratings for 1st target item selected by DS(B00PCSVODW)
Number of predictions: 20
Mean predicted rating: 1.00
Std of predictions: 0.00


Unnamed: 0,user-id,item-id,predicted_rating
0,A13H94ACZ45D07,B00PCSVODW,1
1,A2X1KK7IM7BA8A,B00PCSVODW,1
2,A31S41WISP6QAH,B00PCSVODW,1
3,A3BKODF178BPCV,B00PCSVODW,1
4,A2SGZTPTETK9TA,B00PCSVODW,1
5,AUWPA25RWPK07,B00PCSVODW,1
6,A1CFHSU2FQU8RC,B00PCSVODW,1
7,A3VDCZ2Z4SZCOT,B00PCSVODW,1
8,A31LS5N5M2NV9R,B00PCSVODW,1
9,A268A6S63M70ZM,B00PCSVODW,1


In [40]:
I2_predictions_ds = []
for user_id in I2_users_to_predict_ds[:20]:
    pred = predict_rating_item_based(user_id, I2_id, I2_top20_ds, item_means_dict, user_item_ratings)
    I2_predictions_ds.append({
        'user-id': user_id,
        'item-id': I2_id,
        'predicted_rating': pred
    })

I2_predictions_ds_df = pd.DataFrame(I2_predictions_ds)

print(f"predicted ratings for 2nd target item selected by DS({I2_id})")
print(f"Number of predictions: {len(I2_predictions_ds_df)}")
print(f"Mean predicted rating: {I2_predictions_ds_df['predicted_rating'].mean():.2f}")
print(f"Std of predictions: {I2_predictions_ds_df['predicted_rating'].std():.2f}")
I2_predictions_ds_df.head(20)

predicted ratings for 2nd target item selected by DS(B005GISDXW)
Number of predictions: 20
Mean predicted rating: 1.00
Std of predictions: 0.00


Unnamed: 0,user-id,item-id,predicted_rating
0,A35ZJY01RQSY5O,B005GISDXW,1
1,A2X1KK7IM7BA8A,B005GISDXW,1
2,A2B6RPPFW0PC7T,B005GISDXW,1
3,A268A6S63M70ZM,B005GISDXW,1
4,A2908FSWRQRFEY,B005GISDXW,1
5,A3LO76VZJYTE60,B005GISDXW,1
6,A4N1VU60B7DGR,B005GISDXW,1
7,A38INFPI2DAUE8,B005GISDXW,1
8,A1I87KEHTUIW7W,B005GISDXW,1
9,A1P6RQKZ880ZQD,B005GISDXW,1


# Case Study 2

# Compute Cosine similarity using raw ratings (no mean-centering)

In [None]:
item_user_raw_ratings = dataset.groupby(['item-id', 'user-id'])['rating'].mean().reset_index()

def Item_Based_CF_RCS(target_item_id, item_user_ratings_df, item_users_dict):
    """
    Compute cosine similarity using RAW ratings (no mean-centering).
    """
    target_users = item_users_dict.get(target_item_id, set())
    
    if len(target_users) == 0:
        return pd.DataFrame()
    
    candidate_items = set()
    for item_id, users in item_users_dict.items():
        if item_id != target_item_id and len(target_users & users) > 0:
            candidate_items.add(item_id)
    
    print(f"{target_item_id} has {len(target_users)} raters")
    print(f"{len(candidate_items)} items with common raters")
    
    if len(candidate_items) == 0:
        return pd.DataFrame()
    
    target_ratings = item_user_ratings_df[item_user_ratings_df['item-id'] == target_item_id].set_index('user-id')['rating']
    
    similarities = []
    
    for candidate_id in candidate_items:
        candidate_ratings = item_user_ratings_df[item_user_ratings_df['item-id'] == candidate_id].set_index('user-id')['rating']
        common_users = list(set(target_ratings.index) & set(candidate_ratings.index))
        
        if len(common_users) < 1:
            continue
        
        target_vec = target_ratings.loc[common_users].values
        candidate_vec = candidate_ratings.loc[common_users].values
        
        dot_product = 0.0
        for i in range(len(target_vec)):
            dot_product += target_vec[i] * candidate_vec[i]
        
        sum_sq_target = 0.0
        for val in target_vec:
            sum_sq_target += val ** 2
        norm_target = sum_sq_target ** 0.5 
        
        sum_sq_candidate = 0.0
        for val in candidate_vec:
            sum_sq_candidate += val ** 2
        norm_candidate = sum_sq_candidate ** 0.5
        
        if norm_target > 0 and norm_candidate > 0:
            similarity = dot_product / (norm_target * norm_candidate)
        else:
            similarity = 0.0
        
        similarities.append({
            'item-id': candidate_id,
            'similarity': round(similarity, 4),
            'common_users': len(common_users)
        })
    
    return pd.DataFrame(similarities).sort_values('similarity', ascending=False)

In [44]:
Target_Item_1_similarities = Item_Based_CF_RCS(I1_id, item_user_raw_ratings, item_users)
print(f"First Target Item ({I1_id}): {len(Target_Item_1_similarities)} similar items found")
Target_Item_1_similarities.head(10)

Target item B00PCSVODW has 31 raters
Found 526 items with common raters
First Target Item (B00PCSVODW): 526 similar items found


Unnamed: 0,item-id,similarity,common_users
0,B00005JOQB,1.0,1
353,B005C4444C,1.0,1
350,B00IKM5L6I,1.0,1
349,B00005QTAU,1.0,1
348,1563452685,1.0,1
347,B000RN87CS,1.0,1
346,B00PV09C7G,1.0,1
345,B00IQ8TT0K,1.0,1
344,B00AIZ282U,1.0,1
343,B00JC5ED7A,1.0,1


In [45]:
Target_Item_2_similarities = Item_Based_CF_RCS(I2_id, item_user_raw_ratings, item_users)
print(f"2nd Target Item ({I1_id}): {len(Target_Item_2_similarities)} similar items found")
Target_Item_2_similarities.head(10)

Target item B005GISDXW has 28 raters
Found 480 items with common raters
2nd Target Item (B00PCSVODW): 480 similar items found


Unnamed: 0,item-id,similarity,common_users
0,B000JRYOPG,1.0,1
308,B0048LPRCS,1.0,1
320,B0158W4NT8,1.0,1
319,6305949638,1.0,1
318,B00AF6B22E,1.0,1
317,B002M0HOV4,1.0,1
316,B00005JL8Z,1.0,1
315,B000FO0AHO,1.0,1
314,B00000JQVY,1.0,1
313,B00O2IZPD8,1.0,1


# Top 20% by similarity

In [48]:
user_item_raw_ratings = dataset.set_index(['user-id', 'item-id'])['rating'].to_dict()

CS2_I1_top20_sim = Top_similar_20_percent_similar_items(Target_Item_1_similarities, 0.20)
CS2_I2_top20_sim = Top_similar_20_percent_similar_items(Target_Item_2_similarities, 0.20)

def predict_rating_no_mean_centering(user_id, target_item_id, similar_items_df, raw_ratings_dict):
    """
    Predict rating WITHOUT mean-centering (weighted average of raw ratings).
    """
    if len(similar_items_df) == 0:
        return 3.0
    
    numerator = 0
    denominator = 0
    
    for _, row in similar_items_df.iterrows():
        similar_item_id = row['item-id']
        similarity = row['similarity']
        
        rating = raw_ratings_dict.get((user_id, similar_item_id), None)
        
        if rating is not None:
            numerator += similarity * rating
            denominator += abs(similarity)
    
    if denominator > 0:
        prediction = numerator / denominator
        return max(1, min(5, round(prediction, 2)))
    return 3.0

CS2_I1_users = get_users_to_predict(I1_id, CS2_I1_top20_sim, dataset)
CS2_I2_users = get_users_to_predict(I2_id, CS2_I2_top20_sim, dataset)

CS2_I1_preds_sim = []
for user_id in CS2_I1_users[:50]:
    pred = predict_rating_no_mean_centering(user_id, I1_id, CS2_I1_top20_sim, user_item_raw_ratings)
    CS2_I1_preds_sim.append({'user-id': user_id, 'item-id': I1_id, 'predicted_rating': pred})
CS2_I1_preds_sim_df = pd.DataFrame(CS2_I1_preds_sim)

CS2_I2_preds_sim = []
for user_id in CS2_I2_users[:50]:
    pred = predict_rating_no_mean_centering(user_id, I2_id, CS2_I2_top20_sim, user_item_raw_ratings)
    CS2_I2_preds_sim.append({'user-id': user_id, 'item-id': I2_id, 'predicted_rating': pred})
CS2_I2_preds_sim_df = pd.DataFrame(CS2_I2_preds_sim)

# Compute DF and DS

In [54]:
def compute_df_ds(similar_items_df, item_means_dict):
    if len(similar_items_df) == 0:
        return pd.DataFrame()
    
    results = []
    
    for _, row in similar_items_df.iterrows():
        item_id = row['item-id']
        similarity = row['similarity']
        common_users = row['common_users']
        
        avg_rating = item_means_dict.get(item_id, 3.0)
        
        decision_function = similarity
        decision_score = similarity * avg_rating
        
        results.append({
            'item-id': item_id,
            'similarity': similarity,
            'common_users': common_users,
            'avg_rating': round(avg_rating, 2),
            'DF': round(decision_function, 4),
            'DS': round(decision_score, 4)
        })
    
    return pd.DataFrame(results).sort_values('DS', ascending=False)

# Predict Missing Items

In [59]:
if not CS2_I1_preds_sim_df.empty:
    print(f"  I1: Mean={CS2_I1_preds_sim_df['predicted_rating'].mean():.2f}, Std={CS2_I1_preds_sim_df['predicted_rating'].std():.2f}")
if not CS2_I2_preds_sim_df.empty:
    print(f"  I2: Mean={CS2_I2_preds_sim_df['predicted_rating'].mean():.2f}, Std={CS2_I2_preds_sim_df['predicted_rating'].std():.2f}")

I1_ds = compute_df_ds(Target_Item_1_similarities, item_means_dict)
I2_ds = compute_df_ds(Target_Item_2_similarities, item_means_dict)

I1_top20_ds = get_top_percent_by_ds(I1_ds)
I2_top20_ds = get_top_percent_by_ds(I2_ds)

  I1: Mean=3.83, Std=1.42
  I2: Mean=4.12, Std=1.25


In [60]:
I1_ds

Unnamed: 0,item-id,similarity,common_users,avg_rating,DF,DS
304,B00ZE500TM,1.0000,1,4.97,1.0000,4.9714
330,B0027VST2Q,1.0000,1,4.91,1.0000,4.9077
170,B001TOD6N4,1.0000,1,4.90,1.0000,4.8992
208,B00D7AM2PQ,1.0000,1,4.90,1.0000,4.8958
11,B00S4YGWAQ,1.0000,1,4.87,1.0000,4.8702
...,...,...,...,...,...,...
17,B000URDEAG,1.0000,1,2.20,1.0000,2.2000
492,B00KVFHBLK,0.9701,2,2.21,0.9701,2.1462
154,B00008G640,1.0000,1,2.07,1.0000,2.0652
398,B001JQHT6M,1.0000,1,2.03,1.0000,2.0282


In [61]:
I1_top20_ds

Unnamed: 0,item-id,similarity,common_users,avg_rating,DF,DS
304,B00ZE500TM,1.0,1,4.97,1.0,4.9714
330,B0027VST2Q,1.0,1,4.91,1.0,4.9077
170,B001TOD6N4,1.0,1,4.90,1.0,4.8992
208,B00D7AM2PQ,1.0,1,4.90,1.0,4.8958
11,B00S4YGWAQ,1.0,1,4.87,1.0,4.8702
...,...,...,...,...,...,...
65,B002ZG9774,1.0,1,4.55,1.0,4.5530
266,B000051SHA,1.0,1,4.55,1.0,4.5493
427,B00X7SIALI,1.0,1,4.54,1.0,4.5382
159,B0092QDMQ2,1.0,1,4.54,1.0,4.5379


In [62]:
I2_ds

Unnamed: 0,item-id,similarity,common_users,avg_rating,DF,DS
211,B00MU1YEWW,1.0000,1,4.92,1.0000,4.9167
206,B000PMFO3Q,1.0000,1,4.89,1.0000,4.8922
397,B018TNBX50,1.0000,1,4.88,1.0000,4.8800
433,6300215695,1.0000,1,4.86,1.0000,4.8608
108,B001UHOWX8,1.0000,1,4.86,1.0000,4.8576
...,...,...,...,...,...,...
137,B019JNOEQO,1.0000,1,2.03,1.0000,2.0337
246,B00JWS9FYS,1.0000,1,2.02,1.0000,2.0241
310,B00KDF2NL6,1.0000,1,2.01,1.0000,2.0142
464,B00ZL4Q7NE,0.9075,4,2.04,0.9075,1.8468


In [63]:
I2_top20_ds

Unnamed: 0,item-id,similarity,common_users,avg_rating,DF,DS
211,B00MU1YEWW,1.0,1,4.92,1.0,4.9167
206,B000PMFO3Q,1.0,1,4.89,1.0,4.8922
397,B018TNBX50,1.0,1,4.88,1.0,4.8800
433,6300215695,1.0,1,4.86,1.0,4.8608
108,B001UHOWX8,1.0,1,4.86,1.0,4.8576
...,...,...,...,...,...,...
265,B0024396EW,1.0,1,4.57,1.0,4.5700
220,B0018CWEVW,1.0,1,4.57,1.0,4.5667
163,B00CWZUOV6,1.0,1,4.56,1.0,4.5635
346,B01AYMA95Y,1.0,1,4.56,1.0,4.5556


# Top 20% by DS

In [71]:
I1_users_ds = get_users_to_predict(I1_id, I1_top20_ds, dataset)
I1_preds_ds = []
for user_id in I1_users_ds[:50]:
    pred = predict_rating_no_mean_centering(user_id, I1_id, I1_top20_ds, user_item_raw_ratings)
    I1_preds_ds.append({'user-id': user_id, 'item-id': I1_id, 'predicted_rating': pred})
I1_preds_ds_df = pd.DataFrame(I1_preds_ds)


In [72]:
I2_users_ds = get_users_to_predict(I2_id, I2_top20_ds, dataset)
I2_preds_ds = []
for user_id in I2_users_ds[:50]:
    pred = predict_rating_no_mean_centering(user_id, I2_id, I2_top20_ds, user_item_raw_ratings)
    I2_preds_ds.append({'user-id': user_id, 'item-id': I2_id, 'predicted_rating': pred})
I2_preds_ds_df = pd.DataFrame(I2_preds_ds)

# Use for Updated Rating Predictions

In [75]:
print(f"  I1: Mean={I1_preds_ds_df['predicted_rating'].mean():.2f}, Std={I1_preds_ds_df['predicted_rating'].std():.2f}")
print(f"  I2: Mean={I2_preds_ds_df['predicted_rating'].mean():.2f}, Std={I2_preds_ds_df['predicted_rating'].std():.2f}")

  I1: Mean=4.47, Std=1.14
  I2: Mean=4.54, Std=1.03


In [77]:
I1_preds_ds_df.head(20)

Unnamed: 0,user-id,item-id,predicted_rating
0,A13H94ACZ45D07,B00PCSVODW,5.0
1,A1A4D13OKB9IQ2,B00PCSVODW,5.0
2,A3BKODF178BPCV,B00PCSVODW,5.0
3,A2SGZTPTETK9TA,B00PCSVODW,5.0
4,A2QE21MWUXNWQF,B00PCSVODW,4.0
5,A31V2WGUOSZ0RN,B00PCSVODW,5.0
6,A12XTUQOZHDXEX,B00PCSVODW,1.0
7,A31LS5N5M2NV9R,B00PCSVODW,5.0
8,AVPF648EJJMUW,B00PCSVODW,5.0
9,A36CQX6VHHDZJL,B00PCSVODW,5.0


In [78]:
I2_preds_ds_df.head(20)

Unnamed: 0,user-id,item-id,predicted_rating
0,A35ZJY01RQSY5O,B005GISDXW,5.0
1,A1A4D13OKB9IQ2,B005GISDXW,5.0
2,A31S41WISP6QAH,B005GISDXW,5.0
3,AV59T7O044ZAW,B005GISDXW,5.0
4,AE67JM1D32DRZ,B005GISDXW,5.0
5,A31V2WGUOSZ0RN,B005GISDXW,5.0
6,A3DMFX35UUZS1T,B005GISDXW,5.0
7,A1G01AXL9HLT3G,B005GISDXW,1.0
8,A31LS5N5M2NV9R,B005GISDXW,5.0
9,A29C1NPC9894ER,B005GISDXW,5.0
