In [31]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, mean_absolute_error

## 数据准备与预处理

In [32]:
import pandas as pd

# 导入数据
data_path = 'Movie_Ratings.csv'
df = pd.read_csv(data_path)

# 查看数据结构和前几行
print(df.head())
print(df.info())


     Unnamed: 0  Patrick C  Heather  Bryan  Patrick T  Thomas  aaron  vanessa  \
0         Alien        NaN      NaN    2.0        NaN     5.0    4.0      NaN   
1        Avatar        4.0      5.0    5.0        4.0     2.0    NaN      4.0   
2  Blade Runner        5.0      NaN    NaN        NaN     5.0    4.0      NaN   
3    Braveheart        4.0      NaN    5.0        NaN     4.0    4.0      3.0   
4     Dodgeball        5.0      4.0    3.0        2.0     4.0    NaN      4.0   

   greg  brian  ...  Zak  Matt  Chris.1  Josh  Amy  Valerie  Gary  Stephen  \
0   NaN    4.0  ...  NaN   NaN      4.0   3.0  NaN      NaN   2.0      5.0   
1   3.0    NaN  ...  5.0   NaN      NaN   4.0  3.0      2.0   1.0      4.0   
2   1.0    5.0  ...  NaN   NaN      3.0   NaN  3.0      3.0   1.0      NaN   
3   4.0    4.0  ...  5.0   NaN      4.0   NaN  3.0      4.0   5.0      5.0   
4   5.0    3.0  ...  3.0   NaN      3.0   NaN  4.0      3.0   4.0      3.0   

   Jessica  Jeff  
0      NaN   4.0  
1     

In [33]:
# 查看数据结构和前几行
print(df.head())
print(df.columns)

     Unnamed: 0  Patrick C  Heather  Bryan  Patrick T  Thomas  aaron  vanessa  \
0         Alien        NaN      NaN    2.0        NaN     5.0    4.0      NaN   
1        Avatar        4.0      5.0    5.0        4.0     2.0    NaN      4.0   
2  Blade Runner        5.0      NaN    NaN        NaN     5.0    4.0      NaN   
3    Braveheart        4.0      NaN    5.0        NaN     4.0    4.0      3.0   
4     Dodgeball        5.0      4.0    3.0        2.0     4.0    NaN      4.0   

   greg  brian  ...  Zak  Matt  Chris.1  Josh  Amy  Valerie  Gary  Stephen  \
0   NaN    4.0  ...  NaN   NaN      4.0   3.0  NaN      NaN   2.0      5.0   
1   3.0    NaN  ...  5.0   NaN      NaN   4.0  3.0      2.0   1.0      4.0   
2   1.0    5.0  ...  NaN   NaN      3.0   NaN  3.0      3.0   1.0      NaN   
3   4.0    4.0  ...  5.0   NaN      4.0   NaN  3.0      4.0   5.0      5.0   
4   5.0    3.0  ...  3.0   NaN      3.0   NaN  4.0      3.0   4.0      3.0   

   Jessica  Jeff  
0      NaN   4.0  
1     

In [34]:
# 将第一列作为项目名称，其他列作为用户的评分
df.set_index('Unnamed: 0', inplace=True)
df.index.name = 'Movie'

In [35]:
# 转置数据，使其成为用户-物品矩阵
user_item_matrix = df.T

In [36]:
# 使用KNN填充缺失值
imputer = KNNImputer(n_neighbors=5)
user_item_matrix_filled = pd.DataFrame(imputer.fit_transform(user_item_matrix), index=user_item_matrix.index, columns=user_item_matrix.columns)


In [37]:
# 计算用户之间的余弦相似度
user_similarity = cosine_similarity(user_item_matrix_filled)

In [38]:
# 将相似度矩阵转换为DataFrame，便于后续处理
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix_filled.index, columns=user_item_matrix_filled.index)

print(user_similarity_df.head())

           Patrick C   Heather     Bryan  Patrick T    Thomas     aaron  \
Patrick C   1.000000  0.962423  0.960502   0.930568  0.968409  0.958182   
Heather     0.962423  1.000000  0.967554   0.938765  0.949125  0.956857   
Bryan       0.960502  0.967554  1.000000   0.958944  0.919359  0.957333   
Patrick T   0.930568  0.938765  0.958944   1.000000  0.931274  0.930444   
Thomas      0.968409  0.949125  0.919359   0.931274  1.000000  0.933432   

            vanessa      greg     brian       ben  ...       Zak      Matt  \
Patrick C  0.972070  0.949084  0.973332  0.965722  ...  0.954913  0.955020   
Heather    0.967259  0.962890  0.963833  0.951389  ...  0.970154  0.971034   
Bryan      0.948555  0.940955  0.946737  0.940985  ...  0.972729  0.956214   
Patrick T  0.933036  0.925473  0.924237  0.927611  ...  0.941079  0.939556   
Thomas     0.956296  0.940330  0.954009  0.948409  ...  0.917712  0.935420   

            Chris.1      Josh       Amy   Valerie      Gary   Stephen  \
Patrick

In [39]:
def get_recommendations(user_id, user_item_matrix, user_similarity_df, k=5):
    # 获取与目标用户相似的前k个用户
    similar_users = user_similarity_df[user_id].sort_values(ascending=False).index[1:k+1]
    
    # 收集相似用户的评分
    similar_users_ratings = user_item_matrix.loc[similar_users]
    
    # 计算推荐评分（加权平均评分）
    weights = user_similarity_df.loc[similar_users, user_id].values
    weighted_ratings = np.dot(similar_users_ratings.T, weights) / weights.sum()
    
    # 转换为DataFrame
    weighted_ratings_df = pd.DataFrame(weighted_ratings, index=user_item_matrix.columns, columns=['score'])
    
    # 去除目标用户已经评分的电影
    rated_movies = user_item_matrix.loc[user_id][user_item_matrix.loc[user_id] > 0].index
    recommended_movies = weighted_ratings_df[~weighted_ratings_df.index.isin(rated_movies)]
    
    # 排序并选择前n个电影
    recommended_movies = recommended_movies.sort_values(by='score', ascending=False).head(k)
    
    return recommended_movies

# 示例：为用户'Patrick C'推荐电影
recommendations = get_recommendations('Patrick C', user_item_matrix_filled, user_similarity_df)
print(recommendations)

Empty DataFrame
Columns: [score]
Index: []


In [40]:
def evaluate_recommendations(test_set, user_item_matrix, user_similarity_df, k=5):
    actual_ratings = []
    predicted_ratings = []
    
    for user_id in test_set.index:
        recommendations = get_recommendations(user_id, user_item_matrix, user_similarity_df, k)
        for movie_id, score in recommendations.iterrows():
            actual_rating = test_set.loc[user_id, movie_id]
            if not np.isnan(actual_rating):
                actual_ratings.append(actual_rating)
                predicted_ratings.append(score['score'])
    
    rmse = mean_squared_error(actual_ratings, predicted_ratings, squared=False)
    mae = mean_absolute_error(actual_ratings, predicted_ratings)
    
    return rmse, mae

# 将数据分为训练集和测试集
train_set = user_item_matrix.sample(frac=0.8, random_state=42)
test_set = user_item_matrix.drop(train_set.index)

# 使用训练集重新计算用户相似度
imputer = KNNImputer(n_neighbors=5)
user_item_matrix_train_filled = pd.DataFrame(imputer.fit_transform(train_set), index=train_set.index, columns=train_set.columns)
user_similarity_df_train = pd.DataFrame(cosine_similarity(user_item_matrix_train_filled), index=user_item_matrix_train_filled.index, columns=user_item_matrix_train_filled.index)

# 评估推荐系统
rmse, mae = evaluate_recommendations(test_set, user_item_matrix_train_filled, user_similarity_df_train)
print(f'RMSE: {rmse}, MAE: {mae}')


KeyError: 'vanessa'