In [74]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score

# 读取电影数据
movies = pd.read_csv('ml-latest-small/movies.csv')

# 读取评级数据
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# 创建一个新的数据框，将电影和评级数据合并
data = pd.merge(movies, ratings, on='movieId')

# 使用TF-IDF向量化电影的特征
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['genres'])

# 将评级数据分成训练集和测试集
train = data.sample(frac=0.8, random_state=1)
test = data.drop(train.index)

# 训练KNN回归模型并进行交叉验证
knn = KNeighborsRegressor(n_neighbors=5)
scores = cross_val_score(knn, X, train['rating'], cv=5)

# 输出平均交叉验证精度
print('Mean CV score:', scores.mean())

ValueError: Found input variables with inconsistent numbers of samples: [100836, 80669]

In [76]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# 读取电影数据
movies = pd.read_csv('ml-latest-small/movies.csv')

# 读取评级数据
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# 将评级数据分成训练集和测试集
train_ratings = ratings.sample(frac=0.8, random_state=1)
test_ratings = ratings.drop(train_ratings.index)

# 创建一个新的数据框，将电影和评级数据合并
train_data = pd.merge(movies, train_ratings, on='movieId')
test_data = pd.merge(movies, test_ratings, on='movieId')

# 使用TF-IDF向量化电影的特征
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_data['genres'])
X_test = vectorizer.transform(test_data['genres'])

# 训练KNN回归模型并进行交叉验证
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, train_data['rating'])
y_pred = knn.predict(X_test)

# 输出平均交叉验证精度和均方根误差
cv_scores = cross_val_score(knn, X_train, train_data['rating'], cv=5)
print('Mean CV score:', cv_scores.mean())
print('RMSE:', mean_squared_error(test_data['rating'], y_pred, squared=False))

Mean CV score: -0.29424166869990065
RMSE: 1.1510068548396815


In [77]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# 读取电影数据
movies = pd.read_csv('ml-latest-small/movies.csv')

# 读取评级数据
ratings = pd.read_csv('ml-latest-small/ratings.csv')

# 使用电影ID将电影和评级数据合并
data = pd.merge(movies, ratings, on='movieId')

# 使用TF-IDF向量化电影的特征
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(data['genres'])

# 训练KNN模型
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(X)

# 获取每个电影的邻居
neighbors = knn.kneighbors(X, n_neighbors=30, return_distance=False)

# 将邻居矩阵转换为DataFrame
neighbors_df = pd.DataFrame(neighbors, index=data.index)

# 将评级数据按用户ID分成训练集和测试集
train = ratings.sample(frac=0.8, random_state=1)
test = ratings.drop(train.index)

# 在测试集上计算精度和召回率
for n in [5, 10, 15, 20, 25, 30]:
    precision_sum = 0
    recall_sum = 0
    count = 0
    for user_id in test['userId'].unique():
        user_ratings = test[test['userId'] == user_id]
        if len(user_ratings) == 0:
            continue
        user_movies = user_ratings.sort_values('rating', ascending=False)['movieId'][:n]
        relevant_movies = set(user_movies)
        recommended_movies = set(neighbors_df.loc[user_ratings.index, :n].values.reshape(-1))
        common_movies = relevant_movies.intersection(recommended_movies)
        precision = len(common_movies) / n
        recall = len(common_movies) / len(relevant_movies)
        precision_sum += precision
        recall_sum += recall
        count += 1
    precision = precision_sum / count
    recall = recall_sum / count
    print('TOP', n)
    print('Precision:', precision)
    print('Recall:', recall)


TOP 5
Precision: 0.0
Recall: 0.0
TOP 10
Precision: 0.00016420361247947455
Recall: 0.00016420361247947455
TOP 15
Precision: 0.0004378762999452655
Recall: 0.0004378762999452655
TOP 20
Precision: 0.0006568144499178982
Recall: 0.0006568144499178982
TOP 25
Precision: 0.001050903119868637
Recall: 0.001050903119868637
TOP 30
Precision: 0.0010399562123700053
Recall: 0.0010548127296895768


In [83]:
import pandas as pd
# 加载数据
df = pd.read_csv('ml-100k/u.data', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'],
                 encoding='latin-1')
# 计算每部电影的平均评分
item_mean_rating = df.groupby('item_id')['rating'].mean()
# 根据电影内容特征，计算电影之间的相似度
from sklearn.metrics.pairwise import cosine_similarity
item_features = pd.read_csv('ml-100k/u.item', sep='|',
                            encoding='latin-1', names=['item_id', 'title', 'release_date', 'video_release_date',
                                                              'imdb_url', 'unknown', 'action', 'adventure', 'animation',
                                                              'childrens', 'comedy', 'crime', 'documentary', 'drama',
                                                              'fantasy', 'film_noir', 'horror', 'musical', 'mystery',
                                                              'romance', 'sci_fi', 'thriller', 'war', 'western'])
item_features = item_features.drop(columns=['title', 'release_date', 'video_release_date', 'imdb_url', 'unknown'])
item_similarity = cosine_similarity(item_features.values)
# 计算每个用户的推荐列表
from tqdm import tqdm
user_recs = {}
for user_id in tqdm(df['user_id'].unique()):
    user_items = df[df['user_id'] == user_id]['item_id'].values
    user_sim_items = []
    for item_id in user_items:
        sim_scores = list(enumerate(item_similarity[item_id-1]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:11]
        user_sim_items += [i[0] for i in sim_scores]
    user_sim_items = list(set(user_sim_items))
    user_recs[user_id] = [(item, item_mean_rating[item]) for item in user_sim_items]
# 推荐TOPN的precision和recall
from sklearn.metrics import precision_score, recall_score


100%|██████████| 943/943 [00:34<00:00, 27.05it/s]


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [85]:
def precision_recall(user_recs, test_df, topn):
    user_precision = []
    user_recall = []
    for user_id in user_recs.keys():
        # 取出该用户的测试集
        test_items = test_df[test_df['user_id'] == user_id]['item_id'].values
        # 取出topn的推荐列表
        rec_items = [i[0] for i in sorted(user_recs[user_id], key=lambda x: x[1], reverse=True)[:topn]]
        # 计算precision
        user_precision.append(precision_score(test_items, rec_items, average='micro'))
        # 计算recall
        user_recall.append(recall_score(test_items, rec_items, average='micro'))
    return (sum(user_precision) / len(user_precision), sum(user_recall) / len(user_recall))

# 加载测试集
test_df = pd.read_csv('ml-100k/ua.test', sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp'])
# 计算precision和recall
topn = 10
precision, recall = precision_recall(user_recs, test_df, topn)
print('topn = %d时，precision = %.3f, recall = %.3f' % (topn, precision, recall))

topn = 10时，precision = 0.002, recall = 0.002
