## 加权评分

====================> 生成dat文件开始

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np
from collections import defaultdict

# 读取 Excel 文件并处理数据
file_path = '/Users/pangmengting/Documents/会员订单数据2.xlsx'  # 请将此路径替换为实际的文件路径
df = pd.read_excel(file_path)

# 统计每个用户购买每个主题、品类和标签的次数
user_theme_count = df.groupby(['user_id', 'theme_id']).size().reset_index(name='theme_count')
user_category_count = df.groupby(['user_id', 'type_id']).size().reset_index(name='category_count')
# user_tag_count = df.groupby(['user_id', 'product_tags']).size().reset_index(name='tag_count')

# 设置权重比例
theme_weight = 0.2
category_weight = 0.2
# tag_weight = 0.2
view_weight = 0.3
favorites_weight = 0.2
time_decay_weight = 0.1


# 时间衰减因子的计算函数
def time_decay_factor(order_date, current_date):
    days_diff = (current_date - order_date).days
    return np.exp(-time_decay_weight * days_diff)


# 获取当前日期
current_date = pd.to_datetime(pd.Timestamp.now().date())

# 标准化 view_count 和 favorites ，使其值在 [0, 1] 之间。
scaler = MinMaxScaler()
df[['view_count', 'favorites']] = scaler.fit_transform(df[['view_count', 'favorites']])

# 创建字典存储用户对每个产品的评分
user_product_ratings = defaultdict(lambda: defaultdict(int))

# 加权计算用户对每个产品的评分
for _, row in df.iterrows():
    user_id = row['user_id']
    product_id = row['product_id']
    product_theme = row['theme_id']
    product_category = row['type_id']
    # product_tags = row['product_tags']
    view_count = row['view_count']
    favorites = row['favorites']
    order_date = pd.to_datetime(row['order_date'])

    theme_count = \
        user_theme_count[
            (user_theme_count['user_id'] == user_id) & (user_theme_count['theme_id'] == product_theme)][
            'theme_count'].values
    category_count = user_category_count[
        (user_category_count['user_id'] == user_id) & (user_category_count['type_id'] == product_category)][
        'category_count'].values
    # tag_count = \
    #     user_tag_count[(user_tag_count['user_id'] == user_id) & (user_tag_count['product_tags'] == product_tags)][
    #         'tag_count'].values

    theme_count = theme_count[0] if len(theme_count) > 0 else 0
    category_count = category_count[0] if len(category_count) > 0 else 0
    # tag_count = tag_count[0] if len(tag_count) > 0 else 0

    # 计算时间衰减因子
    decay_factor = time_decay_factor(order_date, current_date)

    # 假设我们使用简单的加权和来计算评分
    # weighted_score = theme_count * theme_weight + category_count * category_weight + tag_count * tag_weight
    # 加权计算用户对每个产品的评分
    # weighted_score = (theme_count * theme_weight +
    #                   category_count * category_weight +
    #                   view_count * view_weight +
    #                   favorites * favorites_weight)

    # 加入时间因子，加权评分
    weighted_score = (theme_count * theme_weight +
                      category_count * category_weight +
                      view_count * view_weight +
                      favorites * favorites_weight
                      ) * decay_factor

    user_product_ratings[user_id][product_id] = weighted_score

In [13]:
# 转换为DataFrame格式
ratings_list = []
for user_id, products in user_product_ratings.items():
    for product_id, rating in products.items():
        ratings_list.append([user_id, product_id, rating])

ratings_df = pd.DataFrame(ratings_list, columns=['user_id', 'product_id', 'rating'])

In [14]:
ratings_df.head()

Unnamed: 0,user_id,product_id,rating
0,10674015,15509,1.5
1,10674015,15505,1.50035
2,10674015,26058,1.503922
3,11365761,17107,3.000002
4,11365761,32456,3.000496


In [15]:
# 将评分归一化到1-10的范围内
scaler = MinMaxScaler(feature_range=(0, 5))
ratings_df['rating'] = scaler.fit_transform(ratings_df[['rating']])
ratings_df['rating'] = ratings_df['rating'].round(4)

# 确保 user_id 和 product_id 为整数类型
ratings_df['user_id'] = ratings_df['user_id'].astype(int)
ratings_df['product_id'] = ratings_df['product_id'].astype(int)

In [16]:
ratings_df.head()

Unnamed: 0,user_id,product_id,rating
0,10674015,15509,0.09
1,10674015,15505,0.09
2,10674015,26058,0.09
3,11365761,17107,0.22
4,11365761,32456,0.22


In [17]:
output_file_path = '../file/output3.dat'
with open(output_file_path, 'w') as f:
    for index, row in ratings_df.iterrows():
        f.write(f"{int(row['user_id'])}::{int(row['product_id'])}::{row['rating']}\n")

print(f"Data processing complete. Output saved to {output_file_path}.")

Data processing complete. Output saved to ../file/output3.dat.


====================> 生成dat文件结束

In [ ]:
import pandas as pd

# Read the Excel file
file_path = '/Users/pangmengting/Documents/会员订单数据2.xlsx'  # Change this to your actual file path
df = pd.read_excel(file_path)

# Process the data to get user_id, product_id, and count
result_df = df.groupby(['user_id', 'product_id']).size().reset_index(name='count')

# Save the result in the specified format
output_file_path = '../file/output3.dat'
with open(output_file_path, 'w') as f:
    for index, row in result_df.iterrows():
        f.write(f"{row['user_id']}::{row['product_id']}::{row['count']}\n")

print(f"Data processing complete. Output saved to {output_file_path}.")

In [None]:
# # 将评分结果保存为文件
# ratings_df.to_csv('../file/output2.dat', sep='::', index=False, header=False)

In [2]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Example data
data = np.array([[112], [2123], [3123], [412], [512], [63], [7], [8123], [912], [1034]])

scaler = MinMaxScaler(feature_range=(1, 10))

scaled_data = scaler.fit_transform(data)

# Print results
print("Scaled Data:\n", scaled_data)

Scaled Data:
 [[ 1.11643667]
 [ 3.3464761 ]
 [ 4.45539675]
 [ 1.44911286]
 [ 1.56000493]
 [ 1.06209956]
 [ 1.        ]
 [10.        ]
 [ 2.00357319]
 [ 2.13886151]]


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

texts = ["123 414 one", "text data two"]
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(texts)

texts2 = ["123 414 one", "12321 12312 34534"]
tfidf_matrix2 = vectorizer.fit_transform(texts2)

similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix2)
print(similarity_matrix)


[[0.33333333 0.66666667]
 [0.66666667 0.33333333]]


In [2]:
import pandas as pd

current_date = pd.to_datetime(pd.Timestamp.now().date())
current_date

Timestamp('2024-08-07 00:00:00')

In [3]:
current_date = pd.to_datetime('today')
current_date

Timestamp('2024-08-07 15:38:41.630525')