In [11]:
import pandas as pd

def dat_to_csv_with_encoding(input_file, output_file):
    """
    尝试多种编码格式读取.dat文件
    """
    # 常见的编码格式列表
    encodings = ['utf-8', 'gbk', 'gb2312', 'latin1', 'cp1252', 'iso-8859-1']
    
    for encoding in encodings:
        try:
            print(f"尝试编码格式: {encoding}")
            df = pd.read_csv(input_file, sep='::', engine='python', encoding=encoding)
            df.to_csv(output_file, index=False, encoding='utf-8')
            print(f"✓ 成功使用 {encoding} 编码转换完成！")
            return df
        except Exception as e:
            print(f"✗ {encoding} 编码失败: {str(e)[:50]}...")
            continue
    
    print("所有编码格式都失败了")
    return None



In [12]:
# 使用示例
df = dat_to_csv_with_encoding('movies.dat', 'movies.csv')

尝试编码格式: utf-8
✗ utf-8 编码失败: 'utf-8' codec can't decode byte 0xe9 in position 3...
尝试编码格式: gbk
✗ gbk 编码失败: 'gbk' codec can't decode byte 0xe9 in position 144...
尝试编码格式: gb2312
✗ gb2312 编码失败: 'gb2312' codec can't decode byte 0xe9 in position ...
尝试编码格式: latin1
✓ 成功使用 latin1 编码转换完成！


In [10]:
ra = pd.read_csv('data/ratings.csv',sep = '::', engine='python')

In [15]:
mv = pd.read_csv('data/movies.csv',sep = ',', engine='python')

In [17]:
ur = pd.read_csv('data/users.csv',sep = '::', engine='python')

In [28]:
users = pd.read_csv('data/users.csv', sep="::", engine="python",
                    names=["UserID","Gender","Age","Occupation","Zip"],encoding='latin-1')

movies = pd.read_csv("data/movies.dat", sep="::", engine="python",
                     names=["MovieID","Title","Genres"],encoding='latin-1')

ratings = pd.read_csv("data/ratings.dat", sep="::", engine="python",
                      names=["UserID","MovieID","Rating","Timestamp"],encoding='latin-1')

In [30]:
# 合并表
data = ratings.merge(movies, on="MovieID").merge(users, on="UserID")

In [37]:
data.to_csv('data.csv',index = False)

In [13]:
import pandas as pd 
df = pd.read_csv('data.csv')

In [14]:
df

Unnamed: 0,UserID,MovieID,Rating,Timestamp,Title,Genres,Gender,Age,Occupation,Zip
0,1,1193,5,978300760,One Flew Over the Cuckoo's Nest (1975),Drama,F,1,10,48067
1,1,661,3,978302109,James and the Giant Peach (1996),Animation|Children's|Musical,F,1,10,48067
2,1,914,3,978301968,My Fair Lady (1964),Musical|Romance,F,1,10,48067
3,1,3408,4,978300275,Erin Brockovich (2000),Drama,F,1,10,48067
4,1,2355,5,978824291,"Bug's Life, A (1998)",Animation|Children's|Comedy,F,1,10,48067
...,...,...,...,...,...,...,...,...,...,...
1000204,4211,3791,2,965319075,Footloose (1984),Drama,M,45,5,77662
1000205,4211,3806,3,965319138,MacKenna's Gold (1969),Western,M,45,5,77662
1000206,4211,3840,4,965319197,Pumpkinhead (1988),Horror,M,45,5,77662
1000207,4211,3766,2,965319138,Missing in Action (1984),Action|War,M,45,5,77662


In [15]:
###划分训练验证
import pandas as pd
import numpy as np
def split_train_val_by_user(df, train_ratio=0.8, max_val_samples=5):
    """
    按用户划分训练集和验证集
    - 每个用户的前n条记录作为训练集（80%）
    - 后面的作为验证集（20%，最多5条）
    
    Parameters:
    df: DataFrame - 包含用户评分数据
    train_ratio: float - 训练集比例
    max_val_samples: int - 验证集最大样本数
    
    Returns:
    train_df, val_df: DataFrame - 训练集和验证集
    """
    
    # 按用户和时间戳排序
    df_sorted = df.sort_values(['UserID', 'Timestamp']).reset_index(drop=True)
    
    train_list = []
    val_list = []
    
    # 按用户分组处理
    for user_id, user_data in df_sorted.groupby('UserID'):
        user_data = user_data.reset_index(drop=True)
        total_samples = len(user_data)
        
        if total_samples == 1:
            # 只有一条记录，全部作为训练集
            train_list.append(user_data)
        else:
            # 计算训练集大小
            train_size = max(1, int(total_samples * train_ratio))
            
            # 划分训练集和验证集
            train_samples = user_data.iloc[:train_size]
            val_samples = user_data.iloc[train_size:]
            
            # 如果验证集超过最大限制，只取最近的几条
            if len(val_samples) > max_val_samples:
                val_samples = val_samples.tail(max_val_samples)
            
            train_list.append(train_samples)
            val_list.append(val_samples)
    
    # 合并所有用户的训练集和验证集
    train_df = pd.concat(train_list, ignore_index=True) if train_list else pd.DataFrame()
    val_df = pd.concat(val_list, ignore_index=True) if val_list else pd.DataFrame()
    
    return train_df, val_df

In [16]:
pwd

'/code/ysh/finetuning'

In [17]:
# 使用示例
train_df, val_df = split_train_val_by_user(df, train_ratio=0.8, max_val_samples=5)

# 查看结果
print(f"训练集大小: {len(train_df)}")
print(f"验证集大小: {len(val_df)}")
print(f"训练集用户数: {train_df['UserID'].nunique()}")
print(f"验证集用户数: {val_df['UserID'].nunique()}")

# 显示前几条记录
print("训练集前5条:")
print(train_df.head())
print("\n验证集前5条:")
print(val_df.head())
train_df.to_parquet('data/train_df.parquet')
val_df.to_parquet('data/val_df.parquet')

训练集大小: 797758
验证集大小: 30114
训练集用户数: 6040
验证集用户数: 6040
训练集前5条:
   UserID  MovieID  Rating  Timestamp                      Title  \
0       1     3186       4  978300019   Girl, Interrupted (1999)   
1       1     1270       5  978300055  Back to the Future (1985)   
2       1     1721       4  978300055             Titanic (1997)   
3       1     1022       5  978300055          Cinderella (1950)   
4       1     2340       3  978300103      Meet Joe Black (1998)   

                         Genres Gender  Age  Occupation    Zip  
0                         Drama      F    1          10  48067  
1                 Comedy|Sci-Fi      F    1          10  48067  
2                 Drama|Romance      F    1          10  48067  
3  Animation|Children's|Musical      F    1          10  48067  
4                       Romance      F    1          10  48067  

验证集前5条:
   UserID  MovieID  Rating  Timestamp                                Title  \
0       1     2294       4  978824291                

In [18]:
##用户画像
def create_user_profile_simple(train_df):
    """
    简化版用户画像构造函数
    """
    user_profiles_list = []
    
    for user_id, user_data in train_df.groupby('UserID'):
        # 获取用户基本信息
        basic_info = user_data.iloc[0]
        
        # 收集所有电影类型
        all_genres = set()
        for genres in user_data['Genres'].dropna():
            all_genres.update(genres.split('|'))
        
        # 构造用户画像
        profile_dict = {
            'UserID': user_id,
            'Gender': basic_info['Gender'],
            'Age': basic_info['Age'],
            'Occupation': basic_info['Occupation'],
            'Zip': basic_info['Zip'],
            'RatedMovies': ';'.join([f"{row['MovieID']}:{row['Rating']}" 
                                   for _, row in user_data.iterrows()]),
            'FavoriteGenres': '|'.join(sorted(all_genres)) if all_genres else ''
        }
        
        # 转换为指定格式的字符串
        profile_str = " | ".join([
            f"UserID:{profile_dict['UserID']}",
            f"Gender:{profile_dict['Gender']}",
            f"Age:{profile_dict['Age']}",
            f"Occupation:{profile_dict['Occupation']}",
            f"Zip:{profile_dict['Zip']}",
            f"RatedMovies:{profile_dict['RatedMovies']}",
            f"FavoriteGenres:{profile_dict['FavoriteGenres']}"
        ])
        
        user_profiles_list.append({
            'UserID': user_id,
            'UserProfile': profile_str
        })
    
    return pd.DataFrame(user_profiles_list)

In [19]:
# 使用简化版本
user_profiles = create_user_profile_simple(train_df)

In [20]:
##
user_profiles

Unnamed: 0,UserID,UserProfile
0,1,UserID:1 | Gender:F | Age:1 | Occupation:10 | ...
1,2,UserID:2 | Gender:M | Age:56 | Occupation:16 |...
2,3,UserID:3 | Gender:M | Age:25 | Occupation:15 |...
3,4,UserID:4 | Gender:M | Age:45 | Occupation:7 | ...
4,5,UserID:5 | Gender:M | Age:25 | Occupation:20 |...
...,...,...
6035,6036,UserID:6036 | Gender:F | Age:25 | Occupation:1...
6036,6037,UserID:6037 | Gender:F | Age:45 | Occupation:1...
6037,6038,UserID:6038 | Gender:F | Age:56 | Occupation:1...
6038,6039,UserID:6039 | Gender:F | Age:45 | Occupation:0...


In [21]:
import pandas as pd
import numpy as np
from collections import defaultdict

def create_candidate_items(val_df, movies_df, num_random_items=45, random_state=42):
    """
    构造待选物品列表
    
    Parameters:
    val_df: DataFrame - 验证集数据
    movies_df: DataFrame - 电影数据，包含MovieID, Title, Genres列
    num_random_items: int - 随机抽取的物品数量
    random_state: int - 随机种子
    
    Returns:
    candidate_items: DataFrame - 待选物品列表
    """
    
    # 设置随机种子
    np.random.seed(random_state)
    
    # 创建电影信息字典，方便快速查找
    movie_info_dict = {}
    for _, row in movies_df.iterrows():
        movie_info_dict[row['MovieID']] = {
            'Title': row['Title'] if pd.notna(row['Title']) else '',
            'Genres': row['Genres'] if pd.notna(row['Genres']) else ''
        }
    
    candidate_list = []
    
    # 按用户分组处理
    for user_id, user_data in val_df.groupby('UserID'):
        # 获取用户在验证集中交互过的物品
        interacted_items = user_data['MovieID'].tolist()
        
        # 获取这些物品的详细信息（MovieID:Genres格式）
        interacted_items_with_genres = []
        for movie_id in interacted_items:
            if movie_id in movie_info_dict:
                genres = movie_info_dict[movie_id]['Genres']
                interacted_items_with_genres.append(f"{movie_id}:{genres}")
        
        # 从所有电影中随机抽取45个物品（排除用户已经交互过的物品）
        all_movie_ids = set(movies_df['MovieID'].unique())
        interacted_movie_ids = set(interacted_items)
        available_movie_ids = list(all_movie_ids - interacted_movie_ids)
        
        # 如果可用电影数量少于需要的数量，就取所有可用的
        sample_size = min(num_random_items, len(available_movie_ids))
        random_items = np.random.choice(available_movie_ids, size=sample_size, replace=False)
        
        # 获取随机物品的详细信息
        random_items_with_genres = []
        for movie_id in random_items:
            if movie_id in movie_info_dict:
                genres = movie_info_dict[movie_id]['Genres']
                random_items_with_genres.append(f"{movie_id}:{genres}")
        
        # 合并交互过的物品和随机物品，总共50个
        all_candidate_items = interacted_items_with_genres + random_items_with_genres
        
        # 如果总数超过50个，截取前50个；如果不足50个，保持现状
        if len(all_candidate_items) > 50:
            all_candidate_items = all_candidate_items[:50]
        
        # 构造候选物品字符串
        candidate_items_str = ";".join(all_candidate_items)
        interacted_items_str = ";".join(interacted_items_with_genres)
        
        candidate_list.append({
            'UserID': user_id,
            'CandidateItems': candidate_items_str,
            'InteractedItems': interacted_items_str
        })
    
    # 创建候选物品DataFrame
    candidate_items = pd.DataFrame(candidate_list)
    
    return candidate_items

# 如果movies数据在train_df中，可以先提取出来
def extract_movies_from_train(train_df):
    """
    从训练数据中提取电影信息
    """
    movies_df = train_df[['MovieID', 'Title', 'Genres']].drop_duplicates(subset=['MovieID'])
    return movies_df

In [22]:
# 使用示例
movies_df = extract_movies_from_train(df)  # 或者从单独的movies文件读取
candidate_items = create_candidate_items(val_df, movies_df, num_random_items=45, random_state=42)

# 查看结果
print("候选物品列表示例:")
print(candidate_items.head())

# 保存候选物品列表
# candidate_items.to_csv('candidate_items.csv', index=False)

# 显示某个具体用户的候选物品
# print(candidate_items[candidate_items['UserID'] == 1].iloc[0])

候选物品列表示例:
   UserID                                     CandidateItems  \
0       1  2294:Animation|Children's;783:Animation|Childr...   
1       2  95:Action|Thriller;1687:Action|Thriller;434:Ac...   
2       3  1641:Comedy;3552:Comedy;104:Comedy;3868:Comedy...   
3       4  3702:Action|Sci-Fi;1240:Action|Sci-Fi|Thriller...   
4       5  2029:Comedy|Romance;3105:Drama;1485:Comedy;188...   

                                     InteractedItems  
0  2294:Animation|Children's;783:Animation|Childr...  
1  95:Action|Thriller;1687:Action|Thriller;434:Ac...  
2  1641:Comedy;3552:Comedy;104:Comedy;3868:Comedy...  
3  3702:Action|Sci-Fi;1240:Action|Sci-Fi|Thriller...  
4  2029:Comedy|Romance;3105:Drama;1485:Comedy;188...  


In [23]:
candidate_items

Unnamed: 0,UserID,CandidateItems,InteractedItems
0,1,2294:Animation|Children's;783:Animation|Childr...,2294:Animation|Children's;783:Animation|Childr...
1,2,95:Action|Thriller;1687:Action|Thriller;434:Ac...,95:Action|Thriller;1687:Action|Thriller;434:Ac...
2,3,1641:Comedy;3552:Comedy;104:Comedy;3868:Comedy...,1641:Comedy;3552:Comedy;104:Comedy;3868:Comedy...
3,4,3702:Action|Sci-Fi;1240:Action|Sci-Fi|Thriller...,3702:Action|Sci-Fi;1240:Action|Sci-Fi|Thriller...
4,5,2029:Comedy|Romance;3105:Drama;1485:Comedy;188...,2029:Comedy|Romance;3105:Drama;1485:Comedy;188...
...,...,...,...
6035,6036,3573:Horror|Sci-Fi;2643:Action|Adventure|Sci-F...,3573:Horror|Sci-Fi;2643:Action|Adventure|Sci-F...
6036,6037,1019:Adventure|Children's|Fantasy|Sci-Fi;2641:...,1019:Adventure|Children's|Fantasy|Sci-Fi;2641:...
6037,6038,1354:Drama;1419:Drama;2700:Animation|Comedy;11...,1354:Drama;1419:Drama;2700:Animation|Comedy;11...
6038,6039,1148:Animation|Comedy;912:Drama|Romance|War;12...,1148:Animation|Comedy;912:Drama|Romance|War;12...


In [24]:
##构造微调数据

def create_finetuning_data(user_profiles, candidate_items):
    """
    构造用于后续微调的数据
    
    Parameters:
    user_profiles: DataFrame - 用户画像数据
    candidate_items: DataFrame - 候选物品列表
    
    Returns:
    finetuning_data: DataFrame - 微调数据
    """
    
    # 合并用户画像和候选物品数据
    merged_data = pd.merge(user_profiles, candidate_items, on='UserID', how='inner')
    
    finetuning_list = []
    
    for _, row in merged_data.iterrows():
        user_id = row['UserID']
        user_profile = row['UserProfile']
        candidate_items_str = row['CandidateItems']
        interacted_items = row['InteractedItems']
        
        # 构造instruction
        instruction = (f"Filter 5 items that the user may be interested in from candidate items "
                      f"based on the user's personal profile.+UserID: {user_id}, "
                      f"UserProfile: {user_profile}, CandidateItems: {candidate_items_str}")
        
        # 构造output
        output = f"interested items: {interacted_items}"
        
        # 构造text
        text = (f"Below is an instruction that describes a task. "
                f"Write a response that appropriately completes the request.\n\n"
                f"### Instruction:\n{instruction}\n\n"
                f"### Response:\n{output}")
        
        finetuning_list.append({
            'instruction': instruction,
            'input': '',  # input列为空
            'output': output,
            'text': text
        })
    
    # 创建微调数据DataFrame
    finetuning_data = pd.DataFrame(finetuning_list)
    
    return finetuning_data

In [25]:
# 使用示例
finetuning_data = create_finetuning_data(user_profiles, candidate_items)

# 查看结果
print("微调数据示例:")
print(finetuning_data.head())

# 保存微调数据
# finetuning_data.to_csv('finetuning_data.csv', index=False)

# # 显示一条完整的微调数据示例
# print("Instruction:")
# print(finetuning_data.iloc[0]['instruction'])
# print("\nOutput:")
# print(finetuning_data.iloc[0]['output'])
# print("\nText:")
# print(finetuning_data.iloc[0]['text'])

微调数据示例:
                                         instruction input  \
0  Filter 5 items that the user may be interested...         
1  Filter 5 items that the user may be interested...         
2  Filter 5 items that the user may be interested...         
3  Filter 5 items that the user may be interested...         
4  Filter 5 items that the user may be interested...         

                                              output  \
0  interested items: 2294:Animation|Children's;78...   
1  interested items: 95:Action|Thriller;1687:Acti...   
2  interested items: 1641:Comedy;3552:Comedy;104:...   
3  interested items: 3702:Action|Sci-Fi;1240:Acti...   
4  interested items: 2029:Comedy|Romance;3105:Dra...   

                                                text  
0  Below is an instruction that describes a task....  
1  Below is an instruction that describes a task....  
2  Below is an instruction that describes a task....  
3  Below is an instruction that describes a task....  
4  Belo

In [26]:
finetuning_data.to_parquet('data/finetuning_data.parquet')

In [27]:
import pandas as pd 
finetuning_data = pd.read_parquet('data/finetuning_data.parquet')

In [28]:
finetuning_data['instruction'][123]

"Filter 5 items that the user may be interested in from candidate items based on the user's personal profile.+UserID: 124, UserProfile: UserID:124 | Gender:M | Age:56 | Occupation:7 | Zip:91356 | RatedMovies:902:4;1544:3;1343:4;3101:4;3107:4;2302:4;527:4;1650:5;2858:3;457:3;3418:5;1358:5;3078:5;1693:4;678:5;531:4;2912:4;1213:4 | FavoriteGenres:Action|Adventure|Children's|Comedy|Crime|Drama|Romance|Sci-Fi|Thriller|War, CandidateItems: 1094:Drama|Romance|War;34:Children's|Comedy|Drama;1179:Crime|Drama|Film-Noir;3125:Drama;3298:Drama;503:Drama;1653:Drama|Sci-Fi|Thriller;1520:Romance;930:Film-Noir|Romance|Thriller;2821:Adventure|Drama;3167:Drama;2153:Action|Adventure;904:Mystery|Thriller;1950:Drama|Mystery;1047:Action|Thriller;1342:Horror;1821:Comedy|Romance;1477:Romance;2692:Action|Crime|Romance;3857:Thriller;2010:Sci-Fi;940:Action|Adventure;2167:Action|Adventure|Horror;3351:Horror;1910:Action|Comedy|Crime;734:Comedy;3894:Drama;618:Comedy|Romance;3747:Drama;2008:Crime|Drama|Film-Noir;429:

In [29]:
finetuning_data.iloc[123]

instruction    Filter 5 items that the user may be interested...
input                                                           
output         interested items: 1094:Drama|Romance|War;34:Ch...
text           Below is an instruction that describes a task....
Name: 123, dtype: object

In [30]:
finetuning_data.iloc[123]['instruction']

"Filter 5 items that the user may be interested in from candidate items based on the user's personal profile.+UserID: 124, UserProfile: UserID:124 | Gender:M | Age:56 | Occupation:7 | Zip:91356 | RatedMovies:902:4;1544:3;1343:4;3101:4;3107:4;2302:4;527:4;1650:5;2858:3;457:3;3418:5;1358:5;3078:5;1693:4;678:5;531:4;2912:4;1213:4 | FavoriteGenres:Action|Adventure|Children's|Comedy|Crime|Drama|Romance|Sci-Fi|Thriller|War, CandidateItems: 1094:Drama|Romance|War;34:Children's|Comedy|Drama;1179:Crime|Drama|Film-Noir;3125:Drama;3298:Drama;503:Drama;1653:Drama|Sci-Fi|Thriller;1520:Romance;930:Film-Noir|Romance|Thriller;2821:Adventure|Drama;3167:Drama;2153:Action|Adventure;904:Mystery|Thriller;1950:Drama|Mystery;1047:Action|Thriller;1342:Horror;1821:Comedy|Romance;1477:Romance;2692:Action|Crime|Romance;3857:Thriller;2010:Sci-Fi;940:Action|Adventure;2167:Action|Adventure|Horror;3351:Horror;1910:Action|Comedy|Crime;734:Comedy;3894:Drama;618:Comedy|Romance;3747:Drama;2008:Crime|Drama|Film-Noir;429:

In [31]:
finetuning_data.iloc[123]['output']

"interested items: 1094:Drama|Romance|War;34:Children's|Comedy|Drama;1179:Crime|Drama|Film-Noir;3125:Drama;3298:Drama"

In [32]:
finetuning_data.iloc[432]['instruction']

"Filter 5 items that the user may be interested in from candidate items based on the user's personal profile.+UserID: 433, UserProfile: UserID:433 | Gender:M | Age:50 | Occupation:6 | Zip:55115 | RatedMovies:527:4;1096:5;21:4;3072:3;2736:3;3421:4;2858:4;2289:3;1148:4;1294:4;3543:5;1394:5;1136:3;1230:5;1304:5;1276:5;3210:5;1288:5;3751:4;1965:4;1223:4;377:3;3614:3;471:3;1393:3;25:2;708:3;361:5;356:4;788:2;11:3;1777:3;339:5;539:5;1569:4;2671:1 | FavoriteGenres:Action|Animation|Children's|Comedy|Drama|Fantasy|Musical|Romance|Sci-Fi|Thriller|War|Western, CandidateItems: 804:Comedy|Romance;3004:Comedy|Romance;3:Comedy|Romance;1353:Comedy|Romance;3705:Action|Adventure|Romance|Thriller;2652:Horror;1572:Drama;3190:Adventure|Sci-Fi;2794:Comedy;1231:Drama;2166:Drama|Romance;793:Drama;3540:Romance|Thriller;2581:Comedy|Romance;998:Action|Crime;544:Action;2593:Comedy;310:Comedy;2428:Horror|Sci-Fi;1029:Animation|Children's|Musical;1981:Horror;1228:Drama;1381:Comedy|Musical|Romance;2204:Thriller;352:C