In [1]:
import csv
import re

# Upper bound on the number of skills returned per description.
_MAX_SKILLS = 6


def _whole_word(term):
    """Compile a pattern that matches *term* only when it is not part of a longer word."""
    # Lookarounds are used instead of \b: \b cannot match after a trailing
    # non-word character, so r'\bc\+\+\b' never matches "c++ " and
    # r'\bc#\b' never matches "c# ".
    return re.compile(r'(?<!\w)' + re.escape(term) + r'(?!\w)')


# Lower-case literal keyword -> canonical skill name (restricted to the allowed
# skill categories). Keys are plain text, not regular expressions.
_SKILL_KEYWORDS = {
    # Programming Language
    'python': 'Python',
    'java': 'Java',
    'javascript': 'JavaScript',
    'c++': 'C++',
    'c#': 'C#',
    'ruby': 'Ruby',
    'go': 'Go',
    'rust': 'Rust',
    'php': 'PHP',
    'swift': 'Swift',

    # Web Development
    'react': 'React',
    'vue': 'Vue.js',
    'angular': 'Angular',
    'node': 'Node.js',
    'django': 'Django',
    'flask': 'Flask',
    'spring': 'Spring',
    'express': 'Express.js',
    'html': 'HTML',
    'css': 'CSS',

    # Database & Cloud
    'sql': 'SQL',
    'mysql': 'MySQL',
    'postgresql': 'PostgreSQL',
    'mongodb': 'MongoDB',
    'redis': 'Redis',
    'aws': 'AWS',
    'docker': 'Docker',
    'kubernetes': 'Kubernetes',
    'azure': 'Azure',
    'gcp': 'GCP',
    'cloud': 'AWS',

    # Data Science
    'machine learning': 'Machine Learning',
    'tensorflow': 'TensorFlow',
    'pytorch': 'PyTorch',
    'data analysis': 'Data Analysis',
    'pandas': 'Pandas',
    'numpy': 'NumPy',
    'r language': 'R',
    'tableau': 'Tableau',
    'artificial intelligence': 'Machine Learning',
    'data analytics': 'Data Analysis',

    # DevOps & Tools
    'git': 'Git',
    'jenkins': 'Jenkins',
    'ci/cd': 'CI/CD',
    'linux': 'Linux',
    'bash': 'Bash',
    'rest api': 'REST API',
    'graphql': 'GraphQL',
    'microservices': 'Microservices',
    'devops': 'CI/CD',
    'api': 'REST API',
}

# Compiled once at import time instead of on every call.
_SKILL_PATTERNS = tuple(
    (_whole_word(keyword), skill) for keyword, skill in _SKILL_KEYWORDS.items()
)

# Project-type inference rules:
#   (substring triggers, whole-word triggers, primary skill, secondary skill)
# Short abbreviations are matched as whole words only; as substrings they fire
# on unrelated text ('ai' in "maintain", 'ui' in "build", 'ux' in "linux",
# 'ios' in "scenarios").
_INFERENCE_RULES = (
    (('web', 'platform', 'application', 'dashboard'),
     tuple(_whole_word(t) for t in ('ui', 'ux')),
     'JavaScript', 'React'),
    (('data', 'analytics', 'analysis', 'reporting'),
     (),
     'Python', 'Data Analysis'),
    (('cloud', 'infrastructure', 'deployment', 'scalable'),
     (),
     'AWS', 'Docker'),
    (('machine learning', 'predictive', 'intelligent'),
     (_whole_word('ai'),),
     'Python', 'Machine Learning'),
    (('mobile', 'android'),
     (_whole_word('ios'),),
     'JavaScript', 'React'),
)


def extract_skills_from_description(description):
    """
    Extract skill tags from a project description.

    Skills come from two sources, in this order:
    1. Explicit keywords found in the text as whole words (case-insensitive).
    2. Skills inferred from the project type (web, data, cloud, AI, mobile).

    Args:
        description: Free-text project description. ``None`` or an empty
            string yields an empty list.

    Returns:
        A list of at most 6 unique skill names, in order of discovery, so the
        result is deterministic for a given description.
    """
    if not description:
        return []

    text = description.lower()

    # A dict is used as an insertion-ordered set; slicing an unordered set
    # would return an arbitrary subset that varies between interpreter runs.
    found = {}

    for pattern, skill in _SKILL_PATTERNS:
        if pattern.search(text):
            found.setdefault(skill, None)

    for substrings, abbreviations, primary, secondary in _INFERENCE_RULES:
        triggered = (
            any(fragment in text for fragment in substrings)
            or any(pattern.search(text) for pattern in abbreviations)
        )
        if not triggered:
            continue
        found.setdefault(primary, None)
        # The secondary skill is only a filler while there is still room.
        if len(found) < _MAX_SKILLS:
            found.setdefault(secondary, None)

    return list(found)[:_MAX_SKILLS]

# 读取原始CSV文件并生成新的CSV
def generate_skillset_csv(input_file, output_file):
    """
    Read project descriptions from one CSV file and write a new CSV that pairs
    each description with its extracted skill set.

    The first row of the input is treated as a header and skipped; the first
    column of every following non-empty row is used as the description. The
    output has the columns ``description`` and ``required_skillsets``, the
    latter being the skills joined with ", ".
    """
    with open(input_file, 'r', encoding='utf-8') as source:
        with open(output_file, 'w', newline='', encoding='utf-8') as target:
            records = csv.reader(source)
            sink = csv.writer(target)

            # Emit the output header, then discard the input header.
            sink.writerow(['description', 'required_skillsets'])
            next(records, None)

            for record in records:
                # Blank lines come back from csv.reader as empty lists.
                if not record:
                    continue
                text = record[0]
                tags = extract_skills_from_description(text)
                sink.writerow([text, ', '.join(tags)])

    print(f"成功生成新的CSV文件: {output_file}")

# Run the generation
input_filename = 'projects0.csv'  # input file name
output_filename = 'projects_with_skillsets.csv'  # output file name

generate_skillset_csv(input_filename, output_filename)

成功生成新的CSV文件: projects_with_skillsets.csv
