In [4]:
import pandas as pd
import ast
import random

In [11]:
# Read the three source tables, in a fixed order, from the shared CSV folder.
users, profiles, content = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../database/users.csv",
        "../database/students_profile.csv",
        "../database/content_items.csv",
    )
)

# Inner join: keep only users that have a matching profile row
# (users.id == profiles.user_id).
# NOTE(review): a column name present in both tables would come back with
# merge suffixes rather than its plain name — worth checking the merged
# columns if a later lookup such as 'stream' raises KeyError.
students = users.merge(
    profiles,
    how="inner",
    left_on="id",
    right_on="user_id",
)

In [12]:
def parse_skills(x):
    """Parse a serialized skills-mastery mapping into a dict.

    Args:
        x: Raw cell value, expected to be a string containing a Python dict
            literal such as ``"{'python': 0.4}"``. Any other value (NaN,
            None, malformed text) is tolerated.

    Returns:
        dict: The parsed mapping. An empty dict is returned when ``x`` cannot
        be parsed, or when it parses to something that is not a dict (a list
        or a bare number would otherwise be expanded into meaningless
        columns by the caller).
    """
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Only the failures literal_eval raises for malformed input are
        # treated as "no skills"; KeyboardInterrupt and friends propagate.
        return {}
    return parsed if isinstance(parsed, dict) else {}

# Turn each student's parsed skill mapping into one column per skill name,
# then append those columns to the student table.
# Note: `students` is rebound to the widened frame, so running this cell a
# second time appends the skill columns again.
skills_df = (
    students['skills_mastery']
    .apply(parse_skills)
    .apply(pd.Series)
)
students = pd.concat([students, skills_df], axis="columns")

In [13]:
def get_weak_skills(row, threshold=0.5, top_n=2):
    """Return the names of the weakest entries in a student row.

    Every ``(name, value)`` pair of ``row`` is inspected; a pair counts as
    weak when the value is a plain ``int``/``float`` (booleans excluded)
    strictly below ``threshold``. NaN never compares below the threshold and
    is therefore ignored.

    NOTE(review): the row is scanned in full, so any non-skill numeric column
    holding a value below the threshold would be reported too — confirm the
    frame only carries skill columns in that range.

    Args:
        row: Mapping-like object exposing ``.items()`` (a DataFrame row
            Series or a dict).
        threshold: Values strictly below this are considered weak.
        top_n: Maximum number of names to return.

    Returns:
        list: Up to ``top_n`` names ordered from lowest value upward; ties
        keep their original order in ``row``.
    """
    weak = [
        (value, name)
        for name, value in row.items()
        # bool is a subclass of int: without the explicit exclusion a False
        # flag would be reported as a skill with mastery 0.
        if isinstance(value, (int, float))
        and not isinstance(value, bool)
        and value < threshold
    ]
    # Stable sort on the value only, so the result really is the weakest
    # entries rather than the first ones encountered.
    weak.sort(key=lambda pair: pair[0])
    return [name for _, name in weak[:top_n]]

# Compute the weak-skill list row by row and store it as a new column.
students = students.assign(
    weak_skills=students.apply(get_weak_skills, axis="columns")
)

In [14]:
# Keywords used to decide whether a content item belongs to a given stream.
# Keys are stream codes; values are the terms looked for in an item's tags.
stream_map = dict(
    CSE=["coding", "machine learning", "data", "software", "AI"],
    ECE=["electronics", "circuits", "signals", "communication", "hardware"],
    MECH=["design", "mechanics", "robotics", "manufacturing"],
    CIVIL=["structural", "construction", "surveying", "design"],
    CHEM=["chemical", "process", "environmental", "safety"],
)

In [15]:
def recommend_content(student_row, catalog=None, keyword_map=None, top_n=5):
    """Recommend content titles for one student.

    An item is a candidate when its ``skills_tags`` text contains at least
    one keyword of the student's stream AND at least one of the student's
    weak skills, and its ``difficulty`` is easy or medium. Candidates are
    ranked by the number of weak skills found in the tags, plus one point
    when the career goal appears in the title or description. All matching
    is case-insensitive substring matching.

    Args:
        student_row: Row-like object (Series or dict) providing 'stream' and
            'weak_skills', and optionally 'career_goal'.
        catalog: DataFrame with 'skills_tags', 'difficulty', 'title' and
            'description' columns. Defaults to the notebook-level ``content``.
        keyword_map: Mapping of stream code to keyword list. Defaults to the
            notebook-level ``stream_map``.
        top_n: Maximum number of titles to return.

    Returns:
        list: Up to ``top_n`` titles, best score first; items with equal
        scores keep their catalog order. Empty when nothing matches.

    Raises:
        KeyError: If ``student_row`` lacks 'stream' or 'weak_skills', or the
            catalog lacks one of the required columns.
    """
    if catalog is None:
        catalog = content
    if keyword_map is None:
        keyword_map = stream_map

    stream = student_row['stream']
    # Tags are compared in lower case, so both needle lists must be lowered
    # as well (otherwise a keyword such as "AI" can never match).
    weak_skills = [str(skill).lower() for skill in student_row['weak_skills']]
    stream_keywords = [str(kw).lower() for kw in keyword_map.get(stream, [])]

    # A missing goal arrives as NaN (a float) when read from CSV; treat any
    # non-string as "no goal" instead of failing on .lower().
    raw_goal = student_row.get('career_goal', '')
    goal = raw_goal.strip().lower() if isinstance(raw_goal, str) else ''

    allowed_levels = ('easy', 'medium')

    def mentions_any(text, needles):
        """True when at least one needle occurs in text."""
        return any(needle in text for needle in needles)

    tag_texts = [str(tags).lower() for tags in catalog['skills_tags']]
    levels = [str(level).strip().lower() for level in catalog['difficulty']]
    keep = [
        mentions_any(text, stream_keywords)
        and mentions_any(text, weak_skills)
        and level in allowed_levels
        for text, level in zip(tag_texts, levels)
    ]
    # Explicit bool dtype keeps the mask valid even for an empty catalog.
    mask = pd.Series(keep, index=catalog.index, dtype=bool).to_numpy()
    candidates = catalog.loc[mask]
    if candidates.empty:
        return []

    scores = []
    for tags, title, description in zip(
        candidates['skills_tags'], candidates['title'], candidates['description']
    ):
        tag_text = str(tags).lower()
        skill_match = sum(skill in tag_text for skill in weak_skills)
        # An empty goal is a substring of everything; only count real goals.
        goal_match = int(
            bool(goal)
            and (goal in str(title).lower() or goal in str(description).lower())
        )
        scores.append(skill_match + goal_match)

    # assign() builds a new frame (no write into a filtered slice of the
    # catalog); a stable sort makes tie order reproducible.
    ranked = candidates.assign(score=scores).sort_values(
        by='score', ascending=False, kind='stable'
    )
    return ranked['title'].head(top_n).tolist()

In [17]:
# Build the recommendation list for every student and keep it as a column.
students = students.assign(
    top_5_recommendations=students.apply(recommend_content, axis="columns")
)

KeyError: 'stream'

In [None]:
# Export only the identifying fields and the recommendations, without the
# DataFrame index, to a CSV in the current working directory.
output = students.loc[
    :,
    ['user_id', 'stream', 'career_goal', 'top_5_recommendations'],
]
output.to_csv(
    "phase1_recommendations.csv",
    index=False,
)