In [1]:
#  Import libraries
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

# Read both curated CSV files (paths are relative to the working directory).
students_df, internships_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "students_dataset_curated.csv",
        "internships_dataset_curated.csv",
    )
)

# Echo the column names of each table so the schema can be checked by eye.
student_columns = students_df.columns.tolist()
internship_columns = internships_df.columns.tolist()
print("Students Dataset Columns:", student_columns)
print("Internships Dataset Columns:", internship_columns)

#  Basic cleaning function
def clean_text(text: object) -> str:
    """Normalize a free-text cell before it is vectorized or exported.

    The value is lowercased, every character that is not an ASCII letter or
    whitespace is removed, and surrounding whitespace is trimmed.

    Args:
        text: The cell value. Missing values (``None``/NaN) and non-string
            scalars (e.g. a number parsed by ``read_csv``) are accepted.

    Returns:
        The cleaned string; ``""`` when the input is missing.
    """
    if pd.isnull(text):
        return ""
    # Coerce first: a non-string cell (int/float) has no .lower() and would
    # otherwise raise AttributeError.
    text = str(text).lower()
    # Drop digits and punctuation. Characters are deleted, not replaced with
    # a space, so "node.js" becomes "nodejs" and "ui/ux" becomes "uiux".
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    # Strip last so whitespace exposed by removing leading/trailing symbols
    # (e.g. "1. python" -> " python") is trimmed as well.
    return text.strip()

# Run clean_text over the text columns of each table, overwriting them in place.
# Order matches the tables' use below: student columns first, then internships.
for frame, text_columns in (
    (students_df, ("skills", "sector_interest")),
    (internships_df, ("required_skills", "sector")),
):
    for column in text_columns:
        frame[column] = frame[column].apply(clean_text)

# Stack the student-side and internship-side text into one corpus per field,
# so each vectorizer is fitted on the text from both datasets.
skills_corpus_parts = [students_df["skills"], internships_df["required_skills"]]
sector_corpus_parts = [students_df["sector_interest"], internships_df["sector"]]
all_skills = pd.concat(skills_corpus_parts)
all_sector = pd.concat(sector_corpus_parts)

# Fit one TF-IDF vectorizer per field (fit() returns the fitted vectorizer).
skills_vectorizer = TfidfVectorizer().fit(all_skills)
sector_vectorizer = TfidfVectorizer().fit(all_sector)

# Serialize each fitted vectorizer to its own pickle file in the working directory.
for pickle_path, fitted_vectorizer in (
    ("skills_vectorizer.pkl", skills_vectorizer),
    ("sector_vectorizer.pkl", sector_vectorizer),
):
    with open(pickle_path, "wb") as f:
        pickle.dump(fitted_vectorizer, f)

print("Vectorizers trained & saved!")

# Build the internship table exported for the API from a copy of internships_df.
internships_cleaned = internships_df.copy()

# NOTE: clean_text was already applied to these columns of internships_df
# earlier in this script; it is applied once more to the copy here.
for column in ("required_skills", "sector"):
    internships_cleaned[column] = internships_cleaned[column].apply(clean_text)

# Write without the index so the CSV contains only the dataset's own columns.
internships_cleaned.to_csv("internships_cleaned_for_api.csv", index=False)
print("internships_cleaned_for_api.csv saved!")

# Preview the first rows of the exported table.
print("\nSample internships data:")
preview = internships_cleaned.head()
print(preview)


Students Dataset Columns: ['student_id', 'name', 'skills', 'qualification', 'location_preference', 'sector_interest', 'social_category', 'rural_background', 'past_participation']
Internships Dataset Columns: ['internship_id', 'title', 'required_skills', 'qualification_required', 'location', 'sector', 'capacity', 'priority_category']
✅ Vectorizers trained & saved!
✅ internships_cleaned_for_api.csv saved!

Sample internships data:
  internship_id                      title  \
0         I0001  Software Developer Intern   
1         I0002           Paralegal Intern   
2         I0003       Video Editing Intern   
3         I0004        Risk Analyst Intern   
4         I0005  Clinical Assistant Intern   

                                     required_skills qualification_required  \
0                                    html nodejs sql                B.Sc CS   
1  case summaries contract review drafting legal ...                    LLB   
2  figma illustrator photoshop uiux design video ... 