# Load Data dan Import Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
warnings.filterwarnings('ignore')

In [2]:
from google.colab import files
files.upload()

# Load data dari CSV hasil scraping
df = pd.read_csv('HandsOn1_coursera.csv')

Saving HandsOn1_coursera.csv to HandsOn1_coursera.csv


In [4]:
# Cek data awal

print(df.head())

                Distinct Skills    source            Processed_Skills
0  Git (Version Control System)  coursera  git version control system
1               Threat Modeling  coursera                threat model
2            Quality Management  coursera               qualiti manag
3        Engineering Management  coursera                 engin manag
4          Process Optimization  coursera               process optim


In [5]:
# Pake kolom 'Processed_Skills' karena sudah dibersihkan
skills_text = df['Processed_Skills'].fillna('').tolist()
original_skills = df['Distinct Skills'].tolist()

# Text Vectorization Pakai TF-IDF

In [6]:
# max_features=1000 untuk batasi dimensi, stop_words='english' untuk buang kata umum
tfidf = TfidfVectorizer(
    max_features=1000,
    stop_words='english',
    ngram_range=(1, 2),  # unigram dan bigram
    lowercase=True
)

In [8]:
# Transform text menjadi vector
tfidf_matrix = tfidf.fit_transform(skills_text)
print(f"Shape matrix TF-IDF: {tfidf_matrix.shape}")

Shape matrix TF-IDF: (183, 391)


# Similarity Metrics dan Mapping

In [9]:
# Hitung cosine similarity antar skills
similarity_matrix = cosine_similarity(tfidf_matrix)
print(f"Shape similarity matrix: {similarity_matrix.shape}")

📐 Shape similarity matrix: (183, 183)


# Recommendation Modelling

In [10]:
def get_skill_recommendations(skill_name, top_n=5):
    try:
        # Cari index skill dalam dataset
        if skill_name in original_skills:
            skill_idx = original_skills.index(skill_name)
        else:
            # Kalau ga exact match, cari yang paling mirip
            skill_lower = skill_name.lower()
            possible_matches = [i for i, skill in enumerate(original_skills)
                             if skill_lower in skill.lower()]
            if possible_matches:
                skill_idx = possible_matches[0]
                print(f"⚠️ Skill '{skill_name}' tidak ditemukan exact match.")
                print(f"   Menggunakan: '{original_skills[skill_idx]}'")
            else:
                return []

        # Ambil similarity scores untuk skill ini
        sim_scores = similarity_matrix[skill_idx]

        # Buat list (index, similarity_score) dan sort berdasarkan score
        sim_scores_indexed = [(i, score) for i, score in enumerate(sim_scores)]
        sim_scores_indexed.sort(key=lambda x: x[1], reverse=True)

        # Ambil top_n+1 (karena index 0 adalah skill itu sendiri)
        top_indices = sim_scores_indexed[1:top_n+1]

        # Buat list rekomendasi
        recommendations = []
        for idx, score in top_indices:
            recommendations.append({
                'skill': original_skills[idx],
                'similarity_score': round(score, 4),
                'processed_skill': skills_text[idx]
            })

        return recommendations

    except Exception as e:
        print(f"Error: {e}")
        return []

# Testing

In [12]:
test_skills = ['Python Programming', 'Machine Learning', 'Data Science', 'Cybersecurity']

for skill in test_skills:
    print(f"\n* Rekomendasi untuk '{skill}':")
    recommendations = get_skill_recommendations(skill, top_n=3)

    if recommendations:
        for i, rec in enumerate(recommendations, 1):
            print(f"   {i}. {rec['skill']} (similarity: {rec['similarity_score']})")
    else:
        print("Tidak ada rekomendasi ditemukan")


* Rekomendasi untuk 'Python Programming':
   1. Programming Principles (similarity: 0.2581)
   2. Loyalty Programs (similarity: 0.2581)
   3. Object Oriented Programming (OOP) (similarity: 0.1637)

* Rekomendasi untuk 'Machine Learning':
   1. Machine Learning Software (similarity: 0.7069)
   2. Machine Learning Algorithms (similarity: 0.6754)
   3. Machine Learning Methods (similarity: 0.6618)

* Rekomendasi untuk 'Data Science':
   1. Data Analysis (similarity: 0.1908)
   2. Data Visualization (similarity: 0.1871)
   3. Data Security (similarity: 0.1856)

* Rekomendasi untuk 'Cybersecurity':
   1. Git (Version Control System) (similarity: 0.0)
   2. Threat Modeling (similarity: 0.0)
   3. Quality Management (similarity: 0.0)


# Save to csv

In [15]:
# Save feature names dari TF-IDF
feature_names = tfidf.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=feature_names)
tfidf_df['original_skill'] = original_skills
tfidf_df['processed_skill'] = skills_text

# Save TF-IDF vectors
tfidf_df.to_csv('tfidf_vectors.csv', index=False)
print("file disimpan ke 'tfidf_vectors.csv'")

file disimpan ke 'tfidf_vectors.csv'


In [16]:
# Save similarity matrix
similarity_df = pd.DataFrame(similarity_matrix,
                           index=original_skills,
                           columns=original_skills)
similarity_df.to_csv('similarity_matrix.csv')
print("file disimpan ke 'similarity_matrix.csv'")

file disimpan ke 'similarity_matrix.csv'


In [17]:
# Ini buat CSV yang isinya semua rekomendasi untuk setiap skill
all_recommendations = []

for i, skill in enumerate(original_skills):
    recommendations = get_skill_recommendations(skill, top_n=5)
    for j, rec in enumerate(recommendations):
        all_recommendations.append({
            'source_skill': skill,
            'recommended_skill': rec['skill'],
            'similarity_score': rec['similarity_score'],
            'rank': j + 1
        })

recommendations_df = pd.DataFrame(all_recommendations)
recommendations_df.to_csv('skill_recommendations.csv', index=False)
print("file disimpan ke 'skill_recommendations.csv'")

file disimpan ke 'skill_recommendations.csv'
