# EduMate - Sistem Rekomendasi Pembelajaran
# Notebook untuk Eksplorasi Data dan Pengembangan Model

In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import pickle
import warnings
warnings.filterwarnings('ignore')

## 1. LOADING DATA 

In [79]:
print("Loading datasets...")
data_mahasiswa = pd.read_csv('data_mahasiswa.csv')
data_konten = pd.read_csv('data_konten.csv') 
data_interaksi = pd.read_csv('interaksi_user_konten.csv')

print(f"✅ Data mahasiswa: {data_mahasiswa.shape}")
print(f"✅ Data konten: {data_konten.shape}")
print(f"✅ Data interaksi: {data_interaksi.shape}")

Loading datasets...
✅ Data mahasiswa: (300, 10)
✅ Data konten: (200, 8)
✅ Data interaksi: (3000, 8)


## 2. EXPLORATORY DATA ANALYSIS

In [80]:
print("\n🔍 Data Overview:")
print("\n--- Data Mahasiswa ---")
print(data_mahasiswa.info())
print(data_mahasiswa.describe())

print("\n--- Data Konten ---")
print(data_konten.info()) 
print(data_konten.describe())

print("\n--- Data Interaksi ---")
print(data_interaksi.info())
print(data_interaksi.describe())


🔍 Data Overview:

--- Data Mahasiswa ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 300 entries, 0 to 299
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id_mahasiswa            300 non-null    object 
 1   nama                    300 non-null    object 
 2   jurusan                 300 non-null    object 
 3   angkatan                300 non-null    int64  
 4   ipk_terakhir            300 non-null    float64
 5   device_preference       300 non-null    object 
 6   learning_style          300 non-null    object 
 7   goal                    300 non-null    object 
 8   waktu_belajar_per_hari  300 non-null    int64  
 9   ketersediaan_belajar    300 non-null    object 
dtypes: float64(1), int64(2), object(7)
memory usage: 23.6+ KB
None
         angkatan  ipk_terakhir  waktu_belajar_per_hari
count   300.00000    300.000000               300.00000
mean   2021.06000      2.997700      

## 3. DATA PREPROCESSING

In [81]:
print("\n🔧 Starting Data Preprocessing...")

# 3.1 Konversi feedback ke numerik
feedback_mapping = {
    'Sangat Membantu': 5,
    'Bermanfaat': 4, 
    'Cukup': 3,
    'Kurang': 2,
    'Tidak Membantu': 1
}

data_interaksi['feedback_score'] = data_interaksi['feedback'].map(feedback_mapping)

# 3.2 Konversi status ke numerik
data_interaksi['completion_score'] = data_interaksi['status'].map({
    'Selesai': 1,
    'Belum Selesai': 0
})

# 3.3 Feature engineering untuk durasi normalized
data_interaksi = data_interaksi.merge(data_konten[['id_konten', 'durasi']], on='id_konten', how='left')
data_interaksi['watch_ratio'] = data_interaksi['durasi_tonton'] / data_interaksi['durasi']
data_interaksi['watch_ratio'] = data_interaksi['watch_ratio'].clip(0, 1)  # Cap at 100%

# 3.4 Gabung semua data
print("🔗 Merging datasets...")
full_data = data_interaksi.merge(data_mahasiswa, on='id_mahasiswa', how='left')
full_data = full_data.merge(data_konten, on='id_konten', how='left')

print(f"✅ Merged data shape: {full_data.shape}")

# 3.5 Handle missing values
print("🛠️ Handling missing values...")
numeric_cols = full_data.select_dtypes(include=[np.number]).columns
full_data[numeric_cols] = full_data[numeric_cols].fillna(full_data[numeric_cols].median())

categorical_cols = full_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    full_data[col] = full_data[col].fillna('Unknown')

# 3.6 Label Encoding untuk fitur kategorikal
print("🏷️ Encoding categorical features...")
label_encoders = {}
categorical_features = ['jurusan', 'device_preference', 'learning_style', 'goal', 
                       'ketersediaan_belajar', 'mata_kuliah', 'platform', 'format', 'kesulitan']

for feature in categorical_features:
    if feature in full_data.columns:
        le = LabelEncoder()
        full_data[f'{feature}_encoded'] = le.fit_transform(full_data[feature].astype(str))
        label_encoders[feature] = le

print(f"✅ Label encoders created for {len(label_encoders)} features")


🔧 Starting Data Preprocessing...
🔗 Merging datasets...
✅ Merged data shape: (3000, 28)
🛠️ Handling missing values...
🏷️ Encoding categorical features...
✅ Label encoders created for 9 features


## 4. FEATURE ENGINEERING

In [82]:
print("\n⚙️ Feature Engineering...")

# 4.1 User profile features (untuk content-based filtering)
user_features = ['jurusan_encoded', 'learning_style_encoded', 'goal_encoded', 
                'ketersediaan_belajar_encoded', 'device_preference_encoded', 'ipk_terakhir', 'waktu_belajar_per_hari']

# 4.2 Content features 
content_features_for_profiles = ['mata_kuliah_encoded', 'platform_encoded', 'format_encoded', 'kesulitan_encoded', 'durasi', 'rating_pengguna']
content_features_for_training = ['mata_kuliah_encoded', 'platform_encoded', 'format_encoded', 'kesulitan_encoded', 'durasi_y', 'rating_pengguna']

# 4.3 Interaction features
interaction_features = ['feedback_score', 'completion_score', 'watch_ratio', 'durasi_tonton']

# 4.4 Create user-item matrix untuk collaborative filtering
print("📊 Creating user-item matrix...")
user_item_matrix = full_data.pivot_table(
    index='id_mahasiswa', 
    columns='id_konten', 
    values='feedback_score', 
    fill_value=0
)

print(f"✅ User-item matrix shape: {user_item_matrix.shape}")


⚙️ Feature Engineering...
📊 Creating user-item matrix...
✅ User-item matrix shape: (300, 200)


## 5. MODEL DEVELOPMENT

In [83]:
print("\n🤖 Building Recommendation Models...")

# 5.1 Content-Based Filtering
print("🎯 Building Content-Based Model...")

# Siapkan user profiles dan content profiles
user_profiles = full_data.groupby('id_mahasiswa')[user_features].first()
content_profiles = data_konten.copy()

# Encode content profiles
for feature in ['mata_kuliah', 'platform', 'format', 'kesulitan']:
    if feature in label_encoders:
        content_profiles[f'{feature}_encoded'] = label_encoders[feature].transform(content_profiles[feature].astype(str))

# Normalisasi features
scaler_user = StandardScaler()
scaler_content = StandardScaler()

user_profiles_scaled = scaler_user.fit_transform(user_profiles[user_features])
content_profiles_scaled = scaler_content.fit_transform(content_profiles[content_features_for_profiles])

print("✅ Content-based features prepared")

# 5.2 Collaborative Filtering dengan Cosine Similarity
print("👥 Building Collaborative Filtering Model...")

# User-based collaborative filtering
user_similarity = cosine_similarity(user_item_matrix.values)
user_similarity_df = pd.DataFrame(user_similarity, 
                                 index=user_item_matrix.index, 
                                 columns=user_item_matrix.index)

# Item-based collaborative filtering  
item_similarity = cosine_similarity(user_item_matrix.T.values)
item_similarity_df = pd.DataFrame(item_similarity,
                                 index=user_item_matrix.columns,
                                 columns=user_item_matrix.columns)

print("✅ Similarity matrices computed")

# 5.3 Hybrid Model dengan Random Forest
print("🌳 Building Hybrid Model with Random Forest...")

# Prepare training data
training_data = full_data.copy()
print("🧾 Training Data Columns:", training_data.columns.tolist())


# Features untuk hybrid model
hybrid_features = user_features + content_features_for_training + ['watch_ratio']
X = training_data[hybrid_features]
y = training_data['feedback_score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf_model.fit(X_train, y_train)

# Evaluate
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"✅ Random Forest MSE: {mse:.4f}")
print(f"✅ Random Forest MAE: {mae:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': hybrid_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n📊 Top 5 Feature Importance:")
print(feature_importance.head())


🤖 Building Recommendation Models...
🎯 Building Content-Based Model...
✅ Content-based features prepared
👥 Building Collaborative Filtering Model...
✅ Similarity matrices computed
🌳 Building Hybrid Model with Random Forest...
🧾 Training Data Columns: ['id_interaksi', 'id_mahasiswa', 'id_konten', 'waktu_akses', 'durasi_tonton', 'feedback', 'device', 'status', 'feedback_score', 'completion_score', 'durasi_x', 'watch_ratio', 'nama', 'jurusan', 'angkatan', 'ipk_terakhir', 'device_preference', 'learning_style', 'goal', 'waktu_belajar_per_hari', 'ketersediaan_belajar', 'judul', 'mata_kuliah', 'platform', 'format', 'durasi_y', 'kesulitan', 'rating_pengguna', 'jurusan_encoded', 'device_preference_encoded', 'learning_style_encoded', 'goal_encoded', 'ketersediaan_belajar_encoded', 'mata_kuliah_encoded', 'platform_encoded', 'format_encoded', 'kesulitan_encoded']
✅ Random Forest MSE: 1.6337
✅ Random Forest MAE: 1.1241

📊 Top 5 Feature Importance:
            feature  importance
5      ipk_terakhir

## 5. MODEL DEVELOPMENT

In [84]:
print("\n🤖 Building Recommendation Models...")

# 5.1 Content-Based Filtering
print("🎯 Building Content-Based Model...")

# Siapkan user profiles dan content profiles
user_profiles = full_data.groupby('id_mahasiswa')[user_features].first()
content_profiles = data_konten.copy()

# Encode content profiles
for feature in ['mata_kuliah', 'platform', 'format', 'kesulitan']:
    if feature in label_encoders:
        content_profiles[f'{feature}_encoded'] = label_encoders[feature].transform(content_profiles[feature].astype(str))

# Normalisasi features
scaler_user = StandardScaler()
scaler_content = StandardScaler()

user_profiles_scaled = scaler_user.fit_transform(user_profiles[user_features])
content_profiles_scaled = scaler_content.fit_transform(content_profiles[content_features_for_profiles])

print("✅ Content-based features prepared")

# 5.2 Collaborative Filtering dengan Cosine Similarity
print("👥 Building Collaborative Filtering Model...")

# User-based collaborative filtering
user_similarity = cosine_similarity(user_item_matrix.values)
user_similarity_df = pd.DataFrame(user_similarity, 
                                 index=user_item_matrix.index, 
                                 columns=user_item_matrix.index)

# Item-based collaborative filtering  
item_similarity = cosine_similarity(user_item_matrix.T.values)
item_similarity_df = pd.DataFrame(item_similarity,
                                 index=user_item_matrix.columns,
                                 columns=user_item_matrix.columns)

print("✅ Similarity matrices computed")

# 5.3 Hybrid Model dengan Random Forest
print("🌳 Building Hybrid Model with Random Forest...")

# Prepare training data
training_data = full_data.copy()

# Features untuk hybrid model
hybrid_features = user_features + content_features_for_training + ['watch_ratio']
X = training_data[hybrid_features]
y = training_data['feedback_score']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, max_depth=10)
rf_model.fit(X_train, y_train)

# Evaluate
y_pred = rf_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f"✅ Random Forest MSE: {mse:.4f}")
print(f"✅ Random Forest MAE: {mae:.4f}")

# Feature importance
feature_importance = pd.DataFrame({
    'feature': hybrid_features,
    'importance': rf_model.feature_importances_
}).sort_values('importance', ascending=False)

print("\n📊 Top 5 Feature Importance:")
print(feature_importance.head())


🤖 Building Recommendation Models...
🎯 Building Content-Based Model...
✅ Content-based features prepared
👥 Building Collaborative Filtering Model...
✅ Similarity matrices computed
🌳 Building Hybrid Model with Random Forest...
✅ Random Forest MSE: 1.6337
✅ Random Forest MAE: 1.1241

📊 Top 5 Feature Importance:
            feature  importance
5      ipk_terakhir    0.180534
12  rating_pengguna    0.140409
13      watch_ratio    0.138158
11         durasi_y    0.129849
0   jurusan_encoded    0.058849


## 6. MODEL EVALUATION

In [85]:
print("\n📈 Model Evaluation...")

def evaluate_recommendations(user_id, top_k=5):
    """Evaluate recommendation quality untuk user tertentu"""
    # Ambil interaksi historis user
    user_history = full_data[full_data['id_mahasiswa'] == user_id]
    
    if len(user_history) == 0:
        return None
        
    # Ground truth: konten dengan rating tinggi (>= 4)
    ground_truth = set(user_history[user_history['feedback_score'] >= 4]['id_konten'].values)
    
    return len(ground_truth)

# Sample evaluation
sample_users = user_item_matrix.index[:10]  
eval_results = []

for user_id in sample_users:
    gt_count = evaluate_recommendations(user_id)
    if gt_count is not None:
        eval_results.append({'user_id': user_id, 'ground_truth_count': gt_count})

eval_df = pd.DataFrame(eval_results)
print(f"✅ Evaluation completed for {len(eval_df)} users")
print(f"📊 Average ground truth items per user: {eval_df['ground_truth_count'].mean():.2f}")


📈 Model Evaluation...
✅ Evaluation completed for 10 users
📊 Average ground truth items per user: 6.40


## 7. SAVE MODELS

In [89]:
print("\n💾 Saving models and encoders...")

models_to_save = {
    "rf_model": rf_model,
    "user_item_matrix": user_item_matrix,
    "user_similarity_df": user_similarity_df,
    "item_similarity_df": item_similarity_df,
    "label_encoders": label_encoders,
    "scaler_user": scaler_user,
    "scaler_content": scaler_content,
    "user_profiles": user_profiles,
    "content_profiles": content_profiles,
    'hybrid_features': hybrid_features,
    'user_features': user_features,
    'content_features': content_features_for_profiles
}

# Save models
for name, model in models_to_save.items():
    with open(f'{name}.pkl', 'wb') as f:
        pickle.dump(model, f)
    print(f"✅ Saved {name}.pkl")

print("\n🎉 Model development completed!")
print("📁 Files created:")
print("   - rf_model.pkl (Random Forest model)")
print("   - user_item_matrix.pkl (User-item interaction matrix)")
print("   - user_similarity_df.pkl (User similarity matrix)")
print("   - item_similarity_df.pkl (Item similarity matrix)")
print("   - label_encoders.pkl (Label encoders)")
print("   - scaler_user.pkl & scaler_content.pkl (Feature scalers)")
print("   - user_profiles.pkl & content_profiles.pkl (Profile data)")


💾 Saving models and encoders...
✅ Saved rf_model.pkl
✅ Saved user_item_matrix.pkl
✅ Saved user_similarity_df.pkl
✅ Saved item_similarity_df.pkl
✅ Saved label_encoders.pkl
✅ Saved scaler_user.pkl
✅ Saved scaler_content.pkl
✅ Saved user_profiles.pkl
✅ Saved content_profiles.pkl
✅ Saved hybrid_features.pkl
✅ Saved user_features.pkl
✅ Saved content_features.pkl

🎉 Model development completed!
📁 Files created:
   - rf_model.pkl (Random Forest model)
   - user_item_matrix.pkl (User-item interaction matrix)
   - user_similarity_df.pkl (User similarity matrix)
   - item_similarity_df.pkl (Item similarity matrix)
   - label_encoders.pkl (Label encoders)
   - scaler_user.pkl & scaler_content.pkl (Feature scalers)
   - user_profiles.pkl & content_profiles.pkl (Profile data)


## 8. QUICK TEST

In [90]:
print("\n🧪 Quick Test...")

def quick_recommend_test(user_id, top_k=5):
    """Test rekomendasi untuk user tertentu"""
    if user_id not in user_item_matrix.index:
        print(f"❌ User {user_id} tidak ditemukan")
        return
        
    # Ambil user profile
    user_profile = user_profiles.loc[user_id]
    
    # Hybrid approach: gabung collaborative + content-based
    user_ratings = user_item_matrix.loc[user_id]
    unrated_items = user_ratings[user_ratings == 0].index
    
    if len(unrated_items) == 0:
        print(f"✨ User {user_id} sudah menilai semua konten")
        return
        
    # Simple collaborative filtering prediction
    similar_users = user_similarity_df.loc[user_id].nlargest(10).index[1:]  # Exclude self
    
    recommendations = []
    for item in unrated_items[:20]:  # Sample untuk test
        # Collaborative score
        collab_score = 0
        count = 0
        for sim_user in similar_users:
            if user_item_matrix.loc[sim_user, item] > 0:
                collab_score += user_item_matrix.loc[sim_user, item]
                count += 1
        
        if count > 0:
            collab_score /= count
            recommendations.append((item, collab_score))
    
    # Sort dan ambil top-k
    recommendations.sort(key=lambda x: x[1], reverse=True)
    top_recommendations = recommendations[:top_k]
    
    print(f"🎯 Top {top_k} recommendations for {user_id}:")
    for i, (item_id, score) in enumerate(top_recommendations, 1):
        item_info = data_konten[data_konten['id_konten'] == item_id].iloc[0]
        print(f"   {i}. {item_info['judul'][:50]}... (Score: {score:.3f})")

# Test dengan user pertama
if len(user_item_matrix.index) > 0:
    test_user = user_item_matrix.index[0]
    quick_recommend_test(test_user)

print("\n✅ Notebook execution completed! Ready for production deployment.")


🧪 Quick Test...
🎯 Top 5 recommendations for MHS001:
   1. Behavior reality issue last mother apply... (Score: 5.000)
   2. He candidate investment out natural... (Score: 2.000)
   3. Employee result manager... (Score: 2.000)
   4. Finish land never can... (Score: 2.000)
   5. Whole local begin source cultural marriage item... (Score: 2.000)

✅ Notebook execution completed! Ready for production deployment.
