<a href="https://colab.research.google.com/github/armaanranjan/Course_Compass/blob/main/Course_recommender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Step 1: Load and Clean the Dataset
def load_and_clean_data(file_path):
    """Load the course CSV and return a cleaned DataFrame.

    The CSV must contain the columns 'Course Title', 'Rating', 'Level' and
    'Duration to complete (Approx.)'; any other columns are discarded.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``) of
            the course dataset.

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns
        'course_title', 'rating', 'level', 'duration_hours' and
        'course_idx'. Rows in which any of these values is missing or
        unparseable are dropped.

    Raises:
        KeyError: If one of the required columns is absent from the CSV.
    """
    raw = pd.read_csv(file_path)

    # Work on an explicit copy: assigning columns on a frame obtained by
    # column selection can otherwise trigger SettingWithCopyWarning.
    df = raw[['Course Title', 'Rating', 'Level', 'Duration to complete (Approx.)']].copy()
    df.columns = ['course_title', 'rating', 'level', 'duration_hours']

    # Non-numeric ratings become NaN and are removed by dropna() below.
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')

    # Keep the first number found in the duration text. The cast to str lets
    # the .str accessor work even when pandas parsed the column as numeric,
    # and the optional fractional part stops "3.5" being truncated to 3.
    df['duration_hours'] = (
        df['duration_hours']
        .astype(str)
        .str.extract(r'(\d+(?:\.\d+)?)', expand=False)
        .astype(float)
    )

    # 1-based identifier assigned BEFORE incomplete rows are dropped, so the
    # ids of the surviving rows may contain gaps.
    df['course_idx'] = range(1, len(df) + 1)

    return df.dropna().reset_index(drop=True)

# Step 2: Define the Course Recommender Class

class CourseRecommender:
    """Recommend courses by keyword search, title similarity and neighbours.

    The frame passed in is expected to carry the columns 'course_title',
    'rating', 'level' and 'course_idx'.

    Attributes set by ``__init__``:
        df: The course frame passed in.
        tfidf / tfidf_matrix: TF-IDF vectoriser fitted on the course titles
            and the resulting matrix.
        content_similarity: Dense pairwise cosine similarity of the titles.
        course_rating_matrix: Sparse matrix with a single 1 per course row.
        cf_model: Brute-force cosine ``NearestNeighbors`` fitted on it.
    """

    def __init__(self, cleaned_df):
        """Fit the TF-IDF and nearest-neighbour models on ``cleaned_df``."""
        self.df = cleaned_df
        self.tfidf = TfidfVectorizer(stop_words='english')

        # Content-based part: TF-IDF over titles, then pairwise similarity.
        self.tfidf_matrix = self.tfidf.fit_transform(self.df['course_title'])
        self.content_similarity = cosine_similarity(self.tfidf_matrix)

        # "Collaborative" part: each course row gets a single 1 in the
        # column given by its course_idx.
        # NOTE(review): no user/interaction data is involved, so with unique
        # course_idx values every pair of distinct courses is at the same
        # cosine distance and the neighbour order carries no signal. Replace
        # with a real user-item matrix when such data is available.
        self.course_rating_matrix = csr_matrix((
            np.ones(len(self.df)),
            (
                np.arange(len(self.df)),
                self.df['course_idx'].values,
            ),
        ))

        self.cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
        self.cf_model.fit(self.course_rating_matrix)

    def _row_position(self, course_idx):
        """Return the positional row of ``course_idx`` in ``self.df``.

        A position (not an index label) is returned because callers use it
        with ``iloc`` and to index the similarity/rating matrices.

        Raises:
            IndexError: If no course has the given ``course_idx``.
        """
        positions = np.flatnonzero(self.df['course_idx'].values == course_idx)
        if positions.size == 0:
            raise IndexError(f"course_idx {course_idx!r} not found in the dataset")
        return int(positions[0])

    @staticmethod
    def _rank(frame, level):
        """Sort ``frame`` by rating, putting preferred-level courses first.

        When ``level`` is truthy a 0/1 'level_match' column is (re)computed.
        The match is a case-insensitive substring test, so 'Beginner' matches
        a course whose level reads 'Beginner level'.
        """
        if not level:
            return frame.sort_values('rating', ascending=False)

        level_match = (
            frame['level']
            .astype(str)
            .str.contains(str(level), case=False, regex=False)
            .astype(int)
        )
        # assign() returns a new frame, so the caller's frame is not mutated.
        return frame.assign(level_match=level_match).sort_values(
            by=['level_match', 'rating'],
            ascending=[False, False],
        )

    def search_courses(self, keyword, level=None, top_n=10):
        """
        Search courses whose title contains ``keyword`` (case-insensitive).

        The keyword is matched literally, not as a regular expression, so
        keywords such as 'c++' or '.net' behave as plain text.

        Args:
            keyword: Text to look for in the course title.
            level: Optional preferred level; matching courses are listed
                first and a 'level_match' column is added to the result.
            top_n: Maximum number of rows to return.

        Returns:
            Up to ``top_n`` matching rows sorted by (level match,) rating,
            best first. When nothing matches, an empty frame that still has
            the columns of ``self.df``.
        """
        needle = keyword.lower()
        mask = self.df['course_title'].str.lower().str.contains(
            needle, regex=False, na=False
        )
        matching_courses = self.df[mask].copy()

        if matching_courses.empty:
            # Keep the columns so callers can still select them safely.
            return matching_courses

        return self._rank(matching_courses, level).head(top_n)

    def get_content_based_recommendations(self, course_idx, top_n=5):
        """
        Get the ``top_n`` courses whose titles are most similar to a course.

        Args:
            course_idx: 'course_idx' value of the reference course.
            top_n: Maximum number of recommendations.

        Returns:
            Rows of ``self.df`` ordered by decreasing title similarity; the
            reference course itself is never included.

        Raises:
            IndexError: If ``course_idx`` is not present in the dataset.
        """
        course_row = self._row_position(course_idx)
        scores = np.asarray(self.content_similarity[course_row])

        # Stable sort keeps the original row order among equal scores.
        order = np.argsort(-scores, kind='stable')
        # Drop the reference row explicitly: a course with an identical title
        # ties at similarity 1.0 and may sort ahead of it, so "skip the first
        # entry" is not reliable.
        order = order[order != course_row][:top_n]
        return self.df.iloc[order]

    def get_collaborative_recommendations(self, course_idx, top_n=5):
        """
        Get the ``top_n`` nearest neighbours of a course from ``cf_model``.

        Args:
            course_idx: 'course_idx' value of the reference course.
            top_n: Maximum number of recommendations.

        Returns:
            Rows of ``self.df`` for the neighbours, excluding the reference
            course itself.

        Raises:
            IndexError: If ``course_idx`` is not present in the dataset.
        """
        course_row = self._row_position(course_idx)
        # kneighbors rejects requests for more neighbours than fitted rows.
        n_neighbors = min(top_n + 1, len(self.df))
        _, indices = self.cf_model.kneighbors(
            self.course_rating_matrix[course_row].toarray().reshape(1, -1),
            n_neighbors=n_neighbors,
        )

        neighbours = indices.flatten()
        neighbours = neighbours[neighbours != course_row][:top_n]
        return self.df.iloc[neighbours]

    def get_hybrid_recommendations(self, keyword, level=None, course_idx=None, top_n=10):
        """
        Combine keyword search with content-based and neighbour results.

        Args:
            keyword: Text to look for in the course title.
            level: Optional preferred level (see ``search_courses``).
            course_idx: Optional reference course; when omitted only the
                keyword search results are returned.
            top_n: Maximum number of rows requested from each source and
                returned overall.

        Returns:
            Up to ``top_n`` de-duplicated rows sorted by (level match,)
            rating, best first.

        Raises:
            IndexError: If ``course_idx`` is given but not in the dataset.
        """
        keyword_results = self.search_courses(keyword, level, top_n=top_n)

        if course_idx is None:
            return keyword_results

        candidates = [
            keyword_results,
            self.get_content_based_recommendations(course_idx, top_n=top_n),
            self.get_collaborative_recommendations(course_idx, top_n=top_n),
        ]
        # Skip empty frames; concatenating them only adds dtype noise.
        candidates = [frame for frame in candidates if not frame.empty]
        if not candidates:
            return keyword_results

        combined = pd.concat(candidates).drop_duplicates(subset=['course_idx'])
        return self._rank(combined, level).head(top_n)


# Step 3: Load the dataset, build the recommender and run the demo queries
file_path = '/content/CourseraDataset-Clean.csv'  # Update this path as needed
cleaned_df = load_and_clean_data(file_path)
recommender = CourseRecommender(cleaned_df)

# Columns shown for every result table below.
display_columns = ['course_title', 'level', 'rating', 'duration_hours']

# Example 1: Search for courses with a specific keyword
print("Searching for 'python' courses:")
python_courses = recommender.search_courses('python')
print(python_courses[display_columns])

# Example 2: Search for courses with a specific keyword and level preference
print("\nSearching for 'python' courses (Beginner level preferred):")
python_beginner = recommender.search_courses('python', level='Beginner')
print(python_beginner[display_columns])

# Example 3: Get content-based recommendations for a specific course index
course_idx = 1  # Change this to a valid course index from your dataset
print(f"\nContent-based recommendations for course index {course_idx}:")
content_recommendations = recommender.get_content_based_recommendations(course_idx)
print(content_recommendations[display_columns])

# Example 4: Get collaborative recommendations for a specific course index
print(f"\nCollaborative recommendations for course index {course_idx}:")
collab_recommendations = recommender.get_collaborative_recommendations(course_idx)
print(collab_recommendations[display_columns])

# Example 5: Get hybrid recommendations combining keyword search and course index
# The banner uses course_idx so it stays correct when the value above changes.
print(f"\nHybrid recommendations for 'python' with course index {course_idx}:")
hybrid_recommendations = recommender.get_hybrid_recommendations(
    keyword='python',
    level='Beginner',
    course_idx=course_idx,
)
print(hybrid_recommendations[display_columns])


Searching for 'python' courses:
                                           course_title           level  \
2818                             Python Data Structures  Beginner level   
5202  用 Python 做商管程式設計（一）(Programming for Business C...  Beginner level   
3166          Python Functions, Files, and Dictionaries  Beginner level   
5376  Introdução à Ciência da Computação com Python ...  Beginner level   
3340  Introdução à Ciência da Computação com Python ...  Beginner level   
1495  Introdução à Ciência da Computação com Python ...  Beginner level   
1486                             Python Data Structures  Beginner level   
5362  Introdução à Ciência da Computação com Python ...  Beginner level   
2357  Design Computing: 3D Modeling in Rhinoceros wi...  Beginner level   
2007          Python Functions, Files, and Dictionaries  Beginner level   

      rating  duration_hours  
2818     4.9            19.0  
5202     4.9            13.0  
3166     4.9            31.0  
5376     4.9      