<a href="https://colab.research.google.com/github/armaanranjan/Course_Compass/blob/main/Course_Recomender.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import re

class CourseDataCleaner:
    """Normalize a raw course table into a fixed five-column layout.

    ``clean_dataset`` produces the columns ``course_idx``, ``course_title``,
    ``rating``, ``level`` and ``duration_hours``; ``get_summary_stats``
    reports on the most recently cleaned table.
    """

    # One "<number> <unit>" component of a duration, e.g. "2 hours".
    _DURATION_PART = re.compile(r'(\d+\.?\d*)\s*(hour|minute)')

    # Checked in order; the first group with a keyword found in the text wins.
    _LEVEL_KEYWORDS = (
        (('beginner', 'basic', 'fundamental'), 'Beginner'),
        (('intermediate', 'medium'), 'Intermediate'),
        (('advanced', 'expert'), 'Advanced'),
        (('all level', 'all-level'), 'All Levels'),
    )

    def __init__(self):
        # Kept for backward compatibility: until clean_dataset() runs, ``df``
        # holds this path string rather than a DataFrame. Nothing in this
        # class reads the file.
        self.df = '/content/CourseraDataset-Clean.csv'

    def clean_duration(self, duration):
        """Convert a duration string to hours, rounded to one decimal.

        Every "<number> hour(s)" / "<number> minute(s)" component is summed,
        so "2 hours 30 minutes" yields 2.5. If no such pair is present, the
        first number in the text is used with whichever unit word appears.

        Returns:
        float hours, or None when the value is missing or has no
        hour/minute unit.
        """
        try:
            if pd.isna(duration):
                return None
        except (TypeError, ValueError):
            # pd.isna() on list-like input yields an array whose truth value
            # is ambiguous; treat such values as unparseable.
            return None

        text = str(duration).lower()

        parts = self._DURATION_PART.findall(text)
        if parts:
            hours = sum(
                float(value) / 60 if unit == 'minute' else float(value)
                for value, unit in parts
            )
            return round(hours, 1)

        # Fallback for layouts where number and unit are not adjacent.
        numbers = re.findall(r'\d+\.?\d*', text)
        if not numbers:
            return None

        number = float(numbers[0])
        if 'minute' in text:
            return round(number / 60, 1)
        if 'hour' in text:
            return round(number, 1)
        return None

    def clean_level(self, level):
        """Map a free-text level onto 'Beginner', 'Intermediate', 'Advanced',
        'All Levels' or 'Not Specified' (missing or unrecognized values)."""
        if pd.isna(level):
            return 'Not Specified'

        text = str(level).lower().strip()
        for keywords, label in self._LEVEL_KEYWORDS:
            if any(word in text for word in keywords):
                return label
        return 'Not Specified'

    def clean_rating(self, rating):
        """Return the rating as a float rounded to one decimal, or None when
        it is not numeric or lies outside the 0-5 range."""
        try:
            value = float(rating)
        except (TypeError, ValueError, OverflowError):
            return None

        # NaN fails both comparisons and therefore also maps to None.
        if 0 <= value <= 5:
            return round(value, 1)
        return None

    def clean_course_title(self, title):
        """Replace special characters with spaces and collapse whitespace.

        Returns None for missing titles and for titles that are empty after
        cleaning, so that clean_dataset() drops them as "title required".
        """
        if pd.isna(title):
            return None

        # Keep word characters, whitespace and hyphens only.
        cleaned = re.sub(r'[^\w\s-]', ' ', str(title))
        cleaned = ' '.join(cleaned.split())
        return cleaned or None

    def clean_dataset(self, df):
        """
        Clean the dataset with renamed columns

        Parameters:
        df: DataFrame with columns ['course_title', 'rating', 'level', 'duration']
            (the legacy names 'title', 'Duration to complete (Approx.)' and
            'course_id' are renamed automatically)

        Returns:
        Cleaned DataFrame with columns
        ['course_idx', 'course_title', 'rating', 'level', 'duration_hours'].
        The result is also stored on ``self.df``; the input is not modified.
        """
        self.df = df.copy()

        # Legacy column names -> standardized names.
        column_mapping = {
            'title': 'course_title',
            'Duration to complete (Approx.)': 'duration',
            'course_id': 'course_idx'
        }

        # Skip a rename whose target already exists: two columns with the
        # same name would make the per-column cleaning below ambiguous.
        renames = {
            old_col: new_col
            for old_col, new_col in column_mapping.items()
            if old_col in self.df.columns and new_col not in self.df.columns
        }
        self.df = self.df.rename(columns=renames)

        # Clean individual columns
        self.df['course_title'] = self.df['course_title'].apply(self.clean_course_title)
        self.df['rating'] = self.df['rating'].apply(self.clean_rating)
        self.df['level'] = self.df['level'].apply(self.clean_level)
        self.df['duration_hours'] = self.df['duration'].apply(self.clean_duration)

        # Course title is required
        self.df = self.df.dropna(subset=['course_title'])

        # Impute remaining gaps: mean rating, median duration.
        self.df['rating'] = self.df['rating'].fillna(self.df['rating'].mean())
        self.df['duration_hours'] = self.df['duration_hours'].fillna(self.df['duration_hours'].median())

        # The raw text column is superseded by duration_hours.
        self.df = self.df.drop('duration', axis=1)

        # Number the surviving rows 1..n when no identifier column was given.
        if 'course_idx' not in self.df.columns:
            self.df['course_idx'] = range(1, len(self.df) + 1)

        column_order = ['course_idx', 'course_title', 'rating', 'level', 'duration_hours']
        self.df = self.df[column_order]

        return self.df

    def get_summary_stats(self):
        """Get summary statistics of the cleaned dataset.

        Returns:
        dict with total_courses, average_rating, average_duration_hours,
        level_distribution and rating_distribution (5 equal-width bins), or
        an explanatory string when clean_dataset() has not been run yet.
        """
        # ``df`` is a path string (not None) before the first clean, so test
        # for an actual DataFrame instead of comparing against None.
        if not isinstance(self.df, pd.DataFrame):
            return "No dataset has been cleaned yet."

        stats = {
            'total_courses': len(self.df),
            'average_rating': round(self.df['rating'].mean(), 2),
            'average_duration_hours': round(self.df['duration_hours'].mean(), 2),
            'level_distribution': self.df['level'].value_counts().to_dict(),
            'rating_distribution': self.df['rating'].value_counts(bins=5).sort_index().to_dict()
        }

        return stats

# Example usage with sample data
def create_sample_data(n_rows=100):
    """Build a reproducible synthetic course table.

    The global NumPy RNG is seeded with 42, so repeated calls return the same
    frame. Columns: 'course_title', 'rating', 'level', 'duration'.
    """
    np.random.seed(42)

    # Title suffix cycles with the row number: 0 -> Basic, 1 -> Advanced,
    # 2 -> Intermediate.
    suffixes = ('Basic', 'Advanced', 'Intermediate')
    titles = [f"Course {i} - {suffixes[i % 3]}" for i in range(n_rows)]

    # Random draws happen in this order: ratings, levels, then one integer
    # per duration string.
    ratings = np.random.uniform(3.0, 5.0, n_rows)
    levels = np.random.choice(
        ['Beginner', 'Intermediate', 'Advanced', 'All Levels'],
        n_rows
    )

    durations = []
    for i in range(n_rows):
        # Even rows are expressed in hours, odd rows in minutes.
        unit = 'minutes' if i % 2 else 'hours'
        durations.append(f"{np.random.randint(1, 100)} {unit}")

    return pd.DataFrame({
        'course_title': titles,
        'rating': ratings,
        'level': levels,
        'duration': durations,
    })

# Example usage
if __name__ == "__main__":
    # Build the synthetic table and run it through the cleaner. The names
    # below stay at module level so later notebook cells can reuse them.
    sample_df = create_sample_data()
    cleaner = CourseDataCleaner()
    cleaned_df = cleaner.clean_dataset(sample_df)

    # Print summary statistics
    print("\nDataset Summary:")
    print(cleaner.get_summary_stats())

    # Display first few rows of cleaned data
    print("\nCleaned Dataset Sample:")
    print(cleaned_df.head())

    # Display data info. DataFrame.info() writes its report to stdout and
    # returns None, so it must not be wrapped in print() (that would append
    # a stray "None" line).
    print("\nDataset Info:")
    cleaned_df.info()


Dataset Summary:
{'total_courses': 100, 'average_rating': 3.94, 'average_duration_hours': 26.25, 'level_distribution': {'Beginner': 28, 'All Levels': 27, 'Advanced': 23, 'Intermediate': 22}, 'rating_distribution': {Interval(2.9970000000000003, 3.4, closed='right'): 29, Interval(3.4, 3.8, closed='right'): 17, Interval(3.8, 4.2, closed='right'): 21, Interval(4.2, 4.6, closed='right'): 18, Interval(4.6, 5.0, closed='right'): 15}}

Cleaned Dataset Sample:
   course_idx             course_title  rating       level  duration_hours
0           1         Course 0 - Basic     3.7    Advanced            97.0
1           2      Course 1 - Advanced     4.9  All Levels             0.0
2           3  Course 2 - Intermediate     4.5    Advanced            19.0
3           4         Course 3 - Basic     4.2    Beginner             0.0
4           5      Course 4 - Advanced     3.3  All Levels            53.0

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data co

In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from collections import defaultdict

class CourseRecommender:
    """Recommend courses from TF-IDF title similarity and rating proximity.

    Courses are addressed by row *position* (0-based, as in ``DataFrame.iloc``)
    in ``get_similar_courses`` and ``collaborative_filtering``; returned
    DataFrames keep the index labels of the input frame.
    """

    def __init__(self, cleaned_df):
        """
        Initialize the recommender system with cleaned course data

        Parameters:
        cleaned_df: pandas DataFrame with columns 'course_title', 'rating'
                    and a level column named either 'course_level' or 'level'
        """
        self.df = cleaned_df

        # Accept both spellings of the level column; 'course_level' wins when
        # both are present.
        self.level_col = 'course_level' if 'course_level' in self.df.columns else 'level'

        self.tfidf = TfidfVectorizer(stop_words='english')

        # Create TF-IDF matrix for course titles (astype(str) guards against
        # non-string cells, which the vectorizer cannot tokenize)
        self.tfidf_matrix = self.tfidf.fit_transform(self.df['course_title'].astype(str))

        # Dense pairwise title similarity, indexed by row position
        self.similarity_matrix = cosine_similarity(self.tfidf_matrix)

        # Row positions grouped by level
        self.level_indices = defaultdict(list)
        for idx, level in enumerate(self.df[self.level_col]):
            self.level_indices[level].append(idx)

    def _position(self, course_idx):
        """Resolve a (possibly negative) row position to a non-negative int."""
        return int(np.arange(len(self.df))[course_idx])

    def get_similar_courses(self, course_idx, n=5):
        """Get similar courses based on content similarity

        Parameters:
        course_idx: int - Row position of the reference course
        n: int - Maximum number of courses to return

        Returns:
        Array of row positions, most similar first, never containing
        course_idx itself
        """
        pos = self._position(course_idx)
        scores = np.asarray(self.similarity_matrix[pos])
        ranked = np.argsort(-scores, kind='stable')
        # Drop the reference course explicitly: it is not guaranteed to rank
        # first (duplicate titles tie at 1.0, and a title made only of stop
        # words has self-similarity 0).
        return ranked[ranked != pos][:n]

    def _collab_positions(self, pos, n):
        """Row positions of the n courses whose rating is closest to row pos."""
        ratings = self.df['rating'].to_numpy(dtype=float)
        gaps = np.abs(ratings - ratings[pos])
        others = np.delete(np.arange(len(ratings)), pos)
        return others[np.argsort(gaps[others], kind='stable')][:n]

    def collaborative_filtering(self, course_idx, n=5):
        """
        Rank other courses by how close their rating is to the given course

        Parameters:
        course_idx: int - Row position of the reference course
        n: int - Maximum number of courses to return

        Returns:
        Index of DataFrame labels, closest rating first, never containing
        the reference course itself (even when other courses tie with it)
        """
        pos = self._position(course_idx)
        return self.df.index[self._collab_positions(pos, n)]

    def _rank_courses(self, keyword, preferred_level=None):
        """Score every course against keyword and return them best first.

        The returned frame has columns 'index' (row position), 'similarity',
        'rating', 'level' and 'combined_score'. With preferred_level set,
        courses of that level come first; each group is ordered by score.
        """
        # Convert keyword to TF-IDF vector
        keyword_vector = self.tfidf.transform([keyword])

        # Calculate similarity with all courses
        similarities = cosine_similarity(keyword_vector, self.tfidf_matrix)[0]

        results = pd.DataFrame({
            'index': np.arange(len(similarities)),
            'similarity': similarities,
            'rating': self.df['rating'].values,
            'level': self.df[self.level_col].values
        })

        # Calculate combined score (similarity * rating)
        results['combined_score'] = results['similarity'] * results['rating']
        results = results.sort_values('combined_score', ascending=False, kind='stable')

        if preferred_level:
            is_preferred = results['level'] == preferred_level
            results = pd.concat([results[is_preferred], results[~is_preferred]])

        return results

    def _materialize(self, ranked):
        """Turn rows of a _rank_courses() frame into course rows with scores."""
        recommendations = self.df.iloc[ranked['index'].to_numpy()].copy()
        recommendations['similarity_score'] = ranked['similarity'].to_numpy()
        recommendations['combined_score'] = ranked['combined_score'].to_numpy()
        return recommendations

    def keyword_search(self, keyword, preferred_level=None, n=10):
        """
        Search for courses based on keyword and optional level preference

        Parameters:
        keyword: str - Search term
        preferred_level: str - Preferred course level (optional)
        n: int - Number of recommendations to return

        Returns:
        DataFrame with recommended courses plus 'similarity_score' and
        'combined_score' columns. Courses of the preferred level are listed
        first; within each group rows are ordered by combined score.
        Courses with zero similarity are not filtered out.
        """
        # No final re-sort by score here: it would discard the level-first
        # ordering established by _rank_courses().
        return self._materialize(self._rank_courses(keyword, preferred_level).head(n))

    def get_recommendations(self, query, preferred_level=None, n=10):
        """
        Main recommendation function combining all approaches

        Parameters:
        query: str - Search query
        preferred_level: str - Preferred course level (optional)
        n: int - Number of recommendations

        Returns:
        DataFrame with recommended courses. Rows that come from the
        similarity/rating expansion rather than the keyword search have NaN
        in 'similarity_score' and 'combined_score'.
        """
        # Get initial recommendations based on keyword search
        ranked = self._rank_courses(query, preferred_level)
        recommendations = self._materialize(ranked.head(n))

        # For the top result, get similar courses based on content
        if len(recommendations) > 0:
            # Use the row position of the best hit, not its index label, so
            # the lookup stays correct for frames without a 0..n-1 index.
            top_pos = int(ranked['index'].iloc[0])
            similar_positions = self.get_similar_courses(top_pos, n=5)

            # Add rating-proximity recommendations
            collab_positions = self._collab_positions(top_pos, 5)

            # Combine all recommendations
            extra_positions = np.concatenate([similar_positions, collab_positions])
            additional_df = self.df.iloc[extra_positions].copy()

            # If preferred level is specified, prioritize those courses
            if preferred_level:
                level_mask = additional_df[self.level_col] == preferred_level
                additional_df = pd.concat([
                    additional_df[level_mask],
                    additional_df[~level_mask]
                ])

            # Combine and remove duplicates.
            # NOTE(review): the keyword search already yields up to n rows,
            # so the extra courses only survive head(n) when the table has
            # fewer than n courses or duplicate titles were dropped.
            recommendations = pd.concat([recommendations, additional_df]).drop_duplicates(
                subset=['course_title']
            ).head(n)

        return recommendations

# Example usage
def demonstrate_recommender(cleaned_df):
    """
    Run two sample 'python' queries and print the results.

    The first query has no level preference, the second prefers 'Beginner'.
    """
    engine = CourseRecommender(cleaned_df)

    # Same column selection for both printouts.
    shown_columns = ['course_title', 'rating', 'course_level', 'duration', 'combined_score']

    print("Search for 'python' courses (all levels):")
    any_level = engine.get_recommendations('python')
    print(any_level[shown_columns])

    print("\nSearch for 'python' courses (Beginner level):")
    beginner_first = engine.get_recommendations('python', preferred_level='Beginner')
    print(beginner_first[shown_columns])

In [43]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

class CourseRecommender:
    """Hybrid course recommender: keyword search, TF-IDF title similarity and
    a nearest-neighbour model over a course/identifier matrix.

    Courses are addressed by their 'course_idx' value in the public methods.
    """

    def __init__(self, cleaned_df):
        """
        Parameters:
        cleaned_df: DataFrame with columns 'course_idx' (non-negative
                    integers), 'course_title', 'rating' and 'level'
        """
        self.df = cleaned_df
        self.tfidf = TfidfVectorizer(stop_words='english')

        # Create TF-IDF matrix for content-based filtering
        self.tfidf_matrix = self.tfidf.fit_transform(self.df['course_title'])
        self.content_similarity = cosine_similarity(self.tfidf_matrix)

        # One row per course with a single 1 in the column given by its
        # course_idx.
        # NOTE(review): with unique course_idx values every pair of distinct
        # rows is orthogonal, so the neighbour ranking carries no signal
        # until real per-user data is used here.
        self.course_rating_matrix = csr_matrix((
            np.ones(len(self.df)),
            (
                np.arange(len(self.df)),  # Course row positions
                self.df['course_idx'].values
            )
        ))

        # Initialize collaborative filtering model
        self.cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
        self.cf_model.fit(self.course_rating_matrix)

    def _row_position(self, course_idx):
        """Return the row position of the first course with this course_idx.

        Raises:
        IndexError when no course carries that identifier
        """
        positions = np.flatnonzero((self.df['course_idx'] == course_idx).to_numpy())
        if positions.size == 0:
            raise IndexError(f"Course index {course_idx} not found")
        return int(positions[0])

    def search_courses(self, keyword, level=None, top_n=10):
        """
        Search courses based on keyword and level preference

        Parameters:
        keyword: str - Text that must occur in the title (case-insensitive,
                 matched literally, not as a regular expression)
        level: str - Preferred level; matching courses are listed first
        top_n: int - Maximum number of rows returned

        Returns:
        DataFrame of matching courses, best rated first. When a level is
        given the result carries an extra 0/1 'level_match' column. With no
        match, an empty DataFrame with the columns of the course table.
        """
        keyword = keyword.lower()

        # regex=False: keywords such as 'c++' are plain text, not patterns.
        # na=False: rows with a missing title simply do not match.
        mask = self.df['course_title'].str.lower().str.contains(keyword, regex=False, na=False)
        matching_courses = self.df[mask].copy()

        if matching_courses.empty:
            # Keep the column layout so callers can still select columns.
            return pd.DataFrame(columns=self.df.columns)

        if level:
            # Put preferred level courses first
            matching_courses['level_match'] = (matching_courses['level'] == level).astype(int)
            matching_courses = matching_courses.sort_values(
                by=['level_match', 'rating'],
                ascending=[False, False]
            )
        else:
            # Sort by rating only
            matching_courses = matching_courses.sort_values('rating', ascending=False)

        return matching_courses.head(top_n)

    def get_content_based_recommendations(self, course_idx, top_n=5):
        """
        Get content-based recommendations based on course similarity

        Returns:
        Up to top_n rows most similar in title to the given course, never
        including the course itself

        Raises:
        IndexError when course_idx is unknown
        """
        course_row = self._row_position(course_idx)
        scores = np.asarray(self.content_similarity[course_row])

        ranked = np.argsort(-scores, kind='stable')
        # Exclude the input course by position; it need not rank first when
        # titles are duplicated or vectorize to all zeros.
        ranked = ranked[ranked != course_row][:top_n]
        return self.df.iloc[ranked]

    def get_collaborative_recommendations(self, course_idx, top_n=5):
        """
        Get nearest-neighbour recommendations from the course matrix

        Returns:
        Up to top_n neighbouring rows, never including the course itself

        Raises:
        IndexError when course_idx is unknown
        """
        course_row = self._row_position(course_idx)
        # kneighbors() rejects requests for more neighbours than fitted rows.
        n_neighbors = min(top_n + 1, len(self.df))
        distances, indices = self.cf_model.kneighbors(
            self.course_rating_matrix[course_row].toarray().reshape(1, -1),
            n_neighbors=n_neighbors
        )

        # Exclude the input course
        neighbors = indices.flatten()
        neighbors = neighbors[neighbors != course_row][:top_n]
        return self.df.iloc[neighbors]

    def get_hybrid_recommendations(self, keyword, level=None, course_idx=None, top_n=10):
        """
        Get hybrid recommendations combining keyword search, content-based, and collaborative filtering

        Without course_idx only the keyword search result is returned. With
        it, the three result sets are merged, de-duplicated on 'course_idx'
        and re-sorted (preferred level first when given, then rating).
        """
        # Get initial keyword-based results
        keyword_results = self.search_courses(keyword, level, top_n=top_n)

        if course_idx is not None:
            # Get content-based and collaborative recommendations
            content_recommendations = self.get_content_based_recommendations(course_idx, top_n=top_n)
            collab_recommendations = self.get_collaborative_recommendations(course_idx, top_n=top_n)

            # Combine all recommendations
            all_recommendations = pd.concat([
                keyword_results,
                content_recommendations,
                collab_recommendations
            ]).drop_duplicates(subset=['course_idx'])

            # Sort by level preference if specified
            if level:
                all_recommendations['level_match'] = (all_recommendations['level'] == level).astype(int)
                all_recommendations = all_recommendations.sort_values(
                    by=['level_match', 'rating'],
                    ascending=[False, False]
                )
            else:
                all_recommendations = all_recommendations.sort_values('rating', ascending=False)

            return all_recommendations.head(top_n)

        return keyword_results

# Example usage
def demonstrate_recommender(cleaned_df):
    """Print three sample queries: plain search, level-preferring search and
    hybrid recommendations anchored on course_idx=1."""
    engine = CourseRecommender(cleaned_df)
    shown_columns = ['course_title', 'level', 'rating']

    # Example 1: keyword only
    print("Searching for 'python' courses:")
    plain_hits = engine.search_courses('python')
    print(plain_hits[shown_columns])

    # Example 2: keyword plus preferred level
    print("\nSearching for 'python' courses (Beginner level preferred):")
    beginner_hits = engine.search_courses('python', level='Beginner')
    print(beginner_hits[shown_columns])

    # Example 3: keyword, level and a reference course combined
    print("\nHybrid recommendations for 'python' with course_idx=1:")
    hybrid_hits = engine.get_hybrid_recommendations(
        keyword='python',
        level='Beginner',
        course_idx=1
    )
    print(hybrid_hits[shown_columns])

# Run the demonstration
# demonstrate_recommender(cleaned_df)

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, precision_score, recall_score, ndcg_score
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

class CourseRecommender:
    """Keyword search and TF-IDF title similarity over a cleaned course table.

    The table is copied and re-indexed 0..n-1 on construction, so the
    caller's DataFrame is never modified.
    """

    def __init__(self, cleaned_df):
        """
        Parameters:
        cleaned_df: non-empty DataFrame with columns
                    ['course_idx', 'course_title', 'rating', 'level', 'duration_hours']

        Raises:
        ValueError when the frame is empty, a required column is missing, or
        building the similarity structures fails
        """
        if cleaned_df.empty:
            raise ValueError("The input DataFrame is empty")

        self.df = cleaned_df.copy()
        self.df = self.df.reset_index(drop=True)  # Reset index to avoid any gaps

        # Validate required columns
        required_columns = ['course_idx', 'course_title', 'rating', 'level', 'duration_hours']
        missing_columns = [col for col in required_columns if col not in self.df.columns]
        if missing_columns:
            raise ValueError(f"Missing required columns: {missing_columns}")

        # Initialize TF-IDF
        self.tfidf = TfidfVectorizer(stop_words='english')

        try:
            # Create TF-IDF matrix for content-based filtering
            self.tfidf_matrix = self.tfidf.fit_transform(self.df['course_title'].astype(str))
            self.content_similarity = cosine_similarity(self.tfidf_matrix)

            # One row per course with a single 1 in the column given by its
            # course_idx
            self.course_rating_matrix = csr_matrix((
                np.ones(len(self.df)),
                (
                    np.arange(len(self.df)),
                    self.df['course_idx'].values
                )
            ))

            # Initialize collaborative filtering model
            self.cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
            self.cf_model.fit(self.course_rating_matrix)

        except Exception as e:
            # Chain the cause so the original traceback stays visible.
            raise ValueError(f"Error initializing recommender: {str(e)}") from e

    def search_courses(self, keyword, level=None, top_n=10):
        """
        Search courses based on keyword and level preference

        Parameters:
        keyword: str - Text that must occur in the title (case-insensitive,
                 matched literally, not as a regular expression)
        level: str - Preferred level; matching courses are listed first
        top_n: int - Maximum number of rows returned

        Returns:
        DataFrame of matching courses, best rated first (with an extra 0/1
        'level_match' column when a level is given), or an empty DataFrame
        with the table's columns when nothing matches

        Raises:
        ValueError when keyword is not a string
        """
        if not isinstance(keyword, str):
            raise ValueError("Keyword must be a string")

        # Convert keyword to lowercase for case-insensitive search
        keyword = keyword.lower()

        # regex=False: keywords such as 'c++' are plain text, not patterns.
        # na=False: rows with a missing title simply do not match.
        mask = self.df['course_title'].str.lower().str.contains(keyword, regex=False, na=False)
        matching_courses = self.df[mask].copy()

        if matching_courses.empty:
            print(f"No courses found matching keyword: {keyword}")
            return pd.DataFrame(columns=self.df.columns)

        # Sort by level preference if specified
        if level:
            matching_courses['level_match'] = (matching_courses['level'] == level).astype(int)
            matching_courses = matching_courses.sort_values(
                by=['level_match', 'rating'],
                ascending=[False, False]
            )
        else:
            matching_courses = matching_courses.sort_values('rating', ascending=False)

        return matching_courses.head(top_n)

    def get_content_based_recommendations(self, course_idx, top_n=5):
        """
        Get content-based recommendations based on course similarity

        Returns:
        Up to top_n rows most similar in title to the course with the given
        'course_idx', never including that course itself. For an unknown
        course_idx a message is printed and an empty DataFrame with the
        table's columns is returned.
        """
        positions = np.flatnonzero((self.df['course_idx'] == course_idx).to_numpy())
        if positions.size == 0:
            print(f"Course index {course_idx} not found")
            return pd.DataFrame(columns=self.df.columns)

        course_row = int(positions[0])
        scores = np.asarray(self.content_similarity[course_row])

        ranked = np.argsort(-scores, kind='stable')
        # Exclude the input course by position; it need not rank first when
        # titles are duplicated or vectorize to all zeros.
        ranked = ranked[ranked != course_row][:top_n]
        return self.df.iloc[ranked]

class CourseRecommenderTrainer:
    """Split a cleaned course table into train/test parts and build a
    CourseRecommender on the training part."""

    def __init__(self, cleaned_df):
        """
        Parameters:
        cleaned_df: non-empty pandas DataFrame (copied, never modified)

        Raises:
        ValueError when the input is not a DataFrame or is empty
        """
        # Validate input data
        if not isinstance(cleaned_df, pd.DataFrame):
            raise ValueError("Input must be a pandas DataFrame")

        if cleaned_df.empty:
            raise ValueError("Input DataFrame is empty")

        self.df = cleaned_df.copy()
        # Populated by prepare_data()
        self.train_data = None
        self.test_data = None
        self.recommender = None

    def prepare_data(self, test_size=0.2, random_state=42):
        """
        Prepare training and testing datasets

        The split is stratified by 'level' when that is possible, i.e. when
        there is more than one level and every level occurs at least twice;
        otherwise a plain random split is used.

        Parameters:
        test_size: passed to train_test_split
        random_state: passed to train_test_split

        Raises:
        ValueError when the table has fewer than 2 rows or when splitting or
        building the recommender fails
        """
        if len(self.df) < 2:
            raise ValueError("Dataset must contain at least 2 samples for train-test split")

        try:
            # A level with a single member cannot be represented on both
            # sides of the split, which makes a stratified split fail.
            level_counts = self.df['level'].value_counts(dropna=False)
            can_stratify = len(level_counts) > 1 and level_counts.min() >= 2

            # Split the data into training and testing sets
            self.train_data, self.test_data = train_test_split(
                self.df,
                test_size=test_size,
                random_state=random_state,
                stratify=self.df['level'] if can_stratify else None
            )

            # Initialize and train the recommender with training data
            self.recommender = CourseRecommender(self.train_data)

            print(f"Training set size: {len(self.train_data)}")
            print(f"Testing set size: {len(self.test_data)}")

        except Exception as e:
            # Chain the cause so the original traceback stays visible.
            raise ValueError(f"Error preparing data: {str(e)}") from e

# Function to validate and preprocess the dataset
def preprocess_dataset(df):
    """
    Check the course table and coerce its columns to usable types.

    Titles become strings, 'rating' and 'duration_hours' become numeric
    (unparseable values turn into NaN), and every row that then contains a
    missing value in any column is dropped. The input frame is not modified.

    Raises:
    ValueError when the frame is empty or a required column is missing
    """
    if df.empty:
        raise ValueError("Input DataFrame is empty")

    required_columns = ['course_idx', 'course_title', 'rating', 'level', 'duration_hours']
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise ValueError(f"Missing required columns: {missing_columns}")

    # Work on an independent copy with a fresh 0..n-1 index.
    prepared = df.copy().reset_index(drop=True)

    prepared = prepared.assign(
        course_title=prepared['course_title'].astype(str),
        rating=pd.to_numeric(prepared['rating'], errors='coerce'),
        duration_hours=pd.to_numeric(prepared['duration_hours'], errors='coerce'),
    )

    # Rows left incomplete by the coercion (or already incomplete) go away.
    return prepared.dropna()

def train_and_evaluate_recommender(df):
    """
    Run preprocessing, trainer construction and data preparation in sequence.

    Progress is printed before each step. Any exception is reported on
    stdout instead of being propagated.

    Returns:
    The prepared CourseRecommenderTrainer, or None when a step failed
    """
    try:
        print("Preprocessing dataset...")
        prepared = preprocess_dataset(df)

        print("Initializing trainer...")
        trainer = CourseRecommenderTrainer(prepared)

        print("Preparing data...")
        trainer.prepare_data()
    except Exception as e:
        print(f"Error in training pipeline: {e}")
        return None

    return trainer

# Example usage

# Assuming your DataFrame is called 'cleaned_df'
# First, let's check the data
# Quick look at the table produced by the cleaning cell.
print("Dataset shape:", cleaned_df.shape)
print("\nDataset columns:", list(cleaned_df.columns))
print("\nSample of the data:")
print(cleaned_df.head())

# Build the trainer; None signals that the pipeline reported an error.
trainer = train_and_evaluate_recommender(cleaned_df)

if trainer is not None:
    # Query the recommender fitted on the training split.
    recommender = trainer.recommender
    recommendations = recommender.search_courses(keyword="Fashion", level="Beginner")
    print("\nSample recommendations:")
    print(recommendations[['course_title', 'level', 'rating']])


Dataset shape: (100, 5)

Dataset columns: ['course_idx', 'course_title', 'rating', 'level', 'duration_hours']

Sample of the data:
   course_idx             course_title  rating       level  duration_hours
0           1         Course 0 - Basic     3.7    Advanced            97.0
1           2      Course 1 - Advanced     4.9  All Levels             0.0
2           3  Course 2 - Intermediate     4.5    Advanced            19.0
3           4         Course 3 - Basic     4.2    Beginner             0.0
4           5      Course 4 - Advanced     3.3  All Levels            53.0
Preprocessing dataset...
Initializing trainer...
Preparing data...
Training set size: 80
Testing set size: 20
No courses found matching keyword: fashion

Sample recommendations:
Empty DataFrame
Columns: [course_title, level, rating]
Index: []


In [47]:
  import pandas as pd
  import numpy as np
  from sklearn.feature_extraction.text import TfidfVectorizer
  from sklearn.metrics.pairwise import cosine_similarity
  from scipy.sparse import csr_matrix
  from sklearn.neighbors import NearestNeighbors
  import re

  class CourseRecommender:
      """Course search with substring matching, a TF-IDF fallback and
      TF-IDF based ranking, plus title-similarity recommendations.

      The table is copied and re-indexed 0..n-1 on construction; a helper
      column 'normalized_title' is added to the internal copy.
      """

      # Minimum cosine similarity for the TF-IDF fallback match.
      SIMILARITY_THRESHOLD = 0.1

      def __init__(self, cleaned_df):
          """
          Parameters:
          cleaned_df: non-empty DataFrame with columns 'course_idx',
                      'course_title', 'rating' and 'level'

          Raises:
          ValueError when the frame is empty or building the similarity
          structures fails
          """
          if cleaned_df.empty:
              raise ValueError("The input DataFrame is empty")

          self.df = cleaned_df.copy()
          self.df = self.df.reset_index(drop=True)

          # Create a normalized version of course titles for better searching
          self.df['normalized_title'] = self.df['course_title'].apply(self.normalize_text)

          # Initialize TF-IDF
          self.tfidf = TfidfVectorizer(stop_words='english')

          try:
              # Create TF-IDF matrix for content-based filtering
              self.tfidf_matrix = self.tfidf.fit_transform(self.df['course_title'].astype(str))
              self.content_similarity = cosine_similarity(self.tfidf_matrix)

              # One row per course with a single 1 in the column given by
              # its course_idx
              self.course_rating_matrix = csr_matrix((
                  np.ones(len(self.df)),
                  (
                      np.arange(len(self.df)),
                      self.df['course_idx'].values
                  )
              ))

              # Initialize collaborative filtering model
              self.cf_model = NearestNeighbors(metric='cosine', algorithm='brute')
              self.cf_model.fit(self.course_rating_matrix)

          except Exception as e:
              # Chain the cause so the original traceback stays visible.
              raise ValueError(f"Error initializing recommender: {str(e)}") from e

      @staticmethod
      def normalize_text(text):
          """
          Normalize text for better matching:
          - Convert to lowercase
          - Replace special characters with spaces
          - Collapse runs of whitespace and trim the ends

          Non-string input is converted with str() first.
          """
          if not isinstance(text, str):
              text = str(text)

          text = text.lower()
          text = re.sub(r'[^\w\s]', ' ', text)
          return ' '.join(text.split())

      def search_courses(self, keyword, level=None, top_n=10):
          """
          Enhanced search function with multiple matching methods

          A course matches when its normalized title contains the whole
          normalized keyword or any keyword word of 3+ characters. If
          nothing matches that way, courses whose TF-IDF similarity to the
          keyword exceeds SIMILARITY_THRESHOLD are used instead.

          Parameters:
          keyword: str - Search text
          level: str - Preferred level; matching courses are listed first
          top_n: int - Maximum number of rows returned

          Returns:
          DataFrame of matches ordered by level preference (if given),
          TF-IDF similarity, then rating; helper columns are removed. With
          no match a message is printed and an empty DataFrame with the
          same columns is returned.

          Raises:
          ValueError when keyword is not a string
          """
          if not isinstance(keyword, str):
              raise ValueError("Keyword must be a string")

          normalized_keyword = self.normalize_text(keyword)
          titles = self.df['normalized_title']

          # Whole phrase first, then every word with 3 or more characters.
          terms = [normalized_keyword]
          terms.extend(part for part in normalized_keyword.split() if len(part) >= 3)

          matches = pd.Series(False, index=self.df.index)
          for term in terms:
              # Both sides are already lowercased; match literally.
              matches |= titles.str.contains(term, regex=False, na=False)

          # Similarity of the keyword to every course, computed once on the
          # matrix fitted in __init__ and reused for fallback and ranking.
          keyword_vector = self.tfidf.transform([keyword])
          similarities = cosine_similarity(keyword_vector, self.tfidf_matrix)[0]

          # Fuzzy fallback when no title contains the keyword text.
          if not matches.any():
              matches = pd.Series(similarities > self.SIMILARITY_THRESHOLD, index=self.df.index)

          matching_courses = self.df[matches].copy()

          if matching_courses.empty:
              print(f"No courses found matching keyword: {keyword}")
              # Same column layout as a non-empty result.
              return pd.DataFrame(columns=self.df.columns.drop('normalized_title'))

          matching_courses['similarity_score'] = similarities[matches.to_numpy()]

          # Sort by level preference and similarity score
          if level:
              matching_courses['level_match'] = (matching_courses['level'] == level).astype(int)
              matching_courses = matching_courses.sort_values(
                  by=['level_match', 'similarity_score', 'rating'],
                  ascending=[False, False, False]
              )
          else:
              matching_courses = matching_courses.sort_values(
                  by=['similarity_score', 'rating'],
                  ascending=[False, False]
              )

          # Remove temporary columns used for sorting
          matching_courses = matching_courses.drop(columns=['similarity_score', 'normalized_title'])
          if level:
              matching_courses = matching_courses.drop(columns=['level_match'])

          return matching_courses.head(top_n)

      def get_content_based_recommendations(self, course_idx, top_n=5):
          """
          Get content-based recommendations based on course similarity

          Returns:
          Up to top_n rows most similar in title to the course with the
          given 'course_idx', never including that course itself. For an
          unknown course_idx a message is printed and an empty DataFrame
          with the table's columns is returned.
          """
          positions = np.flatnonzero((self.df['course_idx'] == course_idx).to_numpy())
          if positions.size == 0:
              print(f"Course index {course_idx} not found")
              return pd.DataFrame(columns=self.df.columns)

          course_row = int(positions[0])
          scores = np.asarray(self.content_similarity[course_row])

          ranked = np.argsort(-scores, kind='stable')
          # Exclude the input course by position; it need not rank first
          # when titles are duplicated or vectorize to all zeros.
          ranked = ranked[ranked != course_row][:top_n]
          return self.df.iloc[ranked]

  # Example usage function
  def test_course_search(cleaned_df, keyword, level=None):
      """
      Run one search against a freshly built recommender and print each hit.

      Returns:
      The DataFrame produced by CourseRecommender.search_courses
      """
      divider = "-" * 50
      print(f"\nTesting search for keyword: '{keyword}' with level: {level}")
      print(divider)

      engine = CourseRecommender(cleaned_df)
      results = engine.search_courses(keyword, level=level)

      # Nothing found: report it and hand back the empty frame.
      if results.empty:
          print("No matching courses found.")
          return results

      print(f"\nFound {len(results)} matching courses:")
      print("\nTop matching courses:")
      for _, course in results.iterrows():
          print(f"\nTitle: {course['course_title']}")
          print(f"Level: {course['level']}")
          print(f"Rating: {course['rating']}")
          print("-" * 30)

      return results

  # Test the search functionality

  # Test with your dataset
  print("Testing search functionality...")

  # Three probes, run in order: a plain keyword, the same keyword with a
  # trailing space plus a level preference, and a two-word phrase.
  results1, results2, results3 = (
      test_course_search(cleaned_df, probe_keyword, level=probe_level)
      for probe_keyword, probe_level in (
          ("Python", None),
          ("Python ", "Beginner"),
          ("data science", None),
      )
  )


Testing search functionality...

Testing search for keyword: 'Python' with level: None
--------------------------------------------------
No courses found matching keyword: Python
No matching courses found.

Testing search for keyword: 'Python ' with level: Beginner
--------------------------------------------------
No courses found matching keyword: Python 
No matching courses found.

Testing search for keyword: 'data science' with level: None
--------------------------------------------------
No courses found matching keyword: data science
No matching courses found.
