<h1 style="color: #232ED1;">📊 Course Recommender System on Web App</h1>



<h2 style="color: #232ED1;">Import Dependencies</h2>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD
import pickle
import nltk
import re

# Make sure the WordNet corpus is available before WordNetLemmatizer is used:
# look it up in the local NLTK data path and download it only when the lookup
# raises LookupError.
try:
    nltk.data.find('corpora/wordnet')
except LookupError:
    nltk.download('wordnet')


# Simple marker that the import/setup cell ran to completion.
print('Dependencies Imported')

Dependencies Imported


[nltk_data] Downloading package wordnet to /Users/bushra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<h2 style="color: #232ED1;">Load Dataset</h2>

In [2]:
# Load the course table from coursera.csv (path is relative to the kernel's
# working directory).
# NOTE(review): the preview output contains U+FFFD replacement characters even
# though the file is read as UTF-8, so some text looks already corrupted in the
# CSV itself — confirm against the original data source.
data = pd.read_csv("coursera.csv", encoding='utf-8')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
data.head()

Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...


<h2 style="color: #232ED1;">Basic Data Inspection</h2>

In [4]:
# (rows, columns) of the raw table, before any de-duplication.
data.shape

(3522, 7)

In [5]:
# Column dtypes and non-null counts; shows which columns are stored as strings.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3522 entries, 0 to 3521
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Course Name         3522 non-null   object
 1   University          3522 non-null   object
 2   Difficulty Level    3522 non-null   object
 3   Course Rating       3522 non-null   object
 4   Course URL          3522 non-null   object
 5   Course Description  3522 non-null   object
 6   Skills              3522 non-null   object
dtypes: object(7)
memory usage: 192.7+ KB


In [6]:
# Number of missing values per column.
data.isnull().sum()

Course Name           0
University            0
Difficulty Level      0
Course Rating         0
Course URL            0
Course Description    0
Skills                0
dtype: int64

In [7]:
# Number of distinct values per column (fewer than the row count hints at
# repeated courses).
data.nunique()

Course Name           3416
University             184
Difficulty Level         5
Course Rating           31
Course URL            3424
Course Description    3397
Skills                3424
dtype: int64

In [8]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
data.duplicated().sum()

np.int64(98)

In [9]:
# Remove duplicate courses, comparing every column except 'Skills'.
# drop_duplicates keeps the original index labels, which leaves gaps where rows
# were removed. Later cells address courses by row POSITION (rows of the
# similarity matrix, data.iloc[...]), so the index is reset to 0..n-1 to keep
# index labels and positions identical.
data = data.drop_duplicates(
    subset=['Course Name', 'University', 'Difficulty Level', 'Course Rating',
            'Course URL', 'Course Description']
).reset_index(drop=True)
data.shape

(3424, 7)

<h2 style="color: #232ED1;">Text Preprocessing on Training Data</h2>

In [10]:
lemmatizer = WordNetLemmatizer()


def clean_for_tags(text):
    """
    Normalize free text so it can be used as part of a course's tag string.

    The text is stripped of unwanted characters, lower-cased, split on
    whitespace and each token is lemmatized with the module-level
    ``lemmatizer``. Stop words are NOT removed here.

    Matched characters are deleted rather than replaced with a space, so
    tokens joined by punctuation are fused into one word.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        Lower-case, lemmatized tokens joined by single spaces.
    """
    # Applied in order: runs of two or more U+FFFD replacement characters,
    # any remaining non-ASCII characters, then everything that is not an
    # ASCII letter or whitespace.
    for pattern in (r'��+', r'[^\x00-\x7F]+', r'[^a-zA-Z\s]'):
        text = re.sub(pattern, '', text)

    tokens = text.lower().split()
    return ' '.join(lemmatizer.lemmatize(token) for token in tokens)

# Clean the text columns on a copy so the original, human-readable values in
# `data` are left untouched.
training_data = data.copy()
for column in ('Course Name', 'Course Description', 'Skills'):
    training_data[column] = training_data[column].apply(clean_for_tags)

# 'tags' is the cleaned name, description and skills joined by single spaces;
# it is stored on `data` next to the original columns.
data['tags'] = (
    training_data['Course Name']
    + ' '
    + training_data['Course Description']
    + ' '
    + training_data['Skills']
)

# Keep only the original course name and its tag string for vectorization.
training_data = data[['Course Name', 'tags']]

In [11]:
# Spot-check the original course names against their cleaned tag strings.
training_data.head()

Unnamed: 0,Course Name,tags
0,Write A Feature Length Screenplay For Film Or ...,write a feature length screenplay for film or ...
1,Business Strategy: Business Model Canvas Analy...,business strategy business model canvas analys...
2,Silicon Thin Film Solar Cells,silicon thin film solar cell this course consi...
3,Finance for Managers,finance for manager when it come to number the...
4,Retrieve Data using Single-Table SQL Queries,retrieve data using singletable sql query in t...


<h2 style="color: #232ED1;">Text Vectorization (TF-IDF)</h2>

In [12]:
# Turn each course's tag string into a TF-IDF vector. English stop words are
# dropped by the vectorizer, and the vocabulary is capped at 5000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# One row per course, one column per vocabulary term.
tfidf_matrix = vectorizer.fit_transform(training_data['tags'])
print("TF-IDF matrix shape:", tfidf_matrix.shape)

TF-IDF matrix shape: (3424, 5000)


<h2 style="color: #232ED1;">Apply SVD on TF-IDF</h2>

In [13]:
# Compress the TF-IDF vectors with truncated SVD; random_state is fixed so the
# reduction is reproducible between runs.
n_components = 100 # Reduce to 100 dimensions
svd = TruncatedSVD(n_components=n_components, random_state=42)
# NOTE(review): this rebinds `tfidf_matrix` to the reduced matrix, so from here
# on the name no longer refers to the TF-IDF output. Re-running this cell on
# its own feeds the already-reduced matrix back into SVD; re-run the TF-IDF
# cell first.
tfidf_matrix = svd.fit_transform(tfidf_matrix)

print("Reduced TF-IDF matrix shape:", tfidf_matrix.shape)


Reduced TF-IDF matrix shape: (3424, 100)


<h2 style="color: #232ED1;">Cosine Similarity and Recommendations</h2>

In [14]:
# Pairwise cosine similarity between all course vectors: entry [i][j] compares
# the course at row position i with the course at row position j.
similarity_matrix = cosine_similarity(tfidf_matrix)
# Sanity check: similarity between the first two courses.
print(similarity_matrix[0][1])

0.023688637916059643


<h2 style="color: #232ED1;">Functions for Recommendation</h2>

In [15]:
def normalize_rating(rating_str):
    """
    Normalize the course rating to a 0-1 scale.

    Parameters
    ----------
    rating_str : str or number
        Rating on a 0-5 scale, usually the raw string from the dataset.

    Returns
    -------
    float
        ``rating / 5``. Returns 0 when the value cannot be used as a rating:
        non-numeric text, ``None`` or another non-convertible object, or a
        non-finite number (NaN / infinity).
    """
    try:
        rating = float(rating_str)
    except (TypeError, ValueError):
        # ValueError: text that is not a number.
        # TypeError: None or another object float() cannot convert.
        return 0

    # NaN or infinity would propagate into the final score and make the
    # ranking sort unreliable, so treat them like a missing rating.
    if not np.isfinite(rating):
        return 0

    return (rating - 0) / (5 - 0)  # Normalize to 0-1

In [17]:
def get_recommendations(course_name, data, similarity_matrix, top_n=3, threshold=90, rating_weight=0.05):
    """
    Get top N course recommendations based on similarity to the given course name.

    Parameters
    ----------
    course_name : str
        Exact value to look up in ``data['Course Name']``. If several rows
        match, the first one is used.
    data : pandas.DataFrame
        Course table. Rows are matched to ``similarity_matrix`` by POSITION:
        the row at position ``i`` must correspond to row/column ``i`` of the
        matrix. The index labels of ``data`` are not used.
    similarity_matrix : 2-D array-like
        Pairwise similarity scores between courses.
    top_n : int
        Number of most-similar rows to select before re-ranking.
    threshold : int
        Currently unused; kept so existing callers that pass it keep working.
    rating_weight : float
        Share of ``final_score`` taken from the normalized rating; the
        remaining ``1 - rating_weight`` comes from the similarity score.

    Returns
    -------
    list of dict
        One dict per selected course, sorted by ``final_score`` in descending
        order. The queried course is not filtered out of the candidates.

    Raises
    ------
    IndexError
        If no row of ``data`` has the given course name.
    """
    # Find the course by row position, not by index label: the similarity
    # matrix and data.iloc below are positional, and labels stop matching
    # positions as soon as rows have been dropped without resetting the index.
    match_positions = np.flatnonzero((data['Course Name'] == course_name).to_numpy())
    if match_positions.size == 0:
        raise IndexError(f"Course not found in data: {course_name!r}")
    course_pos = int(match_positions[0])

    # (position, similarity) pairs, most similar first.
    ranked = sorted(
        enumerate(similarity_matrix[course_pos]),
        key=lambda pair: pair[1],
        reverse=True,
    )

    recommendations = []
    for pos, similarity_score in ranked[:top_n]:
        course_data = data.iloc[pos]  # Get course data for the current recommendation
        normalized_rating = normalize_rating(course_data.get('Course Rating', '0'))  # Normalize rating

        # Prepare recommendation dictionary with relevant course information
        recommendations.append({
            "course_name": course_data['Course Name'],
            "course_url": course_data.get('Course URL', ''),
            "rating": course_data['Course Rating'],
            "institution": course_data.get('University', 'Unknown'),
            "difficulty_level": course_data.get('Difficulty Level', 'Unknown'),
            "similarity": similarity_score,
            # Blend similarity with the rating so better-rated courses win ties.
            "final_score": similarity_score * (1 - rating_weight) + normalized_rating * rating_weight
        })

    return sorted(recommendations, key=lambda rec: rec['final_score'], reverse=True)

In [18]:
get_recommendations('Finance for Managers', data, similarity_matrix)

[{'course_name': 'Finance for Managers',
  'course_url': 'https://www.coursera.org/learn/operational-finance',
  'rating': '4.8',
  'institution': 'IESE Business School',
  'difficulty_level': 'Intermediate',
  'similarity': np.float64(1.0),
  'final_score': np.float64(0.998)},
 {'course_name': 'Finance for Non-Financial Professionals',
  'course_url': 'https://www.coursera.org/learn/finance-for-non-finance-managers',
  'rating': '4.5',
  'institution': 'University of California, Irvine',
  'difficulty_level': 'Conversant',
  'similarity': np.float64(0.832940692838965),
  'final_score': np.float64(0.8362936581970167)},
 {'course_name': 'Finance for Non-Financial Managers',
  'course_url': 'https://www.coursera.org/learn/finance-for-non-financial-managers',
  'rating': '4.2',
  'institution': 'Emory University',
  'difficulty_level': 'Beginner',
  'similarity': np.float64(0.8315387472926317),
  'final_score': np.float64(0.8319618099280001)}]

<h2 style="color: #232ED1;">Save the Model</h2>

In [19]:
# Persist the similarity matrix to similarity_matrix.pkl. The `with` block
# guarantees the file is flushed and closed even if pickling fails part-way.
with open('similarity_matrix.pkl', 'wb') as model_file:
    pickle.dump(similarity_matrix, model_file)