In [15]:
# Import necessary libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import difflib  # For fuzzy matching

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer


In [17]:
# Load Dataset
print("Loading Dataset...")
data = pd.read_csv("Coursera.csv")
print("Dataset loaded successfully!")

# Basic Dataset Information
print(f"Dataset Shape: {data.shape}")
print(data.info())

# Display first 5 rows
data.head()


Loading Dataset...
Dataset loaded successfully!
Dataset Shape: (3522, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3522 entries, 0 to 3521
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Course Name         3522 non-null   object
 1   University          3522 non-null   object
 2   Difficulty Level    3522 non-null   object
 3   Course Rating       3522 non-null   object
 4   Course URL          3522 non-null   object
 5   Course Description  3522 non-null   object
 6   Skills              3522 non-null   object
dtypes: object(7)
memory usage: 192.7+ KB
None


Unnamed: 0,Course Name,University,Difficulty Level,Course Rating,Course URL,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Michigan State University,Beginner,4.8,https://www.coursera.org/learn/write-a-feature...,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Coursera Project Network,Beginner,4.8,https://www.coursera.org/learn/canvas-analysis...,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,�cole Polytechnique,Advanced,4.1,https://www.coursera.org/learn/silicon-thin-fi...,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,IESE Business School,Intermediate,4.8,https://www.coursera.org/learn/operational-fin...,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Coursera Project Network,Beginner,4.6,https://www.coursera.org/learn/single-table-sq...,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...


In [19]:
# Check for missing values
print("\nMissing Values per Column:\n", data.isnull().sum())



Missing Values per Column:
 Course Name           0
University            0
Difficulty Level      0
Course Rating         0
Course URL            0
Course Description    0
Skills                0
dtype: int64


In [21]:
# Selecting relevant columns
data = data[['Course Name', 'Difficulty Level', 'Course Description', 'Skills']]
data.head()


Unnamed: 0,Course Name,Difficulty Level,Course Description,Skills
0,Write A Feature Length Screenplay For Film Or ...,Beginner,Write a Full Length Feature Film Script In th...,Drama Comedy peering screenwriting film D...
1,Business Strategy: Business Model Canvas Analy...,Beginner,"By the end of this guided project, you will be...",Finance business plan persona (user experien...
2,Silicon Thin Film Solar Cells,Advanced,This course consists of a general presentation...,chemistry physics Solar Energy film lambda...
3,Finance for Managers,Intermediate,"When it comes to numbers, there is always more...",accounts receivable dupont analysis analysis...
4,Retrieve Data using Single-Table SQL Queries,Beginner,In this course you�ll learn how to effectively...,Data Analysis select (sql) database manageme...


In [54]:
# Cleaning text by removing unwanted characters
def clean_text(text):
    text = text.replace(':', '')  # Remove colons
    text = text.replace('(', '').replace(')', '')  # Remove parentheses
    text = text.replace('_', '')  # Remove underscores
    text = text.replace(',,', ',')  # Remove double commas
    return text.strip()  # Strip leading and trailing spaces
data['Course Name'] = data['Course Name'].apply(clean_text)
data['Course Description'] = data['Course Description'].apply(clean_text)
data['Skills'] = data['Skills'].apply(lambda x: clean_text(x))


In [56]:
# Creating 'tags' column by combining course name, difficulty, description, and skills
data['tags'] = data['Course Name'] + " " + data['Difficulty Level'] + " " + data['Course Description'] + " " + data['Skills']

# Convert to lowercase and remove extra commas
data['tags'] = data['tags'].str.replace(',', ' ').str.lower()

# Creating a new dataframe for recommendations
new_df = data[['Course Name', 'tags']].rename(columns={'Course Name': 'course_name'})

# Display first 5 rows
new_df.head()


Unnamed: 0,course_name,tags
0,"Write,A,Feature,Length,Screenplay,For,Film,Or,...",write a feature length screenplay for film or ...
1,"Business,Strategy,Business,Model,Canvas,Analys...",business strategy business model canvas analys...
2,"Silicon,Thin,Film,Solar,Cells",silicon thin film solar cells advanced this co...
3,"Finance,for,Managers",finance for managers intermediate when it come...
4,"Retrieve,Data,using,Single-Table,SQL,Queries",retrieve data using single-table sql queries b...


In [58]:
# Convert text into numerical feature vectors
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(new_df['tags']).toarray()


In [59]:
# Initialize the Porter Stemmer
ps = PorterStemmer()

# Function to apply stemming
def stem(text):
    return " ".join([ps.stem(word) for word in text.split()])

# Apply stemming on the tags column
new_df['tags'] = new_df['tags'].apply(stem)

# Display the updated dataframe
new_df.head()


Unnamed: 0,course_name,tags
0,"Write,A,Feature,Length,Screenplay,For,Film,Or,...",write a featur length screenplay for film or t...
1,"Business,Strategy,Business,Model,Canvas,Analys...",busi strategi busi model canva analysi with mi...
2,"Silicon,Thin,Film,Solar,Cells",silicon thin film solar cell advanc thi cours ...
3,"Finance,for,Managers",financ for manag intermedi when it come to num...
4,"Retrieve,Data,using,Single-Table,SQL,Queries",retriev data use single-t sql queri beginn in ...


In [60]:
# Compute cosine similarity between course vectors
similarity = cosine_similarity(vectors)

In [61]:
# Function to recommend similar courses
def recommend(course):
    course_list = new_df['course_name'].tolist()

    # Find the closest match using fuzzy matching
    close_matches = difflib.get_close_matches(course, course_list, n=1, cutoff=0.6)
    
    if not close_matches:
        print(f"Error: Course '{course}' not found. Try another search term.")
        return

    # Get the index of the closest matched course
    course_index = new_df[new_df['course_name'] == close_matches[0]].index[0]
    distances = similarity[course_index]

    # Get the top 6 most similar courses (excluding itself)
    course_recommendations = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:7]

    print(f"\nRecommended Courses for '{close_matches[0]}':")
    for i in course_recommendations:
        print(f"- {new_df.iloc[i[0]].course_name}")

# Test the recommendation function
recommend('Business Strategy Business Model Canvas Analysis with Miro')



Recommended Courses for 'Business,Strategy,Business,Model,Canvas,Analysis,with,Miro':
- Product,Development,Customer,Persona,Development,with,Miro
- Product,and,Service,Development,Empathy,Mapping,with,Miro
- Product,Development,Customer,Journey,Mapping,with,Miro
- Analyzing,Macro-Environmental,Factors,Using,Creately
- Innovating,with,the,Business,Model,Canvas
- Analyzing,Market,Attractiveness,Using,Creately


In [63]:
# Save similarity matrix and course data for future use
pickle.dump(similarity, open('similarity.pkl', 'wb'))
pickle.dump(new_df.to_dict(), open('course_list.pkl', 'wb'))  # Contains the dataframe in dict form
pickle.dump(new_df, open('courses.pkl', 'wb'))

print("\nPickle files saved successfully!")



Pickle files saved successfully!
