In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the CSV file
data = pd.read_csv("Coursera_courses.csv")

# Select columns of interest
data = data[['course_title', 'course_skills', 'course_difficulty']]



**DATA PREPROCESSING**

In [2]:
# Rename columns
data.rename(columns={'course_title': 'Course Name', 
                     'course_skills': 'Skills', 
                     'course_difficulty': 'Difficulty Level'}, inplace=True)
# Removing spaces between the words (Lambda funtions can be used as well)

data['Course Name'] = data['Course Name'].str.replace(' ',',')
data['Course Name'] = data['Course Name'].str.replace(',,',',')
data['Course Name'] = data['Course Name'].str.replace(':','')
# Create 'tags' column by concatenating other columns
data['tags'] = data['Course Name'] + ' ' + data['Difficulty Level'] + ' ' + data['Skills']

data.head(5)


Unnamed: 0,Course Name,Skills,Difficulty Level,tags
0,"Google,Data,Analytics","Data Analysis, SQL, R Programming, Business Co...",Beginner,"Google,Data,Analytics Beginner Data Analysis, ..."
1,"Google,Cybersecurity","Network Security, Python Programming, Linux, C...",Beginner,"Google,Cybersecurity Beginner Network Security..."
2,"Google,Project,Management","Project Management, Strategy and Operations, L...",Beginner,"Google,Project,Management Beginner Project Man..."
3,"Google,Digital,Marketing,&,E-commerce","Digital Marketing, Marketing, Marketing Manage...",Beginner,"Google,Digital,Marketing,&,E-commerce Beginner..."
4,"IBM,Data,Science","Python Programming, Data Science, Machine Lear...",Beginner,"IBM,Data,Science Beginner Python Programming, ..."


In [3]:
data['tags'].iloc[1]

'Google,Cybersecurity Beginner Network Security, Python Programming, Linux, Cloud Computing, Algorithms, Audit, Computer Programming, Computer Security Incident Management, Cryptography, Databases, Leadership and Management, Network Architecture, Risk Management, SQL'

**DATAFRAME TO BE USED**

In [4]:
# Create a new DataFrame with selected columns
new_df = data[['Course Name', 'tags']]

# Remove commas from 'tags' and 'Course Name', and convert 'tags' to lowercase
new_df['tags'] = new_df['tags'].str.replace(',', ' ').str.lower()
new_df['Course Name'] = new_df['Course Name'].str.replace(',', ' ')

# Rename 'Course Name' column to 'course_name'
new_df.rename(columns={'Course Name': 'course_name'}, inplace=True)

# Display the first 5 rows of the new DataFrame
new_df.head(5)

data['Course Name'] = data['Course Name'].str.replace(',', ' ')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].str.replace(',', ' ').str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['Course Name'] = new_df['Course Name'].str.replace(',', ' ')
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df.rename(columns={'Course Name': 'course_name'}, inplace=True)


**TEXT VECTORIZATION**

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000,stop_words='english')

vectors = cv.fit_transform(new_df['tags']).toarray()



**STEMMING PROCESS**

In [6]:
import nltk 
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
#defining the stemming function
def stem(text):
    y=[]
    
    for i in text.split():
        y.append(ps.stem(i))
    
    return " ".join(y)
new_df['tags'] = new_df['tags'].apply(stem) #applying stemming on the tags column

new_df['tags']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem) #applying stemming on the tags column


0      googl data analyt beginn data analysi sql r pr...
1      googl cybersecur beginn network secur python p...
2      googl project manag beginn project manag strat...
3      googl digit market & e-commerc beginn digit ma...
4      ibm data scienc beginn python program data sci...
                             ...                        
878    write a featur length screenplay for film or t...
879    camino a la excelencia en gestión de proyecto ...
880    use databas with python beginn comput program ...
881    cybersecur role process & oper system secur be...
882    project manag foundat initi and plan beginn pr...
Name: tags, Length: 883, dtype: object

**SIMILARITY MEASURE**

In [7]:
#similarity measure
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

**RECOMMENDATION FUNCTION**

In [8]:
 def recommend(course):
    # Check if the course exists in the dataset
    if course not in new_df['course_name'].values:
        print("Course not found.")
        return
    
    # Find index of the course in the new_df DataFrame
    course_index = new_df[new_df['course_name'] == course].index[0]
    distances = similarity[course_index]
    course_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:7]
    
    for i in course_list:
        recommended_course = new_df.iloc[i[0]].course_name
        # Find the corresponding difficulty level from the original data DataFrame
        difficulty_level = data.loc[data['Course Name'] == recommended_course, 'Difficulty Level'].iloc[0]
        print(recommended_course, "Difficulty Level:", difficulty_level)


**RECOMMENDING COURSES**

In [13]:
recommend('Data Science')

Data Science Statistics and Machine Learning Difficulty Level: Intermediate
Applied Data Science with Python Difficulty Level: Intermediate
Business Analytics Difficulty Level: Beginner
Data Science Foundations using R Difficulty Level: Beginner
Applied Data Science Difficulty Level: Beginner
Data Analysis with Python Difficulty Level: Beginner


In [10]:
recommend('Google Data Analytics')

Google データアナリティクス Difficulty Level: Beginner
Data Analysis and Visualization Foundations Difficulty Level: Beginner
IBM Data Analytics with Excel and R Difficulty Level: Beginner
Google Data Analytics (PT) Difficulty Level: Beginner
Data Analysis and Presentation Skills the PwC Approach Difficulty Level: Beginner
Foundations Data Data Everywhere Difficulty Level: Beginner


In [11]:
recommend('Machine Learning')

Advanced Learning Algorithms Difficulty Level: Beginner
Unsupervised Learning Recommenders Reinforcement Learning Difficulty Level: Beginner
Supervised Machine Learning Regression and Classification Difficulty Level: Beginner
Sequence Models Difficulty Level: Intermediate
Introduction to TensorFlow for Artificial Intelligence Machine Learning and Deep Learning Difficulty Level: Intermediate
Convolutional Neural Networks in TensorFlow Difficulty Level: Intermediate
