In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Prompt for the CSV files to upload. The saved cell output shows Coursera.csv,
# edx.csv, skillshare.csv and Udemy.csv being stored under those same names,
# which are the paths the read_csv() calls in the next cell expect.
from google.colab import files
uploaded = files.upload()  # result is kept in `uploaded` but not read again in the visible cells

Saving Coursera.csv to Coursera.csv
Saving edx.csv to edx.csv
Saving skillshare.csv to skillshare.csv
Saving Udemy.csv to Udemy.csv


In [3]:
# Load the four uploaded catalogues, one DataFrame per platform, in a fixed order.
coursera_df, edx_df, skillshare_df, udemy_df = [
    pd.read_csv(csv_name)
    for csv_name in ('Coursera.csv', 'edx.csv', 'skillshare.csv', 'Udemy.csv')
]

In [4]:
# Keep three columns per source, in the order (title-like, text-like, rating-like),
# so the frames can later be given a common schema and stacked.
# NOTE(review): the third edx column is `level`, and the skillshare columns are
# `instructor` and `duration` — confirm these are the intended stand-ins for a
# course description and a rating.
coursera_df = coursera_df.loc[:, ['course', 'skills', 'rating']]
edx_df = edx_df.loc[:, ['title', 'associatedskills', 'level']]
skillshare_df = skillshare_df.loc[:, ['title', 'instructor', 'duration']]
udemy_df = udemy_df.loc[:, ['title', 'description', 'rating']]

In [5]:
# Give every source the same three column labels so they can be concatenated.
# The relabelling is positional: whatever each frame's 1st/2nd/3rd column is
# becomes title/description/rating.
# NOTE(review): for edx the 3rd column was selected as `level`, and for skillshare
# the 2nd/3rd were `instructor`/`duration` — verify that labelling them
# `description`/`rating` is intended.
for _frame in (coursera_df, edx_df, skillshare_df, udemy_df):
    _frame.columns = ['title', 'description', 'rating']

In [6]:
# Stack the four sources into one catalogue with a fresh 0..n-1 index, then
# replace missing descriptions with '' so every row has text to vectorize.
all_courses = (
    pd.concat(
        [coursera_df, edx_df, skillshare_df, udemy_df],
        ignore_index=True,
    )
    .assign(description=lambda frame: frame['description'].fillna(''))
)

In [7]:
# Turn each course description into a TF-IDF vector, dropping English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(all_courses['description'])

# Pairwise cosine similarity of every course against every other course
# (called with a single argument, the matrix is compared with itself);
# row i holds the scores of course i against the whole catalogue.
cosine_sim = cosine_similarity(tfidf_matrix)

In [8]:
def recommend_courses(title, num_recommendations=5, *, courses=None, similarity=None):
    """Recommend the courses most similar to the first course matching ``title``.

    Parameters
    ----------
    title : str
        Text to look for in the ``title`` column. The match is a
        case-insensitive *literal* substring test, so characters that are
        special in regular expressions (``+``, ``(``, ``*`` ...) are safe.
    num_recommendations : int, default 5
        Maximum number of courses to return. Values below zero are treated
        as zero.
    courses : pandas.DataFrame, optional
        Catalogue with at least ``title`` and ``rating`` columns. Defaults to
        the notebook-level ``all_courses``.
    similarity : 2-D array-like, optional
        Square similarity matrix whose row/column ``i`` corresponds to the
        ``i``-th row (by position) of ``courses``. Defaults to the
        notebook-level ``cosine_sim``.

    Returns
    -------
    pandas.DataFrame or str
        The ``title`` and ``rating`` columns of the recommended courses,
        ordered from most to least similar; or the message
        ``"Course '<title>' not found."`` when no title matches.
    """
    if courses is None:
        courses = all_courses
    if similarity is None:
        similarity = cosine_sim

    # regex=False: a query such as "C++" must not be parsed as a pattern.
    is_match = courses['title'].str.contains(title, case=False, na=False, regex=False)

    # Work with row *positions*, not index labels: the similarity matrix is
    # addressed positionally, and so is the final .iloc lookup.
    match_positions = is_match.to_numpy().nonzero()[0]
    if len(match_positions) == 0:
        return f"Course '{title}' not found."

    query_pos = int(match_positions[0])  # use the first match

    # Drop the query course explicitly instead of assuming it sorts first:
    # a course with an empty description has similarity 0 to everything,
    # itself included, and exact duplicates tie with it at the top.
    candidates = [
        (pos, score)
        for pos, score in enumerate(similarity[query_pos])
        if pos != query_pos
    ]
    # sorted() is stable, so equally similar courses keep catalogue order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    top_positions = [pos for pos, _ in ranked[:max(num_recommendations, 0)]]
    return courses.iloc[top_positions][['title', 'rating']]

In [10]:
# Demo: the five courses closest to the first title containing "data analytics".
recommendations = recommend_courses(
    title='data analytics',
    num_recommendations=5,
)
print(recommendations)

                                                 title rating
348                                  Google データアナリティクス    4.7
135  Data Analysis and Presentation Skills: the PwC...    4.6
491   Excel to MySQL: Analytic Techniques for Business    4.6
206                         Google Data Analytics (PT)    4.9
85                         Análisis de Datos de Google    4.8


In [None]:
# Mount point passed to Colab's drive helper is /content/drive.
# NOTE(review): this cell has no execution count and nothing in the visible
# notebook reads from that path — confirm whether the cell is still needed.
from google.colab import drive
drive.mount('/content/drive')