In [40]:

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [41]:

path = "/content/drive/MyDrive/Online_courses_final1_updated.csv"
course = pd.read_csv(path)


course['tag'] = course['Category'] + ' ' + course['Sub-Category'] + ' ' + course['Skills']

new_df = course[['CourseID', 'Title', 'tag', 'Rating']].drop_duplicates(subset=['Title']).dropna()

new_df['Rating_Binned'] = pd.cut(new_df['Rating'], bins=[0, 2, 3, 4, 5], labels=[1, 2, 3, 4])



In [42]:
strat_split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in strat_split.split(new_df, new_df['Rating_Binned']):
    train = new_df.iloc[train_index]
    test = new_df.iloc[test_index]

cv = CountVectorizer(max_features=10000, stop_words='english')
train_vectors = cv.fit_transform(train['tag'].values.astype('U')).toarray()

sim = cosine_similarity(train_vectors)



In [43]:
def predict_ratings(title, train, sim):
    try:
        index = train[train['Title'] == title].index[0]
        similar_indices = list(enumerate(sim[index]))
        sorted_similar_indices = sorted(similar_indices, key=lambda x: x[1], reverse=True)

        top_similar = sorted_similar_indices[1:11]  # Exclude self
        weighted_sum = sum(similarity * train.iloc[i]['Rating'] for i, similarity in top_similar)
        sum_weights = sum(similarity for i, similarity in top_similar)

        return weighted_sum / sum_weights if sum_weights > 0 else np.nan
    except:
        return np.nan

train_mean_rating = train['Rating'].mean()  # Fallback for missing predictions
test['predicted_ratings'] = test['Title'].apply(
    lambda x: predict_ratings(x, train, sim) if x in train['Title'].values else train_mean_rating
)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predicted_ratings'] = test['Title'].apply(


In [44]:
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(test['Rating'], test['predicted_ratings']))
print(f"RMSE: {rmse:.2f}")

# Binary relevance for F1 score
test['true_relevance'] = test['Rating'].apply(lambda x: 1 if x >= 4 else 0)
test['predicted_relevance'] = test['predicted_ratings'].apply(lambda x: 1 if x >= 4 else 0)

# Calculate F1 Score
f1 = f1_score(test['true_relevance'], test['predicted_relevance'])
print(f"F1 Score: {f1:.2f}")


RMSE: 0.20
F1 Score: 0.99


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['true_relevance'] = test['Rating'].apply(lambda x: 1 if x >= 4 else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['predicted_relevance'] = test['predicted_ratings'].apply(lambda x: 1 if x >= 4 else 0)
