In [14]:
# Install required packages
print("=== INSTALLING REQUIRED PACKAGES ===")
!pip install sentence-transformers umap-learn matplotlib
!pip install --quiet "umap-learn[plot]"


=== INSTALLING REQUIRED PACKAGES ===
Defaulting to user installation because normal site-packages is not writeable
Collecting umap-learn
  Using cached umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Using cached pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Using cached umap_learn-0.5.7-py3-none-any.whl (88 kB)
Using cached pynndescent-0.5.13-py3-none-any.whl (56 kB)
Installing collected packages: pynndescent, umap-learn
Successfully installed pynndescent-0.5.13 umap-learn-0.5.7


In [15]:
# -*- coding: utf-8 -*-
"""
This script generates content-based embeddings for the blog dataset.

It performs the following steps:
1.  Loads the preprocessed blog metadata.
2.  Creates a combined text field from the blog's title, content, and topic.
3.  Uses a pre-trained Sentence Transformer model to generate embeddings for this
    combined text.
4.  Saves the generated embeddings and the corresponding blog IDs to a pickle
    file in the 'models' directory.
5.  Includes a function to demonstrate how to use the saved embeddings to find
    similar blogs.
"""


import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import display
from sentence_transformers import SentenceTransformer
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from umap import UMAP
# Suppress every warning category for the rest of the session.
warnings.simplefilter("ignore")

# Pandas display settings (originally chosen for Colab): show all columns and
# allow up to 150 characters per cell before truncating.
for _option, _value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 150),
):
    pd.set_option(_option, _value)

RuntimeError: Failed to import transformers.integrations.integration_utils because of the following error (look up to see its traceback):
Failed to import transformers.modeling_tf_utils because of the following error (look up to see its traceback):
Your currently installed version of Keras is Keras 3, but this is not yet supported in Transformers. Please install the backwards-compatible tf-keras package with `pip install tf-keras`.

In [None]:
# --- 1. SETUP AND CONFIGURATION ---

print("=== SCRIPT START: Content-Based Model Generation ===")

# Define base paths to locate data and save models.
# The project root is taken to be two levels above the working directory,
# matching how this notebook derives `base_path` in its other path-setup cell
# (the notebook runs from '<root>/src/notebooks'). Going up only one level
# would place 'data' and 'models' under '<root>/src' instead of '<root>'.
BASE_PATH = os.path.dirname(os.path.dirname(os.getcwd()))
PROCESSED_DATA_PATH = os.path.join(BASE_PATH, "data", "processed")
MODELS_PATH = os.path.join(BASE_PATH, "models")

# Define file paths for input and output
INPUT_BLOG_DATA_PATH = os.path.join(PROCESSED_DATA_PATH,
                                    "cleaned_blog_metadata.pkl")
OUTPUT_EMBEDDING_MODEL_PATH = os.path.join(MODELS_PATH, "embedding_model.pkl")

# Ensure the 'models' directory exists; the message is printed only when the
# directory is actually missing.
if not os.path.exists(MODELS_PATH):
    print(f"Creating directory: {MODELS_PATH}")
    # exist_ok avoids a failure if the directory appears between the check
    # above and this call.
    os.makedirs(MODELS_PATH, exist_ok=True)

In [3]:
# Resolve the directories this notebook works with, starting from the kernel's
# working directory; the project base is two levels above it.
curr_path = os.getcwd()
base_path = os.path.dirname(os.path.dirname(curr_path))
src_dir = os.path.join(base_path, "src")
rawdata_path = os.path.join(base_path, "data", "raw")
processeddata_path = os.path.join(base_path, "data", "processed")

# Create the processed-data folder if it is not there yet.
if os.path.exists(processeddata_path) is False:
    os.makedirs(processeddata_path)

# Report every resolved location, one per line.
print(
    f"base directory: {base_path}",
    f"raw data directory: {rawdata_path}",
    f"processed data directory: {processeddata_path}",
    f"src directory: {src_dir}",
    f"current directory: {curr_path}",
    sep="\n",
)

base directory: c:\Users\Abhishek\Downloads\EMLYON CLASSES\PYTHON\35. RECOMMENDATION SYSTEMS\BlogRecommendation
raw data directory: c:\Users\Abhishek\Downloads\EMLYON CLASSES\PYTHON\35. RECOMMENDATION SYSTEMS\BlogRecommendation\data\raw
processed data directory: c:\Users\Abhishek\Downloads\EMLYON CLASSES\PYTHON\35. RECOMMENDATION SYSTEMS\BlogRecommendation\data\processed
src directory: c:\Users\Abhishek\Downloads\EMLYON CLASSES\PYTHON\35. RECOMMENDATION SYSTEMS\BlogRecommendation\src
current directory: c:\Users\Abhishek\Downloads\EMLYON CLASSES\PYTHON\35. RECOMMENDATION SYSTEMS\BlogRecommendation\src\notebooks


In [6]:
def _read_processed(filename):
    """Unpickle one file, looked up by name inside `processeddata_path`."""
    return pd.read_pickle(os.path.join(processeddata_path, filename))


# Load the full ratings table and its train/test splits.
# NOTE(review): unpickling runs arbitrary code from the file — only load
# pickles produced by this project.
df = _read_processed("cleaned_blog_ratings.pkl")
train_df = _read_processed("train_ratings.pkl")
test_df = _read_processed("test_ratings.pkl")

# For each table: print its shape, then render its first rows.
print(f"df shape : {df.shape}")
display(df.head())

print(f"train_df shape : {train_df.shape}")
display(train_df.head())

print(f"test_df shape : {test_df.shape}")
display(test_df.head())

df shape : (200140, 12)


Unnamed: 0,blog_id,user_id,ratings,author_id,blog_title,blog_content,blog_link,blog_img,topic,scrape_time,scrape_date,author_name
0,9025,11,3.5,5960,How I became a Frontend Developer,A little bit of background about me: as a teen...,https://medium.com/@steven.dornan93/how-i-beca...,https://miro.medium.com/v2/resize:fill:140:140...,web-development,2023-04-04 08:53:52,2023-04-04,Steven Dornan
1,9320,11,5.0,6155,Writing an Algorithm to Calculate Article Read...,You have probably noticed a read-time number u...,https://medium.com/@dpericich/writing-an-algor...,https://miro.medium.com/v2/resize:fill:140:140...,web-development,2023-04-04 08:53:52,2023-04-04,Daniel Pericich
2,9246,11,3.5,6114,Diving into HTML and the Tools of the Trade,It’s been an incredible first week as a Bytewi...,https://medium.com/@muhammadnaeemtahir/diving-...,https://miro.medium.com/v2/resize:fill:140:140...,web-development,2023-04-04 08:53:52,2023-04-04,Muhammad Naeem Tahir
3,9431,11,5.0,2386,Learning Too Many Programming Languages at Once?,Learning too many programming languages at onc...,https://medium.com/@mohit-singh/learning-too-m...,https://miro.medium.com/v2/resize:fill:140:140...,web-development,2023-04-04 08:53:52,2023-04-04,Mohit Singh
4,875,11,2.0,699,Cryptocurrency Regulations: A Tug of War Betwe...,"Once upon a time in the wild, wild world of cr...",https://medium.com/@Juan_In_The_Chain/cryptocu...,https://miro.medium.com/v2/resize:fill:140:140...,blockchain,2023-04-03 06:06:20,2023-04-03,Juan In The Chain


train_df shape : (194727, 12)


Unnamed: 0,blog_id,user_id,ratings,author_id,blog_title,blog_content,blog_link,blog_img,topic,scrape_time,scrape_date,author_name
0,9025,11,3.5,5960,How I became a Frontend Developer,A little bit of background about me: as a teen...,https://medium.com/@steven.dornan93/how-i-beca...,https://miro.medium.com/v2/resize:fill:140:140...,web-development,2023-04-04 08:53:52,2023-04-04,Steven Dornan
1,9320,11,5.0,6155,Writing an Algorithm to Calculate Article Read...,You have probably noticed a read-time number u...,https://medium.com/@dpericich/writing-an-algor...,https://miro.medium.com/v2/resize:fill:140:140...,web-development,2023-04-04 08:53:52,2023-04-04,Daniel Pericich
2,9246,11,3.5,6114,Diving into HTML and the Tools of the Trade,It’s been an incredible first week as a Bytewi...,https://medium.com/@muhammadnaeemtahir/diving-...,https://miro.medium.com/v2/resize:fill:140:140...,web-development,2023-04-04 08:53:52,2023-04-04,Muhammad Naeem Tahir
3,9431,11,5.0,2386,Learning Too Many Programming Languages at Once?,Learning too many programming languages at onc...,https://medium.com/@mohit-singh/learning-too-m...,https://miro.medium.com/v2/resize:fill:140:140...,web-development,2023-04-04 08:53:52,2023-04-04,Mohit Singh
4,875,11,2.0,699,Cryptocurrency Regulations: A Tug of War Betwe...,"Once upon a time in the wild, wild world of cr...",https://medium.com/@Juan_In_The_Chain/cryptocu...,https://miro.medium.com/v2/resize:fill:140:140...,blockchain,2023-04-03 06:06:20,2023-04-03,Juan In The Chain


test_df shape : (5413, 12)


Unnamed: 0,blog_id,user_id,ratings,author_id,blog_title,blog_content,blog_link,blog_img,topic,scrape_time,scrape_date,author_name
0,2080,24,2.0,1661,Convolution: The Pixel Perfect Technique Behin...,Definition Convolution is a fundamental operat...,https://medium.com/@aatish_kayyath/convolution...,https://miro.medium.com/v2/resize:fill:140:140...,image-processing,2023-04-03 06:43:15,2023-04-03,Aatish Kayyath
1,2152,24,3.5,1704,Intel Image Classification using CNNs and diff...,I am reading “Deep Learning with Python” by Fr...,https://medium.com/@shanmuka.sadhu/intel-image...,https://miro.medium.com/v2/resize:fill:140:140...,image-processing,2023-04-03 06:43:15,2023-04-03,Shanmuka Sadhu
2,2137,24,5.0,1696,Can AI save bad scans?,The starting point for any kind of document di...,https://medium.com/transkribus/can-ai-save-bad...,https://miro.medium.com/v2/resize:fill:140:140...,image-processing,2023-04-03 06:43:15,2023-04-03,Fiona Park
3,2215,24,0.5,1662,How can i observe a 3 dimensional matrix(hdr) ...,"I have with me, file={'img1.jpg','img2.jpg','i...",https://medium.com/@technicalsource9/how-can-i...,https://miro.medium.com/v2/resize:fill:140:140...,image-processing,2023-04-03 06:43:15,2023-04-03,Technical Source
4,7105,24,5.0,4195,Overview of Autoencoders,Autoencoders are a type of neural network that...,https://medium.com/@dongreanay/overview-of-aut...,https://miro.medium.com/v2/resize:fill:140:140...,image-processing,2023-04-04 08:53:52,2023-04-04,Anay Dongre


In [7]:
# Print a concise summary of `df`: index range, per-column non-null counts
# and dtypes, and approximate memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200140 entries, 0 to 200139
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   blog_id       200140 non-null  int64         
 1   user_id       200140 non-null  int64         
 2   ratings       200140 non-null  float64       
 3   author_id     200140 non-null  int64         
 4   blog_title    200140 non-null  object        
 5   blog_content  200140 non-null  object        
 6   blog_link     200140 non-null  object        
 7   blog_img      200140 non-null  object        
 8   topic         200140 non-null  object        
 9   scrape_time   200140 non-null  datetime64[ns]
 10  scrape_date   200140 non-null  object        
 11  author_name   200140 non-null  object        
dtypes: datetime64[ns](1), float64(1), int64(3), object(7)
memory usage: 18.3+ MB
