In [1]:
import pandas as pd
import numpy as np



In [2]:
data = pd.read_csv('medium-data-science-articles-2020.csv')

In [3]:
data.head()

Unnamed: 0,url,title,author,author_page,subtitle,claps,responses,reading_time,tag,date
0,https://towardsdatascience.com/making-python-p...,Making Python Programs Blazingly Fast,martin.heinz,https://towardsdatascience.com/@martin.heinz,Let’s look at the performance of our Python pr...,3300.0,3,5,Data Science,2020-01-01
1,https://towardsdatascience.com/how-to-be-fancy...,How to be fancy with Python,dipam44,https://towardsdatascience.com/@dipam44,Python tricks that will make your life easier,1700.0,12,5,Data Science,2020-01-01
2,https://uxdesign.cc/how-exactly-do-you-find-in...,How exactly do you find insights from qualitat...,taylornguyen144,https://uxdesign.cc/@taylornguyen144,Visualizing the synthesis processes…,1100.0,3,4,Data Science,2020-01-01
3,https://towardsdatascience.com/from-scratch-to...,From scratch to search: playing with your data...,stanislavprihoda,https://towardsdatascience.com/@stanislavprihoda,One Pipeline to rule…,232.0,1,9,Data Science,2020-01-01
4,https://www.cantorsparadise.com/the-waiting-pa...,The Waiting Paradox: An Intro to Probability D...,maikeelisa,https://www.cantorsparadise.com/@maikeelisa,How much longer do I have to wait for my…,859.0,5,8,Data Science,2020-01-01


In [4]:
data.shape

(108021, 10)

In [5]:
df = data.drop('author_page', axis=1)

In [6]:
data = df.drop('reading_time', axis=1)

In [7]:
data.shape

(108021, 8)

In [8]:
tag_counts = data['tag'].value_counts()

In [9]:
tag_counts

tag
Data Science              45320
Machine Learning          25539
Artificial Inteligence    15764
Data                      10103
Big Data                   4210
Deep Learning              3815
Analytics                  3270
Name: count, dtype: int64

In [10]:

missing_values = data.isnull().sum()

print("Missing Values Count per Column:")
print(missing_values)


Missing Values Count per Column:
url              0
title            0
author           0
subtitle     69435
claps            0
responses        0
tag              0
date             0
dtype: int64


In [11]:

data = data.fillna(0)


In [12]:

missing_values = data.isnull().sum()

print("Missing Values Count per Column:")
print(missing_values)


Missing Values Count per Column:
url          0
title        0
author       0
subtitle     0
claps        0
responses    0
tag          0
date         0
dtype: int64


In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import AutoTokenizer
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Load a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [15]:
data['text'] = (
    data['title'].astype(str) + " " +
    data['subtitle'].astype(str) + " " +
    data['tag'].astype(str) + " " +
    data['author'].astype(str)
)


In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:

# Normalize numeric features
numeric_features = ['claps', 'responses']
scaler = StandardScaler()
data[numeric_features] = scaler.fit_transform(data[numeric_features])

text_embeddings = []

In [18]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [19]:

for index, row in data.iterrows():
    # Tokenize the text
    tokenized = tokenizer(row['text'], padding="max_length", truncation=True, max_length=32, return_tensors='pt')
    
    input_ids = tokenized['input_ids']
    attention_mask = tokenized['attention_mask']

    # Forward pass through the BertModel
    with torch.no_grad():
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)

    # Extract the embeddings
    embeddings = outputs.last_hidden_state.mean(dim=1)

    # numeric features
    numeric_features_array = row[numeric_features].values.astype(float)  
    numeric_features_tensor = torch.tensor(numeric_features_array, dtype=torch.float32)
    
    # Concatenate
    combined_embedding = torch.cat((embeddings.squeeze(), numeric_features_tensor))

    # Convert to numpy and append to the list
    text_embeddings.append(combined_embedding.numpy())

# Convert the list of embeddings to a single NumPy array
text_embeddings = np.array(text_embeddings)

# Convert the NumPy array to a torch tensor
text_embeddings = torch.tensor(text_embeddings)

In [20]:
text_embeddings

tensor([[ 0.1512, -0.0150,  0.4095,  ...,  0.0179, 12.8987,  1.4714],
        [ 0.1384, -0.1223,  0.1262,  ...,  0.0268,  6.5005,  6.4594],
        [-0.1321,  0.0858,  0.6481,  ..., -0.0470,  4.1012,  1.4714],
        ...,
        [-0.1844, -0.0774,  0.3101,  ...,  0.1542, -0.2976, -0.1913],
        [-0.1561,  0.2101,  0.3557,  ..., -0.0642, -0.2976, -0.1913],
        [-0.0727,  0.1133,  0.2001,  ...,  0.1017, -0.2976, -0.1913]])

In [21]:
num_urls = len(data['url'].unique())

In [22]:
# Create a TensorDataset
dataset = TensorDataset(text_embeddings)

In [23]:

# Create a DataLoader
batch_size = 16
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

In [35]:
# The model
class RecommendationModel(nn.Module):
    def __init__(self, input_size, num_urls):
        super(RecommendationModel, self).__init__()
        self.fc = nn.Linear(input_size, num_urls)

    def forward(self, embeddings):
        scores = self.fc(embeddings.squeeze(dim=1))
        return scores


In [28]:
device = torch.device("cuda")

In [36]:
# Create an instance of the model
input_size = text_embeddings.size(1)
recommendation_model = RecommendationModel(input_size, num_urls)

In [37]:
print(recommendation_model)

RecommendationModel(
  (fc): Linear(in_features=770, out_features=107971, bias=True)
)


In [38]:

optimizer = torch.optim.AdamW(recommendation_model.parameters(), lr=0.001)


In [34]:

for batch_idx, batch in enumerate(dataloader):
    embeddings = batch
    
    print(f"Batch {batch_idx + 1} - Number of Embeddings: {len(embeddings)}")
    
    print("Embeddings (first few):", embeddings[:5])
    
    break

Batch 1 - Number of Embeddings: 1
Embeddings (first few): [tensor([[ 1.5117e-01, -1.4954e-02,  4.0948e-01,  ...,  1.7921e-02,
          1.2899e+01,  1.4714e+00],
        [ 1.3840e-01, -1.2232e-01,  1.2619e-01,  ...,  2.6786e-02,
          6.5005e+00,  6.4594e+00],
        [-1.3212e-01,  8.5801e-02,  6.4806e-01,  ..., -4.7015e-02,
          4.1012e+00,  1.4714e+00],
        ...,
        [ 2.3730e-01,  1.5199e-01,  4.0307e-01,  ...,  2.1543e-01,
          1.5028e-01,  3.6291e-01],
        [ 2.9161e-02, -1.1290e-02,  3.4831e-01,  ...,  1.5476e-02,
          8.2209e-01, -1.9132e-01],
        [ 1.6088e-01, -1.3701e-01,  2.4973e-01,  ...,  2.7411e-01,
          4.7419e-01, -1.9132e-01]])]


In [39]:
from tqdm import tqdm


# Training loop 
num_epochs = 3

for epoch in range(num_epochs):
    recommendation_model.train()
    total_loss = 0.0
    
    
    for batch_idx, batch in enumerate(tqdm(dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch')):
        # Extract embeddings
        embeddings = batch[0]

        # Forward pass
        scores = recommendation_model(embeddings)

        # Generate random negative samples 
        negative_samples = torch.randint(high=num_urls, size=scores.shape, dtype=torch.long)

        # Calculate pairwise ranking loss
        loss = nn.MarginRankingLoss(margin=1.0)(scores, negative_samples, torch.ones_like(scores))

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate total loss
        total_loss += loss.item()

    
    average_loss = total_loss / (batch_idx + 1) 
    print(f'Epoch {epoch + 1}/{num_epochs}, Average Loss: {average_loss:.4f}')

Epoch 1/3: 100%|██████████| 6752/6752 [1:28:49<00:00,  1.27batch/s]


Epoch 1/3, Average Loss: 53671.5213


Epoch 2/3: 100%|██████████| 6752/6752 [1:28:17<00:00,  1.27batch/s]


Epoch 2/3, Average Loss: 53071.8888


Epoch 3/3: 100%|██████████| 6752/6752 [1:31:59<00:00,  1.22batch/s]

Epoch 3/3, Average Loss: 52515.7770





In [40]:
# save the model
torch.save(recommendation_model.state_dict(), 'recommendation_model.pth')

In [44]:
recommendation_model.eval()

RecommendationModel(
  (fc): Linear(in_features=770, out_features=107971, bias=True)
)

In [48]:
input_text = "learning python"

# Tokenize input text
tokenized = tokenizer(input_text, padding="max_length", truncation=True, max_length=32, return_tensors='pt')

# Extract input_ids and attention_mask
input_ids = tokenized['input_ids']
attention_mask = tokenized['attention_mask']

# Forward pass through the BertModel
with torch.no_grad():
    outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)

# Extract the embeddings
embeddings = outputs.last_hidden_state.mean(dim=1)

# Include numeric features
combined_embedding = embeddings.squeeze()

# Convert to NumPy and append to the list
text_embeddings_list = []
text_embeddings_list.append(combined_embedding.numpy())

# Convert the list of embeddings to a single NumPy array
text_embeddings = torch.tensor(text_embeddings_list)


  text_embeddings = torch.tensor(text_embeddings_list)


In [49]:
text_embeddings

tensor([[ 6.2375e-02, -1.7897e-01,  2.8450e-01,  9.4814e-02,  1.8734e-01,
          5.4046e-02, -8.2771e-02, -1.5231e-01,  5.3404e-02, -1.8147e-01,
         -9.3207e-02,  1.1508e-01,  5.9534e-02,  4.5949e-01,  4.7271e-02,
          1.0773e-01, -1.3453e-01,  1.3300e-01, -3.8646e-02, -1.9102e-01,
          1.9086e-02, -3.9585e-02,  2.0024e-01, -1.2829e-01,  1.5668e-02,
          1.5647e-01,  4.0626e-02, -1.7340e-01, -5.8690e-01,  1.4192e-01,
         -1.7934e-01,  1.8025e-02,  1.3375e-01,  2.2254e-01, -2.8376e-01,
         -3.8096e-01, -4.5037e-01, -2.0124e-01, -8.5111e-02, -3.6500e-01,
          2.8822e-01,  8.2506e-03, -5.5807e-02,  9.2722e-02, -2.4299e-01,
         -5.6173e-02, -3.9782e-01,  2.4925e-01, -3.1232e-01, -1.2396e-01,
         -5.9616e-02,  7.1030e-02,  3.1310e-01,  1.0595e-01,  2.8439e-02,
          3.1939e-01,  1.0818e-01, -1.4935e-01,  1.9614e-01,  9.8527e-02,
          3.0562e-01, -1.0174e-01,  4.3365e-01,  1.9365e-01,  1.8735e-01,
         -3.8077e-01,  1.5204e-01,  2.

In [None]:
# Set the model to evaluation mode
recommendation_model.eval()

# Initialize variables for evaluation
total_loss = 0.0


recommended_urls = []
recommended_scores = []

with torch.no_grad():
    for batch_idx, batch in enumerate(tqdm(dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch')):
        # Extract embeddings
        embeddings = batch[0]

        # Forward pass
        scores = recommendation_model(embeddings)

        
        recommended_url_index = scores.argmax().item()

        
        recommended_url = data['url'].unique()[recommended_url_index]

        # Append recommended URL and its score to the lists
        recommended_urls.append(recommended_url)
        recommended_scores.append(scores[0].item())  

        # Generate random negative samples
        negative_samples = torch.randint(high=num_urls, size=scores.shape, dtype=torch.long)

        # Calculate pairwise ranking loss
        loss = nn.MarginRankingLoss(margin=1.0)(scores, negative_samples, torch.ones_like(scores))

        
        total_loss += loss.item()

average_loss = total_loss / (batch_idx + 1)
print(f'Evaluation Loss: {average_loss:.4f}')

for url, score in zip(recommended_urls, recommended_scores):
    print(f'Recommended URL: {url}, Score: {score:.4f}')
