In [1]:
!pip install torch transformers













In [2]:
import pandas as pd

In [3]:
import torch
from torch import nn
from transformers import DistilBertTokenizer, DistilBertModel

class TextLikesPredictionModel(nn.Module):
    """DistilBERT encoder followed by a small MLP that regresses one value.

    The embedding at sequence position 0 of the encoder's last hidden state is
    passed through ``Linear -> ReLU -> Linear`` to produce a single scalar per
    example.

    Args:
        pretrained_model_name: Checkpoint name or path handed to
            ``DistilBertModel.from_pretrained``.
        hidden_dim: Width of the intermediate layer of the regression head.
            Defaults to 180, the value that used to be hard-coded.
    """

    def __init__(self, pretrained_model_name='distilbert-base-uncased', hidden_dim=180):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(pretrained_model_name)
        self.regression_head = nn.Sequential(
            # Input width follows the encoder's own hidden size, so any
            # checkpoint with a `config.hidden_size` attribute fits.
            nn.Linear(self.bert.config.hidden_size, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),  # Regression output
        )

    def forward(self, input_ids, attention_mask):
        """Encode the batch and return one regression value per example.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.

        Returns:
            Tensor whose last dimension has size 1 (one prediction per row of
            the batch).
        """
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Keep only the first position of every sequence (the [CLS] token).
        cls_embedding = outputs.last_hidden_state[:, 0, :]
        return self.regression_head(cls_embedding)


In [4]:
from transformers import DistilBertTokenizer

# Load the tokenizer for 'distilbert-base-uncased' (the same checkpoint name
# TextLikesPredictionModel uses by default) and bind it to the module-level
# name `tokenizer`, which preprocess_text below reads.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the text data
def preprocess_text(texts, max_length=180):
    """Tokenize ``texts`` with the module-level ``tokenizer``.

    Padding and truncation are both enabled, the length cap is ``max_length``,
    and the tokenizer is asked for PyTorch tensors (``return_tensors='pt'``).

    Args:
        texts: Text input forwarded unchanged to the tokenizer.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the tokenizer output.
    """
    tokenizer_options = dict(
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt',
    )
    batch = tokenizer(texts, **tokenizer_options)
    input_ids, attention_mask = batch['input_ids'], batch['attention_mask']
    return input_ids, attention_mask

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
from torch.utils.data import Dataset, DataLoader

class TextLikesDataset(Dataset):
    """Map-style dataset yielding a tokenized text and a float target per row.

    Args:
        csv_file: Anything ``pandas.read_csv`` accepts (path or file-like).
        tokenizer: Callable invoked on one text at a time with
            ``padding='max_length'``, ``truncation=True``, ``max_length`` and
            ``return_tensors='pt'``. Its result must expose ``'input_ids'`` and
            ``'attention_mask'`` tensors with a leading dimension of size 1.
        max_length: Sequence length every example is padded/truncated to.
        text_column: Name of the CSV column holding the text.
        target_column: Name of the CSV column holding the regression target.

    Raises:
        KeyError: If either column is absent from the CSV.
    """

    def __init__(self, csv_file, tokenizer, max_length=128,
                 text_column='formatted_text', target_column='likes'):
        self.data = pd.read_csv(csv_file)
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.text_column = text_column
        self.target_column = target_column

        # Fail at construction time instead of on the first __getitem__ call.
        missing = [col for col in (text_column, target_column) if col not in self.data.columns]
        if missing:
            raise KeyError(f"Columns {missing} not found in CSV; available: {list(self.data.columns)}")

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``target`` for row ``idx``."""
        # Select the column first: this avoids building a whole mixed-dtype
        # row Series (twice) just to read two cells.
        text = self.data[self.text_column].iloc[idx]
        target = self.data[self.target_column].iloc[idx]

        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors='pt',
        )

        # squeeze(0) removes only the tokenizer's batch dimension. A bare
        # squeeze() would also drop the sequence dimension when max_length == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'target': torch.tensor(float(target), dtype=torch.float),
        }

# Example DataLoader
train_dataset = TextLikesDataset(csv_file='/kaggle/input/corpus2/pre_proc_train.csv', tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Train on the GPU when one is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize model and optimizer
model = TextLikesPredictionModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
criterion = nn.MSELoss()

# Training loop
for epoch in range(5):  # Number of epochs
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        # Batches come from the DataLoader on the CPU; move them next to the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['target'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also collapse the batch dimension when the last batch holds a
        # single example, leaving a 0-d prediction against a (1,) target.
        loss = criterion(outputs.squeeze(-1), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch.
    print(f'Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}')


# Finding the max length of tokenized text


In [None]:

from transformers import DistilBertTokenizer

# Re-create the tokenizer for this cell (you can replace 'distilbert-base-uncased'
# with any other checkpoint). Note that this rebinds the module-level `tokenizer`.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the training CSV (the same file the training dataset reads); the text is
# taken from its 'formatted_text' column further down.
df = pd.read_csv('/kaggle/input/corpus2/pre_proc_train.csv')

# Function to calculate token lengths
def get_max_token_length(texts):
    """Count the untruncated number of tokens in every text.

    Each text is tokenized individually with the module-level ``tokenizer``
    and ``truncation=False`` so the real length is measured.

    Args:
        texts: Iterable of texts to measure.

    Returns:
        Tuple ``(max_length, all_lengths)``: the largest token count (0 when
        ``texts`` is empty) and the list of per-text token counts in input
        order.
    """
    # Only the length is needed, so take it from the plain id list instead of
    # building a tensor for every text.
    all_lengths = [len(tokenizer(text, truncation=False)['input_ids']) for text in texts]
    return max(all_lengths, default=0), all_lengths

# Measure the token length of every entry in the 'formatted_text' column;
# `all_lengths` is kept for the histogram in the next cell.
max_length, all_lengths = get_max_token_length(df['formatted_text'].tolist())

print(f"Maximum number of tokens in the dataset: {max_length}")


In [None]:
import matplotlib.pyplot as plt

# Histogram (50 bins) of the per-text token counts collected in `all_lengths`.
plt.hist(all_lengths, bins=50)
# Label the axes that plt.hist just drew on in a single call.
plt.gca().set(
    xlabel='Number of Tokens',
    ylabel='Frequency',
    title='Token Length Distribution',
)
plt.show()