In [1]:
import pandas as pd
from torch.utils.data import DataLoader

In [2]:
# Location of the posts table; its 'Unnamed: 0' column is used as the index.
POSTS_CSV_PATH = '/kaggle/input/text-final/post_text_df.csv'
df = pd.read_csv(POSTS_CSV_PATH, index_col='Unnamed: 0')

In [3]:
# Preview the first five rows of the posts table.
df.head(n=5)

Unnamed: 0,post_id,text,topic
0,1,UK economy facing major risks\n\nThe UK manufa...,business
1,2,Aids and climate top Davos agenda\n\nClimate c...,business
2,3,Asian quake hits European shares\n\nShares in ...,business
3,4,India power shares jump on debut\n\nShares in ...,business
4,5,Lacroix label bought by US firm\n\nLuxury good...,business


In [4]:
### Сделаем эмбеддинги постов

from transformers import DistilBertModel  # https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel


def get_model_distilbert(checkpoint='distilbert-base-cased'):
    """Load a pretrained DistilBERT tokenizer and encoder.

    Args:
        checkpoint: Hugging Face checkpoint name or local path passed to
            ``from_pretrained`` for both the tokenizer and the model.
            Defaults to ``'distilbert-base-cased'``.

    Returns:
        A ``(tokenizer, model)`` tuple, in that order.
    """
    # Imported here because the notebook only imports DistilBertModel at cell
    # level; this keeps the function runnable on a fresh kernel.
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = DistilBertModel.from_pretrained(checkpoint)
    return tokenizer, model

In [5]:
# Load the pretrained pair once, then unpack it into the names used below.
tokenizer_and_model = get_model_distilbert()
tokenizer, model = tokenizer_and_model

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
### Сделаем датасет для постов

from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding


class PostDataset(Dataset):
    """Map-style dataset over post texts that are tokenized once, up front.

    Each item is a dict with the ``input_ids`` and ``attention_mask`` rows of
    a single post.
    """

    # Keys of the tokenizer output that are exposed per item.
    _ITEM_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, texts, tokenizer):
        """Tokenize every text in a single batch call.

        Args:
            texts: Iterable of strings (list, tuple, pandas Series, ...).
                A single bare string is treated as one text.
            tokenizer: Callable tokenizer; it is invoked with the keyword
                arguments shown below and must return a mapping that has
                ``input_ids`` and ``attention_mask`` entries.
        """
        super().__init__()

        # A bare string would otherwise be split into characters by list().
        if isinstance(texts, str):
            texts = [texts]
        else:
            texts = list(texts)

        # Call the tokenizer directly: batch_encode_plus is deprecated in
        # favour of __call__, which accepts the same arguments.
        # NOTE: padding=True pads to the longest sequence of this call, i.e.
        # of the whole corpus, so every item already has the same length.
        self.texts = tokenizer(
            texts,
            add_special_tokens=True,
            return_token_type_ids=False,
            return_tensors='pt',
            truncation=True,
            padding=True,
        )
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """Return the ``input_ids`` / ``attention_mask`` rows at ``idx``."""
        return {key: self.texts[key][idx] for key in self._ITEM_KEYS}

    def __len__(self):
        """Number of tokenized texts."""
        return len(self.texts['input_ids'])
    
    
# Tokenize all posts once, then serve them in order (no shuffling) in batches of 32.
post_texts = df['text'].values.tolist()
dataset = PostDataset(post_texts, tokenizer)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

loader = DataLoader(
    dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator,
    pin_memory=True,
)

# Pull the first batch to inspect what the loader yields.
b = next(iter(loader))

b

{'input_ids': tensor([[  101,  1993,  4190,  ...,     0,     0,     0],
        [  101, 15386,  1116,  ...,  1300,  1107,   102],
        [  101,  3141,   186,  ..., 14099,  8478,   102],
        ...,
        [  101, 16972, 20647,  ...,     0,     0,     0],
        [  101,   137,   188,  ...,     0,     0,     0],
        [  101,   144, 22731,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [7]:
import torch
from tqdm import tqdm


@torch.inference_mode()
def get_embeddings_labels(model, loader, device=None):
    """Encode every batch of ``loader`` and collect the first-token embeddings.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes ``'last_hidden_state'``. It is switched to
            eval mode.
        loader: Iterable of batches; each batch must provide ``'input_ids'``
            and ``'attention_mask'`` tensors.
        device: Device the batch tensors are moved to. When ``None`` (the
            default) the device of the model's first parameter is used, so
            the function does not depend on a global ``device`` variable.

    Returns:
        A CPU tensor with one row per input sample: the hidden state at
        sequence position 0, concatenated over all batches.
    """
    model.eval()

    if device is None:
        # Follow the model instead of relying on notebook-level state.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    total_embeddings = []

    for batch in tqdm(loader):
        # Forward only the two inputs the encoder needs, placed on the model's device.
        batch = {key: batch[key].to(device) for key in ('attention_mask', 'input_ids')}

        # Position 0 of the last hidden state is used as the sequence embedding.
        embeddings = model(**batch)['last_hidden_state'][:, 0, :]

        # Move results off the accelerator right away so they do not pile up there.
        total_embeddings.append(embeddings.cpu())

    return torch.cat(total_embeddings, dim=0)

In [8]:
# Prefer the first CUDA GPU and fall back to the CPU when CUDA is unavailable.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

print(device)
# Querying the GPU name raises on CPU-only machines, so only ask when a GPU was selected.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(device))

model = model.to(device)

cuda:0
Tesla T4


In [9]:
# Encode every post, then convert the resulting CPU tensor to a NumPy array.
embeddings_tensor = get_embeddings_labels(model, loader)
embeddings = embeddings_tensor.numpy()

embeddings

100%|██████████| 220/220 [02:03<00:00,  1.79it/s]


array([[ 3.63150865e-01,  4.89376076e-02, -2.64081180e-01, ...,
        -1.41593322e-01,  1.59181338e-02,  9.17690195e-05],
       [ 2.36416355e-01, -1.59500718e-01, -3.27798098e-01, ...,
        -2.89936095e-01,  1.19365320e-01, -1.62343075e-03],
       [ 3.75191331e-01, -1.13944076e-01, -2.40547031e-01, ...,
        -3.38919759e-01,  5.86940572e-02, -2.12656837e-02],
       ...,
       [ 3.40382695e-01,  6.64923266e-02, -1.63184404e-01, ...,
        -8.65628570e-02,  2.03403920e-01,  3.20906118e-02],
       [ 4.32092220e-01,  1.10915992e-02, -1.17306016e-01, ...,
         7.54013509e-02,  1.02739379e-01,  1.52743552e-02],
       [ 3.04277390e-01, -7.62156919e-02, -6.77587613e-02, ...,
        -5.43488450e-02,  2.44383574e-01, -1.41483713e-02]], dtype=float32)

In [10]:
### Cluster the texts

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

N_PCA_COMPONENTS = 20
N_CLUSTERS = 15
RANDOM_STATE = 0

# Centre each embedding dimension on its own mean. The previous scalar
# `embeddings.mean()` shifted every feature by the same constant; PCA
# re-centres per feature internally, so the projection is unaffected, but
# `centered` now actually holds centred data.
centered = embeddings - embeddings.mean(axis=0)

# Fix the seed: PCA may pick the randomized SVD solver for inputs like this,
# which is not reproducible without a random_state.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=RANDOM_STATE)
pca_decomp = pca.fit_transform(centered)

# n_init is set explicitly because its default differs between scikit-learn
# versions; 10 is the historical default.
kmeans = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=RANDOM_STATE).fit(pca_decomp)

# Labels are 0-based, while the distance columns below are numbered from 1:
# label k corresponds to column 'DistanceTo{k + 1}thCluster'.
df['TextCluster'] = kmeans.labels_

dists_columns = [f'DistanceTo{i}thCluster' for i in range(1, N_CLUSTERS + 1)]

# Reuse df's index so that a later column-wise concat with df lines up row by row.
dists_df = pd.DataFrame(
    data=kmeans.transform(pca_decomp),
    columns=dists_columns,
    index=df.index,
)

dists_df.head()

Unnamed: 0,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster
0,3.549044,3.294883,3.433284,2.672924,3.270944,1.96934,1.554703,2.116606,3.231525,3.299225,3.492655,2.803745,3.362657,3.293774,3.345579
1,3.260293,3.184324,3.201337,2.345156,3.233457,1.943692,1.427023,2.07467,3.188464,2.772893,3.160858,2.601588,3.09643,3.033695,2.978372
2,3.312693,3.237956,3.369599,2.731229,3.390497,2.831895,1.355913,2.183056,3.123334,2.775962,3.131239,2.874568,3.287189,3.129553,3.007674
3,3.038347,3.618477,3.462424,3.229656,3.616267,3.20552,2.125577,2.58446,3.383063,3.560984,3.706457,3.126458,3.934969,3.554933,3.667043
4,2.918062,2.587985,2.868014,1.804202,2.570554,2.674797,1.346376,1.54151,2.832907,2.41916,2.773459,2.392756,3.013143,2.643341,2.541931


In [12]:
# Put the post columns and the per-cluster distance columns side by side.
posts_info = pd.concat([df, dists_df], axis='columns')

posts_info.head(n=5)

Unnamed: 0,post_id,text,topic,TextCluster,DistanceTo1thCluster,DistanceTo2thCluster,DistanceTo3thCluster,DistanceTo4thCluster,DistanceTo5thCluster,DistanceTo6thCluster,DistanceTo7thCluster,DistanceTo8thCluster,DistanceTo9thCluster,DistanceTo10thCluster,DistanceTo11thCluster,DistanceTo12thCluster,DistanceTo13thCluster,DistanceTo14thCluster,DistanceTo15thCluster
0,1,UK economy facing major risks\n\nThe UK manufa...,business,6,3.549044,3.294883,3.433284,2.672924,3.270944,1.96934,1.554703,2.116606,3.231525,3.299225,3.492655,2.803745,3.362657,3.293774,3.345579
1,2,Aids and climate top Davos agenda\n\nClimate c...,business,6,3.260293,3.184324,3.201337,2.345156,3.233457,1.943692,1.427023,2.07467,3.188464,2.772893,3.160858,2.601588,3.09643,3.033695,2.978372
2,3,Asian quake hits European shares\n\nShares in ...,business,6,3.312693,3.237956,3.369599,2.731229,3.390497,2.831895,1.355913,2.183056,3.123334,2.775962,3.131239,2.874568,3.287189,3.129553,3.007674
3,4,India power shares jump on debut\n\nShares in ...,business,6,3.038347,3.618477,3.462424,3.229656,3.616267,3.20552,2.125577,2.58446,3.383063,3.560984,3.706457,3.126458,3.934969,3.554933,3.667043
4,5,Lacroix label bought by US firm\n\nLuxury good...,business,6,2.918062,2.587985,2.868014,1.804202,2.570554,2.674797,1.346376,1.54151,2.832907,2.41916,2.773459,2.392756,3.013143,2.643341,2.541931


In [13]:
# Write the enriched posts table to the working directory (index included).
posts_info.to_csv(path_or_buf='post_text_df.csv')