In [None]:
import logging
import os
import shutil 

import numpy as np
from google.colab import drive

# Seed NumPy's global random generator for reproducibility.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Timestamped INFO-level logging used by the rest of the notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s : %(message)s')
logger = logging.getLogger(__name__)
logger.info('Инициализировали логгер')

# Mount Google Drive; later paths are built relative to this mount point.
ROOT_DIR = '/content/drive'
drive.mount(ROOT_DIR)
logger.info('Подключили диск')

# Stop right away when the data directory is missing on the mounted drive,
# otherwise log what it contains.
root_data_dir = os.path.join(ROOT_DIR, 'MyDrive', 'hse_nlp_2021')
if os.path.exists(root_data_dir):
  logger.info('Содержимое директории %s: %s', root_data_dir, os.listdir(root_data_dir))
else:
  raise RuntimeError('Отсутствует директория с данными')

Включаем GPU

Подробнее о модели [тут](https://pytorch.org/hub/pytorch_vision_resnet/)

![gpu_on_colab](img/gpu_on_colab_gui.png)

In [None]:
from torchvision.models import resnet18

import torch  # needed here for the CUDA availability check

# Directory on the mounted drive that is handed to torch as TORCH_HOME below.
TORCH_MODELS_DIR = os.path.join(ROOT_DIR, 'MyDrive', 'hse_nlp_2021', 'torch_models')
# Idempotent: does nothing when the directory already exists.
os.makedirs(TORCH_MODELS_DIR, exist_ok=True)

os.environ['TORCH_HOME'] = TORCH_MODELS_DIR # TORCH_MODEL_ZOO is deprecated
rn18 = resnet18(pretrained=True)

# Run on the GPU when one is attached, otherwise fall back to the CPU instead
# of crashing (the later cells already guard on cuda.is_available()).
rn18_device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# eval(): the model is only used for inference, so BatchNorm layers must use
# their stored running statistics rather than per-batch statistics.
rn18 = rn18.to(rn18_device).eval()
logger.info('Модель загружена')

# Эмбеддинги картинок

Для начала посмотрим, какие слои есть в сети

Кроме `._modules` можно было воспользоваться `.named_children()`

In [None]:
# Print the names of the model's top-level sub-modules, one per line.
for layer_name in rn18._modules.keys():
  print(layer_name)

Нам нужен слой `layer4`

Каждый слой — это, по сути, массив с весами модели. Нам нужно оставить все слои до интересующего нас слоя включительно

In [None]:
from torch import nn

class FeatureExtractor(nn.Module):
    """Prefix of a model: its top-level children up to and including one layer.

    The children of ``torch_model`` are collected in ``named_children()`` order
    until the child called ``output_layer`` is reached (inclusive) and are then
    chained into an ``nn.Sequential`` stored as ``self.net``.

    Args:
        output_layer: name of the last top-level child to keep.
        torch_model: module whose children are reused (not copied).

    Raises:
        ValueError: if ``torch_model`` has no top-level child named
            ``output_layer``. Without this check the whole model would be
            wrapped silently.
    """

    def __init__(self, output_layer, torch_model):
        super().__init__()
        self.output_layer = output_layer
        # The attribute is kept (always None) for backward compatibility; the
        # source model is intentionally not registered as a sub-module.
        self.pretrained = None

        available_layers = [name for name, _ in torch_model.named_children()]
        if output_layer not in available_layers:
            raise ValueError(
                'Layer %r not found among top-level layers: %s'
                % (output_layer, available_layers)
            )

        self.children_list = []
        for name, child in torch_model.named_children():
            self.children_list.append(child)
            if name == output_layer:
                logger.info('final layer archived: %s', output_layer)
                break

        self.net = nn.Sequential(*self.children_list)

    def forward(self, x):
        """Pass ``x`` through the kept layers and return the result."""
        return self.net(x)

Создаём объект-экстрактор с выбранным слоем

In [None]:
# Inference only: eval() makes BatchNorm layers use their stored running
# statistics instead of statistics of the current (single-image) batch.
resnet_extractor = FeatureExtractor(output_layer='avgpool', torch_model=rn18).eval()


Проверяем директорию с картинками

In [None]:
# Directory on the mounted drive with the meme files.
ROOT_MEMES_DIR = os.path.join(ROOT_DIR, 'MyDrive', 'hse_nlp_2021', 'memes')

# Show the first ten directory entries as the cell output.
meme_dir_entries = os.listdir(ROOT_MEMES_DIR)
meme_dir_entries[:10]

In [None]:
from PIL import Image
from torchvision import transforms

# Open a single meme by name to try the pipeline on one image first.
sample_meme_name = '7f3ywc'
filename = os.path.join(ROOT_MEMES_DIR, sample_meme_name)
input_image = Image.open(filename)

In [None]:
# Show which class Image.open returned for the sample file.
type(input_image)

In [None]:
from torch import no_grad, cuda
import torchvision.transforms as transforms

# Resize, centre-crop, convert to a tensor and normalise per channel.
preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
image_tensor = preprocess(input_image)
# Add a leading batch dimension: the model expects a mini-batch.
input_batch = image_tensor[None]

# Use the GPU for both the input and the extractor when one is available.
gpu_available = cuda.is_available()
if gpu_available:
    resnet_extractor.to('cuda')
    input_batch = input_batch.to('cuda')

# Gradients are not needed for feature extraction.
with no_grad():
    output = resnet_extractor(input_batch)

numpy_vector = output.flatten().cpu().numpy()
print(type(output), output.size())

In [None]:
import torchvision.transforms as transforms
from PIL.JpegImagePlugin import JpegImageFile
from torch import no_grad, cuda
from PIL import Image, UnidentifiedImageError

def img2embedding(input_meme_filename: str) -> np.ndarray:
  """Compute an embedding of one image file with ``resnet_extractor``.

  The image is opened, converted to RGB, resized/cropped/normalised and passed
  through the extractor without gradient tracking.

  Args:
    input_meme_filename: path to the image file.

  Returns:
    Flattened extractor output as a 1-D NumPy array. When the file cannot be
    read or preprocessed, a zero vector of length 512 is returned instead;
    callers use the all-zero vector as the failure marker.
  """
  OUTPUT_SHAPE = 512  # length of the failure vector; presumably the avgpool width of ResNet-18
  failure_vector = np.zeros(OUTPUT_SHAPE)
  preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
  ])

  try:
    # The context manager closes the file handle. convert('RGB') gives the
    # three channels that Normalize expects, so grayscale / RGBA / palette
    # images are embedded instead of being reported as errors.
    with Image.open(input_meme_filename) as input_img:
      image_tensor = preprocess(input_img.convert('RGB'))
  except (UnidentifiedImageError, OSError, RuntimeError) as err:
    # OSError also covers truncated or otherwise unreadable files.
    logger.warning('cannot embed %s: %s', input_meme_filename, err)
    return failure_vector

  input_batch = image_tensor.unsqueeze(0)  # create a mini-batch as expected by the model

  # Move the input and the model to the GPU for speed if available.
  if cuda.is_available():
    input_batch = input_batch.to('cuda')
    resnet_extractor.to('cuda')

  with no_grad():
    output = resnet_extractor(input_batch)
  return output.reshape(-1).cpu().numpy()


import pickle  # imported here: the cell that imports pickle runs only after this one

EMBEDS_DUMP_PATH = os.path.join(ROOT_MEMES_DIR, 'embed.npy')
FILE_INDEX_DUMP_PATH = os.path.join(ROOT_MEMES_DIR, 'file_index.pkl')
# Files this notebook itself writes into the memes directory; they are not images.
NON_IMAGE_FILES = {'embed.npy', 'file_index.pkl', 'dbscan_clusters.csv'}

# Use the cache only when both halves (matrix and index) are present.
if os.path.exists(EMBEDS_DUMP_PATH) and os.path.exists(FILE_INDEX_DUMP_PATH):
  embeds_matrix = np.load(EMBEDS_DUMP_PATH)
  # NOTE(review): pickle.load can execute arbitrary code; only load an index
  # file that was written by this notebook.
  with open(FILE_INDEX_DUMP_PATH, 'rb') as f:
    file_index = pickle.load(f)
  logger.info('files loaded from dump')
else:
  res = []  # one embedding per successfully processed image
  file_index = {}  # row number in the matrix -> {'f_name': file name}
  TOP = 3092
  error_files = []
  logger.info('Processing started')
  dense_index = 0
  # Skip the notebook's own artefacts and anything that is not a regular file.
  meme_names = [
    name for name in os.listdir(ROOT_MEMES_DIR)
    if name not in NON_IMAGE_FILES and os.path.isfile(os.path.join(ROOT_MEMES_DIR, name))
  ]
  for f_name in meme_names[:TOP]:
    meme_filename = os.path.join(ROOT_MEMES_DIR, f_name)
    img_embed = img2embedding(meme_filename)
    if not img_embed.any():
      # An all-zero vector is the failure marker of img2embedding. Failed
      # files get neither a matrix row nor an index entry, so every index
      # entry refers to a real embedding of an existing file.
      error_files.append(meme_filename)
      continue
    # Keep the embedding (stacked into a matrix below) and, separately, the
    # file name for this matrix row.
    res.append(img_embed)
    file_index[dense_index] = {'f_name': f_name}
    dense_index += 1

  if len(error_files) > 0:
    logger.info('num errors %d', len(error_files))
    # NOTE(review): this permanently deletes the unreadable files from the drive.
    for i in error_files:
      os.remove(i)

  if not res:
    raise RuntimeError('No image could be embedded from %s' % ROOT_MEMES_DIR)
  embeds_matrix = np.vstack(res)

logger.info(embeds_matrix.shape)

In [None]:
import numpy as np
import pickle

# Store the embedding matrix and the row -> file-name index in the memes directory.
embeds_dump_path = os.path.join(ROOT_MEMES_DIR, 'embed.npy')
index_dump_path = os.path.join(ROOT_MEMES_DIR, 'file_index.pkl')

np.save(embeds_dump_path, embeds_matrix)
with open(index_dump_path, 'wb') as index_file:
  pickle.dump(file_index, index_file)

Устанавливаем umap 

пример из [официальной документации](https://umap-learn.readthedocs.io/en/latest/basic_usage.html)

In [None]:
!pip install umap-learn==0.5.2

In [None]:
import umap
from sklearn.preprocessing import StandardScaler

# A fixed random_state keeps the projection, and the cluster ids derived from
# it further down, the same between runs.
reducer = umap.UMAP(random_state=RANDOM_SEED)

# Standardise every embedding dimension before the projection.
scaled_memes_data = StandardScaler().fit_transform(embeds_matrix)
low_rank_matrix = reducer.fit_transform(scaled_memes_data)
logger.info('low rank matrix shape %s', low_rank_matrix.shape)

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt

# Scatter plot of the two projected coordinates.
fig, ax = plt.subplots()
ax.scatter(low_rank_matrix[:, 0], low_rank_matrix[:, 1])
ax.set_aspect('equal', 'datalim')
ax.set_title('UMAP projection of the memes dataset', fontsize=24)
plt.show()

Получился один "кластер"

Выполняем кластеризацию в низкоразмерном пространстве с помощью DBSCAN, чтобы выделить метки кластеров

In [None]:
from sklearn.cluster import DBSCAN

# Cluster the projected points; `classes` holds one label per row.
clstr = DBSCAN(eps=0.10, min_samples=4)
classes = clstr.fit(low_rank_matrix).labels_
logger.info('num classes %s', len(np.unique(classes)))

# Same projection as before, coloured by cluster label.
fig, ax = plt.subplots()
ax.scatter(
    low_rank_matrix[:, 0],
    low_rank_matrix[:, 1],
    c=classes,
    cmap='rainbow',
    alpha=0.7,
)
ax.set_aspect('equal', 'datalim')
ax.set_title('UMAP projection of the memes dataset', fontsize=24)
plt.show()

In [None]:
import pandas as pd
import os

# Reuse the saved cluster table when it exists; otherwise build it from the
# file index and the DBSCAN labels.
clusters_filename = os.path.join(ROOT_MEMES_DIR, 'dbscan_clusters.csv')
if not os.path.exists(clusters_filename):
  df_rows = [
    (file_index[meme_index]['f_name'], classes[meme_index])
    for meme_index in range(low_rank_matrix.shape[0])
  ]
  memes_df = pd.DataFrame(df_rows, columns=['f_name', 'dbscan_cluster'])
else:
  memes_df = pd.read_csv(clusters_filename)
  logger.info('loaded_from %s', clusters_filename)

# Log the ten most frequent cluster labels with their counts.
top_cluster_sizes = memes_df['dbscan_cluster'].value_counts().head(10).to_dict()
logger.info('%s', top_cluster_sizes)
memes_df.head(5)

In [None]:
# Save the cluster table without the dataframe index column.
clusters_dump_path = os.path.join(ROOT_MEMES_DIR, 'dbscan_clusters.csv')
memes_df.to_csv(clusters_dump_path, index=False)

Видно, что есть кластер с индексом `0`, где находится большая часть контента, и меньшие по мощности кластеры. Для сравнения визуализируем кластер с индексом `3` и кластер с индексом `4`

In [None]:
import shutil

# Files the notebook itself stores in the memes directory; they keep their names
# (renaming dbscan_clusters.csv would break the cache check for that file).
NOTEBOOK_ARTIFACTS = ('file_index.pkl', 'embed.npy', 'dbscan_clusters.csv')

for f_name in os.listdir(ROOT_MEMES_DIR):
  src_path = os.path.join(ROOT_MEMES_DIR, f_name)
  # Skipping names that already end with '.jpeg' makes the cell safe to re-run;
  # only regular files are renamed.
  if f_name in NOTEBOOK_ARTIFACTS or f_name.endswith('.jpeg') or not os.path.isfile(src_path):
    continue
  os.rename(src_path, src_path + '.jpeg')

In [None]:
from IPython.display import Image as NotebookImage
from IPython.display import display

def visualise_cluster(cluster_id: int, top=10):
  """Display the first ``top`` images of ``memes_df`` that belong to a cluster.

  Args:
    cluster_id: value of the ``dbscan_cluster`` column to select.
    top: maximum number of images to show.
  """
  in_cluster = memes_df['dbscan_cluster'] == cluster_id
  for f_name in memes_df.loc[in_cluster, 'f_name'].head(top):
    tmp_file_path = os.path.join(ROOT_MEMES_DIR, f_name + '.jpeg')
    display(NotebookImage(filename=tmp_file_path, width=200))

visualise_cluster(cluster_id=14)

In [None]:
# Show up to 10 images (the default `top`) from cluster 16.
visualise_cluster(cluster_id=16)

In [None]:
# Show up to 10 images (the default `top`) from cluster 5.
visualise_cluster(cluster_id=5)