In [3]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import torch
import re
import glob
import clip

In [4]:
clip.available_models()

['RN50',
 'RN101',
 'RN50x4',
 'RN50x16',
 'RN50x64',
 'ViT-B/32',
 'ViT-B/16',
 'ViT-L/14',
 'ViT-L/14@336px']

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [6]:
model, preprocess = clip.load("RN50", device=device)

text = clip.tokenize(["a diagram"]).to(device)

with torch.no_grad():
    text_features = model.encode_text(text)
    

In [7]:
text_features

tensor([[-0.0015,  0.4297, -0.0192,  ..., -0.1411, -0.2632,  0.1774]],
       device='cuda:0', dtype=torch.float16)

In [8]:
text_features.shape

torch.Size([1, 1024])

In [9]:
path = r'E:\competitions\kaggle\magnit_recsys-in-practice\data\jester_dataset_1_joke_texts\jokes'
ff = glob.glob(path + r'\*.html')
len(ff)

100

In [10]:
jokes = []

for filename in ff:
    with open(filename, 'rt') as file:
        rows = file.readlines()

    start_index = None
    end_index = None
    for i, row in enumerate(rows):
        if 'begin of joke' in row:
            start_index = i

        if 'end of joke' in row:
            joke = rows[start_index + 1 : i]

            jokes.append(''.join(joke)
                         .replace('<P>', '')
                         .replace('<p>', '')
                         .replace('</i>', ' ')
                         .replace('<i>', ' ')
                         .replace('<BR>', ' ')
                         .replace('&nbsp;', ' ')
                         .replace('<br>', ' ')
                         .replace('</UL>', ' ')
                         #.replace('\n', ' ')
                        )

            continue

In [11]:
jokes[0]

'A man visits the doctor. The doctor says "I have bad news for you.You have\ncancer and Alzheimer\'s disease". \nThe man replies "Well,thank God I don\'t have cancer!"\n'

In [12]:
re.split("\.", 'A man visits the doctor. The ')

['A man visits the doctor', ' The ']

In [13]:
jokes_splited = [re.split('\.|\?|\n|"', j) for j in jokes]
jokes_splited

[['A man visits the doctor',
  ' The doctor says ',
  'I have bad news for you',
  'You have',
  "cancer and Alzheimer's disease",
  '',
  ' ',
  'The man replies ',
  "Well,thank God I don't have cancer!",
  '',
  ''],
 ['Two cannibals are eating a clown, one turns to other and says: ',
  '',
  'Does this taste funny to you',
  '  ',
  ''],
 ['Q: Whats the difference between greeting a Queen and greeting the',
  'President of the United  States',
  '',
  '',
  'A: You only have to get on  one knee  to greet the queen',
  '',
  ''],
 ['Q',
  ' What do a hurricane, a tornado, and a redneck',
  'divorce all have in common',
  ' ',
  'A',
  " Someone's going to lose their trailer",
  '',
  '',
  '',
  ''],
 ['A guy stood over his tee shot for what seemed an eternity, looking up, looking down, measuring the distance,',
  'figuring the wind direction and speed',
  ' Driving his partner nuts',
  '',
  '',
  'Finally his exasperated partner says, ',
  'What the hell is taking so long',
  ' Hi

In [14]:
jjj = [[xx for xx in x if len(xx) < 78 and len(xx) > 3] for x in jokes_splited]

In [22]:
vectors = np.zeros((100, 1024))
for i, joke_list in enumerate(jjj):
    vec = []
    text = clip.tokenize(joke_list).to(device)
    text_features = model.encode_text(text)
    vectors[i] = text_features.mean(dim=0).detach().cpu().numpy()

In [23]:
vectors

array([[ 0.03105164,  0.03314209, -0.06292725, ..., -0.09759521,
         0.11785889, -0.16552734],
       [ 0.01971436, -0.07781982, -0.00900269, ...,  0.19787598,
        -0.22119141, -0.28955078],
       [-0.18530273, -0.1517334 , -0.20275879, ..., -0.12017822,
        -0.0300293 ,  0.04656982],
       ...,
       [-0.04412842,  0.07440186,  0.01824951, ..., -0.01734924,
        -0.0682373 ,  0.07849121],
       [-0.11895752,  0.02070618,  0.0692749 , ..., -0.02378845,
        -0.26806641, -0.10827637],
       [ 0.0065918 ,  0.21984863,  0.08905029, ...,  0.02030945,
         0.2364502 , -0.01322174]])

In [25]:
joke_df = pd.DataFrame(vectors, index=np.arange(1, 101), columns=[f'joke_feature_{i}' for i in np.arange(1, 1025)])
joke_df

Unnamed: 0,joke_feature_1,joke_feature_2,joke_feature_3,joke_feature_4,joke_feature_5,joke_feature_6,joke_feature_7,joke_feature_8,joke_feature_9,joke_feature_10,...,joke_feature_1015,joke_feature_1016,joke_feature_1017,joke_feature_1018,joke_feature_1019,joke_feature_1020,joke_feature_1021,joke_feature_1022,joke_feature_1023,joke_feature_1024
1,0.031052,0.033142,-0.062927,0.001827,-0.069397,0.123657,-0.024490,0.481445,0.014511,0.385742,...,0.020935,-0.134521,0.096436,-0.000384,-0.001071,0.010735,0.130371,-0.097595,0.117859,-0.165527
2,0.019714,-0.077820,-0.009003,0.004494,-0.200439,0.022064,-0.172241,0.156494,0.342773,0.535645,...,0.020630,-0.012466,0.226318,0.251465,0.000111,-0.013855,-0.059692,0.197876,-0.221191,-0.289551
3,-0.185303,-0.151733,-0.202759,0.001920,-0.098267,0.104614,-0.105408,0.033905,-0.152466,0.369873,...,-0.109802,-0.104004,0.071106,-0.091553,-0.002798,0.140747,0.096985,-0.120178,-0.030029,0.046570
4,-0.025711,0.190796,-0.258301,-0.000157,0.060974,0.110229,0.083435,0.231323,0.117981,0.212036,...,0.001912,-0.245361,-0.187256,0.283936,0.000828,0.159180,0.045380,-0.074829,-0.180054,-0.139038
5,0.097961,0.133179,0.030548,-0.000371,0.009476,-0.161133,0.164185,0.279053,0.012589,0.288818,...,0.081665,0.017975,-0.038208,0.101379,-0.002028,-0.142822,0.081360,0.052551,0.016586,0.026123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,-0.044159,-0.155273,-0.007950,0.005970,-0.259277,0.087158,0.019363,0.454346,0.176636,0.286621,...,-0.014793,-0.420898,-0.092712,0.063293,-0.001143,-0.095703,0.110107,0.151367,0.088074,0.049377
97,-0.049957,0.015289,0.134888,0.003126,-0.018814,-0.073181,-0.150146,0.029053,0.182983,0.278809,...,-0.064697,-0.102966,-0.078735,0.043182,-0.002499,-0.022293,0.076721,0.062622,0.050507,-0.098389
98,-0.044128,0.074402,0.018250,0.002848,0.047882,-0.030807,0.199829,0.459717,0.079712,0.290283,...,-0.036774,-0.217529,0.041138,0.084045,-0.003759,0.060028,-0.001889,-0.017349,-0.068237,0.078491
99,-0.118958,0.020706,0.069275,0.000482,-0.101624,-0.027802,0.229248,0.315918,0.085938,0.330566,...,0.030960,-0.165527,-0.009705,-0.070251,-0.000473,0.073547,0.042023,-0.023788,-0.268066,-0.108276


In [26]:
joke_df.to_csv('joke_clip_vectors.csv', index=False)