In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

# Load the pretrained tokenizer and the TensorFlow BERT encoder from the same
# checkpoint name; both are used as notebook-wide globals by later cells.
BERT_CHECKPOINT = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFBertModel.from_pretrained(BERT_CHECKPOINT)


Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [2]:
# Load the video metadata table; later cells read its 'title' and 'videoid' columns.
df = pd.read_csv('endf1201.csv')

In [3]:
# Display the loaded frame (the notebook renders a truncated view of rows and columns).
df

Unnamed: 0.1,Unnamed: 0,videoid,publish_time,channelId,title,cover_url,channelTitle,tag,duration,caption,...,country_UA,country_UG,country_US,country_UY,country_UZ,country_VG,country_VN,country_YE,country_ZA,country_ZW
0,0,jNQXAC9IVRw,2005-04-24T03:31:52Z,UC4QobU6STFB0P71PMvOGN5A,Me at the zoo,https://i.ytimg.com/vi/jNQXAC9IVRw/hqdefault.jpg,jawed,"me at the zoo,jawed karim,first youtube video",PT19S,True,...,0,0,1,0,0,0,0,0,0,0
1,1,1raUvGNbZFg,2006-10-20T15:18:57Z,UC0M0rxSz3IF0CsSour1iWmw,Friday the 13th (NES) - Angry Video Game Nerd ...,https://i.ytimg.com/vi/1raUvGNbZFg/hqdefault.jpg,Cinemassacre,Angry Video Game Nerd AVGN AVGN 12 AVGN Friday...,PT12M22S,True,...,0,0,1,0,0,0,0,0,0,0
2,2,jFd-6EPfnec,2006-11-18T00:53:40Z,UC4a-Gbdw7vOaccHmFo40b9g,Greatest common factor explained | Factors and...,https://i.ytimg.com/vi/jFd-6EPfnec/hqdefault.jpg,Khan Academy,,PT6M20S,True,...,0,0,1,0,0,0,0,0,0,0
3,3,dj6udHjEbQw,2006-11-21T16:27:16Z,UC310aJFjr6Gn9mGZjMZ2VTQ,The Drink,https://i.ytimg.com/vi/dj6udHjEbQw/hqdefault.jpg,Tripp and Tyler,"drink,dontbethatguy,dont,be,that,guy,funny,hil...",PT1M,True,...,0,0,1,0,0,0,0,0,0,0
4,4,VIPXMXP0clU,2006-12-16T10:50:21Z,UC310aJFjr6Gn9mGZjMZ2VTQ,Practical Gifts (with Jeff Foxworthy),https://i.ytimg.com/vi/VIPXMXP0clU/hqdefault.jpg,Tripp and Tyler,"Jeff,Foxworthy,Dont,Be,That,Guy,dontbethatguy,...",PT1M17S,True,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60570,60570,qFq0v9fBbn4,2022-11-20T02:45:07Z,UCW7HlG1SaKEidJakXeufXUg,How Shen Yun Dancers are perfectly synchronize...,https://i.ytimg.com/vi/qFq0v9fBbn4/hqdefault.jpg,3 Musketeers,"Shen Yun,shenyun,shenyun creation,shen yun 202...",PT5M16S,True,...,0,0,1,0,0,0,0,0,0,0
60571,60571,JMAmGgWXn0Y,2022-11-20T03:00:14Z,UCbKHjNv3DBqANFpp2xMLnxQ,Trying TELOR GULUNG & PENTOL in Surabaya (Indo...,https://i.ytimg.com/vi/JMAmGgWXn0Y/hqdefault.jpg,Wild CARLOS appeared!,"indonesian street food,kuliner surabaya,kuline...",PT59M23S,True,...,0,0,1,0,0,0,0,0,0,0
60572,60572,hU8dIxGCKKg,2022-11-20T04:00:10Z,UC2wZJuuD_WA9ps2FORco1EQ,Ep.11 🇯🇵 볶음밥 9.4kg을 먹는다는 일본 대식가 쌍둥이 유튜버와 야키니쿠 ...,https://i.ytimg.com/vi/hU8dIxGCKKg/hqdefault.jpg,까니짱 [ G-NI : 蟹ちゃん],"ASMR,Korean,food,mukbang,eating show,daily lif...",PT33M44S,True,...,0,0,0,0,0,0,0,0,0,0
60573,60573,GkEhZu-74Ic,2022-11-20T07:20:33Z,UCZrLBbZ0qBhEOou3lbK0msA,🔴 Five Signs A Woman Is Emotionally Damaged (A...,https://i.ytimg.com/vi/GkEhZu-74Ic/hqdefault.jpg,Fredo Hill,"fredo hill,derek rake,shogun method,fractionat...",PT7M2S,True,...,0,0,1,0,0,0,0,0,0,0


# get embeddings for title and cc separately

In [4]:
# for title
# Tokenize every title in a single batch. Each title is padded or truncated to
# exactly 22 token ids and returned as TensorFlow tensors together with the
# attention mask (1 = real token, 0 = padding).
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`
# and produces the same fixed-width output.
tokens_test_title = tokenizer(
    df['title'].tolist(),
    max_length=22,
    padding='max_length',
    truncation=True,
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
)



In [5]:
# Inspect the batch encoding: it holds the 'input_ids' and 'attention_mask' tensors.
tokens_test_title

{'input_ids': <tf.Tensor: shape=(60575, 22), dtype=int32, numpy=
array([[  101,  2033,  2012, ...,     0,     0,     0],
       [  101,  5958,  1996, ...,     0,     0,     0],
       [  101,  4602,  2691, ...,     0,     0,     0],
       ...,
       [  101,  4958,  1012, ..., 30021, 29993,   102],
       [  101,   100,  2274, ...,     0,     0,     0],
       [  101,  5765,  2934, ...,  4682, 23991,   102]])>, 'attention_mask': <tf.Tensor: shape=(60575, 22), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1]])>}

In [None]:
# for cc (10 trunks, i.e. consecutive chunks of up to 512 words each)

In [6]:
def tokenize_longtext(train_text, n_chunks=10, chunk_words=512):
    """Tokenize each video's caption file as a fixed number of word chunks.

    For every row of ``train_text`` the file ``./cc_txt/<videoid>.txt`` is read,
    split on whitespace, and cut into ``n_chunks`` consecutive slices of
    ``chunk_words`` words. Each slice is encoded with the notebook-level
    ``tokenizer`` (padded/truncated to 512 token ids). Slices past the end of
    the text are encoded from an empty string, so every processed row
    contributes exactly one entry to every chunk list.

    Parameters
    ----------
    train_text : pandas.DataFrame
        Must contain a ``'videoid'`` column. Its index is ignored; rows are
        addressed by position.
    n_chunks : int, default 10
        Number of chunks produced per row.
    chunk_words : int, default 512
        Number of whitespace-separated words per chunk.

    Returns
    -------
    trunk : dict
        Keys ``'input_ids_<j>'`` and ``'attention_mask_<j>'`` for each chunk
        ``j``; each value is a list with one tokenizer output per processed row.
    fail : list of int
        Positional indices of rows that could not be read or tokenized. These
        rows have no entry in any list of ``trunk``.
    """
    trunk = {}
    for j in range(n_chunks):
        trunk[f'input_ids_{j}'] = []
        trunk[f'attention_mask_{j}'] = []
    fail = []
    rows = train_text.reset_index(drop=True)
    for i in range(len(rows)):
        videoid = rows['videoid'].iloc[i]
        try:
            with open(f'./cc_txt/{videoid}.txt', encoding="utf8") as f:
                words = f.read().split()
            # Encode all chunks of this row before touching `trunk`, so that a
            # failure part-way through cannot leave the per-chunk lists with
            # different lengths (which would misalign them with the rows).
            encoded = []
            for j in range(n_chunks):
                templine = ' '.join(words[chunk_words * j:chunk_words * (j + 1)])
                # NOTE(review): a 512-word slice can encode to more than 512
                # token ids (special tokens, sub-word splits); truncation then
                # drops the tail of that slice -- confirm this is acceptable.
                encoded.append(tokenizer(templine, max_length=512,
                                         padding='max_length',
                                         truncation=True,
                                         return_tensors='tf',
                                         return_token_type_ids=False,
                                         return_attention_mask=True))
        except Exception as exc:
            # Best-effort: record the row and continue, but do not swallow
            # KeyboardInterrupt/SystemExit as a bare `except:` would.
            print(f'{i}: skipped {videoid} ({exc!r})')
            fail.append(i)
            continue

        for j, q in enumerate(encoded):
            trunk[f'input_ids_{j}'].append(q['input_ids'])
            trunk[f'attention_mask_{j}'].append(q['attention_mask'])
        if i % 100 == 0:
            print(i)

    return trunk, fail


In [7]:
# trunk1 is the (trunk, fail) tuple returned by tokenize_longtext:
# trunk1[0] holds the per-chunk token lists, trunk1[1] the positional indices
# of the rows that could not be processed.
trunk1 = tokenize_longtext(df)

0
100
200
300
400
500
600
700
800
900
1000
1002
1100
1185
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2320
2400
2404
2432
2500
2600
2700
2800
2900
3000
3100
3200
3244
3300
3314
3400
3500
3600
3700
3800
3900
4000
4100
4165
4200
4300
4333
4397
4400
4500
4600
4700
4800
4900
4923
5000
5045
5100
5114
5200
5300
5400
5420
5500
5504
5600
5700
5800
5900
6000
6100
6200
6300
6400
6444
6500
6600
6700
6800
6900
6920
7000
7073
7100
7200
7300
7400
7419
7500
7600
7700
7798
7800
7900
7943
8000
8100
8135
8200
8300
8400
8500
8600
8700
8800
8900
8943
9000
9010
9100
9187
9200
9284
9300
9316
9378
9400
9500
9600
9689
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11807
11900
11915
12000
12100
12200
12300
12365
12400
12500
12585
12600
12700
12800
12893
12900
13000
13004
13023
13100
13200
13300
13400
13500
13600
13700
13718
13800
13900
14000
14100
14200
14300
14400
14423
14500
14600
14700
14800
14900
14936
15000
1

In [8]:
# Remove the rows recorded as failures (trunk1[1] holds positional indices, so
# the index is reset first) and renumber what is left from 0 so positions line
# up with the entries of the token lists in trunk1[0].
newdf = (
    df.reset_index(drop=True)
    .drop(index=trunk1[1])
    .reset_index(drop=True)
)

In [10]:
#get pooled embeddings outside the model
# Accumulate, for each remaining video, the sum of BERT pooler outputs over its
# 10 chunk slots. Chunks past the end of a transcript were tokenized from an
# empty string and still contribute to the sum.
embeddings = np.zeros((len(newdf), 768))
n_videos = len(newdf)
for j in range(10):
    print(j)
    ids_for_chunk = trunk1[0][f'input_ids_{j}']
    masks_for_chunk = trunk1[0][f'attention_mask_{j}']
    for i in range(n_videos):
        outputs = model(ids_for_chunk[i], attention_mask=masks_for_chunk[i])
        embeddings_cc = outputs['pooler_output']
        # Flatten the pooled output to a 1-D vector and add it to row i.
        embeddings[i] += np.array(embeddings_cc).reshape(-1)
        if i % 100 == 0:
            print(i)

0
0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600
9700
9800
9900
10000
10100
10200
10300
10400
10500
10600
10700
10800
10900
11000
11100
11200
11300
11400
11500
11600
11700
11800
11900
12000
12100
12200
12300
12400
12500
12600
12700
12800
12900
13000
13100
13200
13300
13400
13500
13600
13700
13800
13900
14000
14100
14200
14300
14400
14500
14600
14700
14800
14900
15000
15100
15200
15300
15400
15500
15600
15700
15800
15900
16000
16100
16200
16300
16400
16500
16600
16700
16800
16900
17000
17100
17200
17300
17400
17500
17600
17700
17800
17900
18000
18100
18200
18300
18400


In [11]:
# Persist the summed chunk embeddings so the expensive BERT pass need not be repeated.
np.save('all_embeddings.npy', embeddings)

In [13]:
# Divide the summed pooler outputs by the 10 chunk slots to get one mean vector per video.
avgem =embeddings/10

In [17]:
# Sanity check: one 768-dimensional row per video kept in newdf.
avgem.shape

(60256, 768)

In [19]:
# Positional indices of the rows whose caption file could not be read or tokenized.
trunk1[1]

[1002,
 1185,
 2320,
 2404,
 2432,
 3244,
 3314,
 4165,
 4333,
 4397,
 4923,
 5045,
 5114,
 5420,
 5504,
 6444,
 6920,
 7073,
 7419,
 7798,
 7943,
 8135,
 8943,
 9010,
 9187,
 9284,
 9316,
 9378,
 9689,
 11807,
 11915,
 12365,
 12585,
 12893,
 13004,
 13023,
 13718,
 14423,
 14936,
 15182,
 15639,
 15766,
 15797,
 16134,
 16136,
 16171,
 16592,
 16950,
 16951,
 17278,
 17387,
 17530,
 17595,
 17985,
 18374,
 18410,
 18413,
 18428,
 18756,
 18817,
 19790,
 20131,
 20645,
 21318,
 21694,
 21841,
 21938,
 21970,
 22045,
 22067,
 22089,
 22112,
 22500,
 22501,
 22804,
 22848,
 23117,
 23624,
 24476,
 25171,
 25874,
 25898,
 26037,
 26065,
 26143,
 26196,
 26260,
 26540,
 27027,
 27780,
 27802,
 27866,
 28255,
 28272,
 28306,
 28439,
 28472,
 28522,
 28736,
 28809,
 29018,
 29106,
 29150,
 29234,
 29266,
 29535,
 29695,
 29750,
 29772,
 29806,
 29877,
 29998,
 29999,
 30646,
 31033,
 31034,
 31045,
 31288,
 32104,
 32279,
 32333,
 32364,
 32365,
 32445,
 32496,
 32576,
 32668,
 32749,
 3299

In [30]:
# Save the filtered table. `index=False` stops pandas from writing the
# meaningless 0..n-1 index as an extra unnamed column; each save/reload round
# trip otherwise adds another 'Unnamed: 0'-style column like the ones already
# present in the loaded CSV.
# NOTE(review): the execution counts show this cell was run after the
# 'cosine_sim' column was added further down; run strictly top to bottom, the
# saved file will not contain that column -- confirm the intended cell order.
newdf.to_csv('df1203.csv', index=False)

In [27]:
#get cosine similarity for all
# For every remaining video, compare its averaged caption embedding (avgem row)
# with the BERT pooler output of its title and collect the cosine similarity.
from numpy.linalg import norm

cosine_list = []
for i in range(len(newdf)):
    # Encode the title on its own (no padding) and flatten the pooled output.
    title_input = tokenizer(newdf['title'].iloc[i], return_tensors="tf")
    title_output = np.array(model(title_input).pooler_output).reshape(-1)

    # Matching caption embedding, flattened to 1-D as well.
    line_output = np.array(avgem[i]).reshape(-1)

    denominator = norm(line_output) * norm(title_output)
    cosine = np.dot(line_output, title_output) / denominator
    cosine_list.append(cosine)

    # Progress report every 100 videos: position, then the similarity value.
    if i % 100 == 0:
        print(i)
        print(cosine)

0
0.8174771474113535
100
0.9534949631904904
200
0.9118096933781304
300
0.9201808792222863
400
0.9175833441402792
500
0.9568845221490575
600
0.8700324207888681
700
0.8976667305064453
800
0.9298389899862636
900
0.9068281860201153
1000
0.9457610478591039
1100
0.9200540963858687
1200
0.8759019282105158
1300
0.8430740047869228
1400
0.8627307339363726
1500
0.8910641094979247
1600
0.9561578217473946
1700
0.92295589264614
1800
0.8601897305412598
1900
0.8961393803506221
2000
0.862447893965349
2100
0.9162466432216155
2200
0.8807786048641206
2300
0.8529756386764873
2400
0.8807527760485582
2500
0.8621370983881917
2600
0.945338264728599
2700
0.8905910664602555
2800
0.9117909154063323
2900
0.93699253862952
3000
0.89424020788138
3100
0.9418114942564941
3200
0.8781500159805644
3300
0.8727252207252927
3400
0.8617660017111534
3500
0.9016530511860551
3600
0.9262663693291849
3700
0.8674210655005967
3800
0.9262288833308897
3900
0.8995885231598504
4000
0.8305958339762657
4100
0.8757815580603594
4200
0.87389

In [28]:
# Attach the per-video title/caption cosine similarity as a new column
# (cosine_list has one entry per row of newdf, in row order).
newdf['cosine_sim']=cosine_list