# 1. Import Packages and Libraries

In [1]:
import pandas as pd
import numpy as np
import pickle
import requests

# 2. Read in Data

In [2]:
data = pd.read_csv('../GenerativeAI tweets.csv')
data['Text'] = data['Text'].str.lower().astype(str)
data

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username
0,0,2023-04-19 21:27:19+00:00,1648800467206672384,from studio gangster to synthetic gangster 🎤.....,resembleai
1,1,2023-04-19 21:27:09+00:00,1648800425540476929,took me some time to find this. i build this #...,devaanparbhoo
2,2,2023-04-19 21:26:57+00:00,1648800376479715328,mind blowing next wave #generativeai platform...,timreha
3,3,2023-04-19 21:26:49+00:00,1648800341193027584,open source generative ai image specialist sta...,VirtReview
4,4,2023-04-19 21:25:00+00:00,1648799883934203905,are you an #hr leader considering which future...,FrozeElle
...,...,...,...,...,...
56216,56216,2022-04-24 16:40:01+00:00,1518268535276904448,"understanding generative ai, its impacts and l...",analyticsinme
56217,56217,2022-04-23 07:23:24+00:00,1517766068592381952,y ya puedes empezar a crear #arte con @thegeni...,iia_es
56218,56218,2022-04-22 08:20:21+00:00,1517418013812830208,"nvidia researchers have developed ganverse3d, ...",VideoGenAI
56219,56219,2022-04-21 13:15:21+00:00,1517129866403008512,tech trend 2022: เทรนด์เทคโนโลยีสำหรับปี 2022 ...,sitthinuntp


# 3. Generate Embeddings from Pretrained Model

In [4]:
model_id = "sentence-transformers/all-MiniLM-L6-v2"
hf_token = "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"
api_url = f"https://api-inference.huggingface.co/pipeline/feature-extraction/{model_id}"
headers = {"Authorization": f"Bearer {hf_token}"}

In [7]:
def getEmbeddings(spec):
    
    import requests
    import pandas as pd
    import numpy as np
    
    model_id = spec['model_id']
    hf_token = spec['hf_token']
    api_url = spec['api_url']
    headers = spec['headers']
    
    def query(texts):
        response = requests.post(api_url, headers=headers, json={"inputs": texts, "options":{"wait_for_model":True}})
        return response.json()
    
    return np.array(query(list(spec['data'].iloc[spec['start']:spec['end']]['Text'])))

#### Test

In [8]:
getEmbeddings({'data':data,'start':1000,'end':2000,'model_id':model_id,'hf_token':hf_token,
              'api_url':api_url,'headers':headers}).shape

(1000, 384)

#### Embed Full Dataset

In [9]:
start = 0
chunk_size = 1000
end = len(data)

embeddings = []

while start < end:
    spec = {'data':data,'start':start,'end':min(end,start+chunk_size),
                          'model_id':model_id,'hf_token':hf_token,
                           'api_url':api_url,'headers':headers}
    
    embeddings.append(getEmbeddings(spec))
    
    start = start + chunk_size
    print(start)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000


In [10]:
failures = [num for num in range(len(embeddings)) if embeddings[num].shape == ()]

while len(failures) != 0:
    
    print(failures)
    
    for num in failures:
        spec = {'data':data,'start':num*1000,'end':min(end,(num+1)*1000),
                              'model_id':model_id,'hf_token':hf_token,
                               'api_url':api_url,'headers':headers}

        embeddings[num] = getEmbeddings(spec)
    
        print(num)
    
    failures = [num for num in range(len(embeddings)) if embeddings[num].shape == ()]

In [12]:
embeddings = np.vstack(embeddings)

In [14]:
pickle.dump(embeddings,open('genai_tweet_embeddings.pkl','wb'))