# 1. Import Packages and Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import keras as k

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer

import itertools as it
import ast
import pickle
from sklearn.base import clone
import datetime
import os

import multiprocessing as mp
import gc
import requests
%matplotlib inline

# 2. Read in Data -- Raw Text + Embeddings

In [2]:
# Load the raw tweets and normalise the tweet text to lower case.
# Cast to str *before* lower-casing: the .str accessor returns NaN for any
# non-string cell, which would then be stringified as the literal 'nan' and
# lose the original content.
raw_text = pd.read_csv('../GenerativeAI tweets.csv')
raw_text['Text'] = raw_text['Text'].astype(str).str.lower()
display(raw_text)

# Load the precomputed tweet embeddings.
# NOTE(security): pickle.load can execute arbitrary code -- only load files
# that were produced by a trusted step of this project.
# The context manager guarantees the file handle is closed after loading.
with open('genai_tweet_embeddings.pkl', 'rb') as embeddings_file:
    embeddings = pickle.load(embeddings_file)
display(embeddings)

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username
0,0,2023-04-19 21:27:19+00:00,1648800467206672384,from studio gangster to synthetic gangster 🎤.....,resembleai
1,1,2023-04-19 21:27:09+00:00,1648800425540476929,took me some time to find this. i build this #...,devaanparbhoo
2,2,2023-04-19 21:26:57+00:00,1648800376479715328,mind blowing next wave #generativeai platform...,timreha
3,3,2023-04-19 21:26:49+00:00,1648800341193027584,open source generative ai image specialist sta...,VirtReview
4,4,2023-04-19 21:25:00+00:00,1648799883934203905,are you an #hr leader considering which future...,FrozeElle
...,...,...,...,...,...
56216,56216,2022-04-24 16:40:01+00:00,1518268535276904448,"understanding generative ai, its impacts and l...",analyticsinme
56217,56217,2022-04-23 07:23:24+00:00,1517766068592381952,y ya puedes empezar a crear #arte con @thegeni...,iia_es
56218,56218,2022-04-22 08:20:21+00:00,1517418013812830208,"nvidia researchers have developed ganverse3d, ...",VideoGenAI
56219,56219,2022-04-21 13:15:21+00:00,1517129866403008512,tech trend 2022: เทรนด์เทคโนโลยีสำหรับปี 2022 ...,sitthinuntp


array([[-0.08370978, -0.0339897 ,  0.00047592, ..., -0.0230935 ,
         0.14363675, -0.0207369 ],
       [-0.13965364, -0.04326905, -0.01789731, ...,  0.01601694,
         0.02931033,  0.02207875],
       [-0.06304073,  0.0408164 ,  0.01617974, ...,  0.04231761,
        -0.02437175,  0.01193745],
       ...,
       [-0.05494443, -0.07396349, -0.02731166, ..., -0.00126567,
         0.00950922,  0.03049315],
       [-0.10864393,  0.04528776,  0.00032811, ..., -0.05297729,
        -0.01807566, -0.04011416],
       [-0.06801771, -0.03064705,  0.09605097, ...,  0.01945854,
        -0.03104803,  0.01419436]])

# 3. Generate Topic Vectors

In [9]:
# Directory scanned for the saved topic models (trailing slash is required,
# the file name is appended directly to it).
MODEL_DIR = '../subtasks/topic_estimation/HFEmbeddingModelsAbstracts/'

# Every entry of the directory, as a path relative to this notebook.
files = [f'{MODEL_DIR}{x}' for x in os.listdir(MODEL_DIR)]

# Keep only files whose name *ends* with '.h5'. A substring test would also
# pick up unrelated files such as 'model.h5.bak'.
models = [x for x in files if x.endswith('.h5')]

# Filled later with one column of 0/1 predictions per topic model.
topic_results = pd.DataFrame()

In [10]:
def scorer_nn1(ytrue, ypred):
    """Mean per-class recall of thresholded predictions.

    Parameters
    ----------
    ytrue : object exposing ``.numpy()``
        True labels (presumably a TensorFlow tensor -- confirm with caller).
    ypred : object exposing ``.numpy()``
        Predicted scores; values ``>= 0.5`` are counted as class 1,
        everything else as class 0.

    Returns
    -------
    The mean over confusion-matrix rows of ``diagonal / row sum``, or ``0``
    when that mean is null (e.g. a row of the confusion matrix sums to zero).
    """
    labels = ytrue.numpy()
    # Binarise the predicted scores at a fixed 0.5 cut-off.
    hard_preds = (ypred.numpy() >= 0.5).astype(int)

    cm = confusion_matrix(labels, hard_preds)

    # Row i of the confusion matrix holds the samples whose true class is i,
    # so diagonal / row-sum is the recall of each class.
    per_class_recall = cm.diagonal() / cm.sum(axis = 1)
    mean_recall = per_class_recall.mean()

    # A NaN mean is reported as a score of 0.
    return 0 if pd.isnull(mean_recall) else mean_recall

In [14]:
# For every saved topic model: load it, score all tweet embeddings, binarise
# the scores with the model's stored threshold and keep the 0/1 vector as one
# column of topic_results (column name = model file name without extension).
for fp in models:

    # scorer_nn1 is handed over via custom_objects so Keras can resolve the
    # reference to it while loading the compiled model.
    model = k.models.load_model(fp, compile = True, custom_objects = {'scorer_nn1':scorer_nn1})

    # Strip only the trailing extension; str.replace('.h5', ...) would rewrite
    # every occurrence of '.h5' anywhere in the path.
    stem = os.path.splitext(fp)[0]

    # The sibling '<stem>.pkl' file stores the decision threshold under the
    # key 'best_thresh'. NOTE(security): only unpickle trusted files.
    with open(stem + '.pkl', 'rb') as thresh_file:
        thresh = pickle.load(thresh_file)['best_thresh']
    subject = os.path.basename(stem)

    probs = model.predict(embeddings).flatten()
    # Vectorised thresholding: 1 where the score reaches the threshold, else 0.
    preds = (probs >= thresh).astype(int)

    topic_results[subject] = preds

2023-06-14 14:10:41.024057: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.




In [17]:
# Show the column labels of topic_results (one column is added per model
# file processed by the prediction loop).
topic_results.columns

Index(['Distributed, Parallel, and Cluster Computing',
       'Artificial Intelligence', 'Social and Information Networks',
       'Computer Vision and Pattern Recognition', 'Statistics Theory',
       'Optimization and Control', 'Graphics', 'Audio and Speech Processing',
       'Robotics', 'Image and Video Processing', 'Human-Computer Interaction',
       'Quantitative Methods', 'Applications', 'Machine Learning',
       'Cryptography and Security', 'Multiagent Systems',
       'Information Retrieval', 'Computers and Society', 'Signal Processing',
       'Computation and Language', 'Systems and Control', 'Methodology',
       'Multimedia', 'Neural and Evolutionary Computing',
       'Information Theory'],
      dtype='object')

In [53]:
# Spot check: text of the first tweet whose 'Human-Computer Interaction'
# prediction is 0.
raw_text.loc[topic_results['Human-Computer Interaction'] == 0, 'Text'].iloc[0]

'took me some time to find this. i build this #nocode #prototype in dec 2018. it’s a reality today, #botsociety #generativeai #ai #gpt https://t.co/1g2jdb3deg'

In [62]:
# Persist the 0/1 topic matrix. The context manager closes the file, which
# guarantees the pickle is fully flushed to disk before the cell finishes
# (a bare open() leaves that to garbage collection).
with open('initial_topic_vectors.pkl', 'wb') as out_file:
    pickle.dump(topic_results, out_file)