In [1]:
import glob
import os
import ast
import pandas as pd
import google.cloud.aiplatform as aiplatform
%reload_ext autoreload
%autoreload 2
from utils import generate_prompt_categories,obtain_results_gemini_uid_dict,generate_prompt,generate_prompt_batch,run_llm_label_flow,load_in_data,grab_specific_tag_data_breakdown_test_and_train,chunk_dataframe_into_batches,run_llm_label_flow_gemini
from constants import query_tags,sentiment_dict
from sklearn.metrics import classification_report
pd.set_option('max_colwidth', 800)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# To add to utils 
def combine_results_data(all_data,exclude_ids,results_df):
    """Copy LLM labels onto held-out rows whose text matches a labelled row.

    Args:
        all_data: DataFrame with at least ``uid`` and ``text`` columns.
        exclude_ids: Collection of ``uid`` values selecting the held-out rows
            of ``all_data`` that should receive a label by text lookup.
        results_df: Labelled DataFrame with at least ``text`` and
            ``llm_label`` columns.

    Returns:
        ``results_df`` followed by every held-out row whose text has a
        non-null label in ``results_df``, with a fresh 0..n-1 index. The
        ``llm_label`` column of the appended rows keeps the dtype it has in
        ``results_df``.
    """
    held_out = all_data.loc[all_data['uid'].isin(exclude_ids)].reset_index(drop=True)

    # text -> label lookup; if a text appears more than once, the last label wins.
    text_to_label = dict(zip(results_df['text'], results_df['llm_label']))

    # Texts without a label map to NaN and are dropped.
    held_out['llm_label'] = held_out['text'].map(text_to_label)
    held_out = held_out.dropna(subset=['llm_label']).reset_index(drop=True)

    # The temporary NaNs upcast integer labels to float; cast back so the
    # combined column does not flip between 1 and 1.0 depending on the data.
    held_out['llm_label'] = held_out['llm_label'].astype(results_df['llm_label'].dtype)

    return pd.concat([results_df, held_out]).reset_index(drop=True)


def _label_tag_and_save(query, tag, prompt, max_samples=5000, random_state=42):
    """Label one query/tag slice with Gemini, back-fill held-out rows, and save.

    Shared implementation behind ``make_preds`` and ``make_preds_categories``;
    the two differ only in the prompt they pass in.

    Args:
        query: Query name; passed to ``load_in_data`` and used as the output
            sub-directory.
        tag: Tag name; selects the slice and names the output file.
        prompt: Prompt handed to ``run_llm_label_flow_gemini``.
        max_samples: Largest number of training rows sent to the LLM; bigger
            sets are randomly down-sampled to this size.
        random_state: Seed used for that down-sampling.

    Side effects:
        Prints the label distribution before and after back-filling and writes
        ``../data/predictions/{query}/{tag}.json`` as JSON lines.
    """
    main_query_data = load_in_data(query)

    # Only three of the five returned values are used here.
    all_data, _unique_text_data, excluded_uid_data, train_data, _test_data = (
        grab_specific_tag_data_breakdown_test_and_train(main_query_data, tag)
    )

    # Cap the number of rows sent to the LLM.
    if len(train_data) > max_samples:
        train_data = train_data.sample(n=max_samples, random_state=random_state)

    results = run_llm_label_flow_gemini(
        sentiment_dict,
        prompt,
        list(sentiment_dict.keys()),
        train_data,
        label_name='llm_label',
    )

    print(f'The label distribution for {query} {tag} is: \n with singular results')
    print(results['llm_label'].value_counts()/len(results))

    # Re-attach the held-out uids by matching their text to a labelled row.
    all_results = combine_results_data(all_data, excluded_uid_data, results)

    print(f'The label distribution for {query} {tag} is: \n with all results')
    print(all_results['llm_label'].value_counts()/len(all_results))

    # to_json does not create missing directories, so make sure the target
    # exists instead of failing after the LLM run has already completed.
    output_path = f'../data/predictions/{query}/{tag}.json'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    all_results.to_json(output_path, orient='records', lines=True)
    print('Complete')


def make_preds(query, tag, max_samples=5000, random_state=42):
    """Run the labelling pipeline with the prompt built from both query and tag.

    Args:
        query: Query name (e.g. ``'glasses'``).
        tag: Tag name within that query.
        max_samples: Cap on rows sent to the LLM (default 5000).
        random_state: Seed for down-sampling when the cap applies (default 42).
    """
    prompt = generate_prompt(query.title(), tag.title())
    _label_tag_and_save(query, tag, prompt, max_samples=max_samples, random_state=random_state)


def make_preds_categories(query, tag, max_samples=5000, random_state=42):
    """Run the labelling pipeline with the categories prompt, built from the tag only.

    Args:
        query: Query name; used for loading data and for the output path.
        tag: Tag name within that query.
        max_samples: Cap on rows sent to the LLM (default 5000).
        random_state: Seed for down-sampling when the cap applies (default 42).
    """
    prompt = generate_prompt_categories(tag.title())
    _label_tag_and_save(query, tag, prompt, max_samples=max_samples, random_state=random_state)
    

# Pipeline workflow for fast processing


Args: `query`, `tag`

In [3]:
def make_preds(query, tag, max_samples=5000, random_state=42):
    """Label one query/tag slice with Gemini, back-fill held-out rows, and save.

    NOTE: this cell re-defines ``make_preds``, which is also defined in the
    previous code cell; once both cells have run, this definition is the one
    in effect.

    Args:
        query: Query name; passed to ``load_in_data``, used in the prompt and
            as the output sub-directory.
        tag: Tag name; selects the slice, is used in the prompt and names the
            output file.
        max_samples: Largest number of training rows sent to the LLM; bigger
            sets are randomly down-sampled to this size (default 5000).
        random_state: Seed used for that down-sampling (default 42).

    Side effects:
        Prints the label distribution before and after back-filling and writes
        ``../data/predictions/{query}/{tag}.json`` as JSON lines.
    """
    main_query_data = load_in_data(query)

    # Only three of the five returned values are used here.
    all_data, _unique_text_data, excluded_uid_data, train_data, _test_data = (
        grab_specific_tag_data_breakdown_test_and_train(main_query_data, tag)
    )

    # Cap the number of rows sent to the LLM.
    if len(train_data) > max_samples:
        train_data = train_data.sample(n=max_samples, random_state=random_state)

    prompt = generate_prompt(query.title(), tag.title())
    results = run_llm_label_flow_gemini(
        sentiment_dict,
        prompt,
        list(sentiment_dict.keys()),
        train_data,
        label_name='llm_label',
    )

    print(f'The label distribution for {query} {tag} is: \n with singular results')
    print(results['llm_label'].value_counts()/len(results))

    # Re-attach the held-out uids by matching their text to a labelled row.
    all_results = combine_results_data(all_data, excluded_uid_data, results)

    print(f'The label distribution for {query} {tag} is: \n with all results')
    print(all_results['llm_label'].value_counts()/len(all_results))

    # to_json does not create missing directories, so make sure the target
    # exists instead of failing after the LLM run has already completed.
    output_path = f'../data/predictions/{query}/{tag}.json'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    all_results.to_json(output_path, orient='records', lines=True)
    print('Complete')
    

In [17]:
# Show the first matching glasses CSV path with "Glasses_" removed.
glasses_csv_paths = glob.glob("../data/glasses/*.csv")
glasses_csv_paths[0].replace("Glasses_", '')

'../data/glasses/Snap.csv'

In [22]:
# Rename all files 
# Rename files in directory
import logging
import glob
def rename_files(tag):
    """Strip the ``<Tag>_`` text from, and lower-case, each CSV name in ``../data/<tag>/``.

    Only the file name is rewritten; the directory part of the path is left
    as it is. Files are moved with ``os.replace``, so names containing spaces
    or shell metacharacters are handled and a failed rename raises ``OSError``
    instead of being ignored. As with ``mv``, an existing file at the target
    name is overwritten.

    Args:
        tag: Sub-directory of ``../data`` to process. ``tag.title() + '_'`` is
            the text removed from every file name.
    """
    prefix = tag.title() + '_'
    for file in glob.glob(f"../data/{tag}/*.csv"):
        directory, name = os.path.split(file)
        target = os.path.join(directory, name.replace(prefix, '').lower())
        # Nothing to do when the name is already in its final form.
        if target != file:
            os.replace(file, target)
    # Logged once per call, after every file has been handled.
    logging.info(f"Finished processing for {tag}")

    


        

In [25]:
# Rename every CSV in ../data/categories whose name contains "category" so
# that it reads "categories" instead. os.replace is used rather than a shell
# `mv`, so paths with spaces work and a failed rename raises instead of being
# silently ignored.
for file in glob.glob("../data/categories/*.csv"):
    directory, name = os.path.split(file)
    renamed = os.path.join(directory, name.replace('category', 'categories'))
    # Skip files that are already named correctly.
    if renamed != file:
        os.replace(file, renamed)

In [40]:
# header=10: line index 10 (0-based) holds the column names, so the lines
# above it are not read as data.
data = pd.read_csv("../data/glasses/google.csv", header=10)

# Glasses 

In [49]:
# Glasses: run make_preds for every tag under the "glasses" query.
tags = [
    'meta',
    'rokid',
    'snap',
    'vuzix',
    'xreal',
    'google',
    'apple',
    'lenovo',
]
querys = ['glasses'] * len(tags)

for query, tag in zip(querys, tags):
    print(query, tag)
    make_preds(query, tag)

glasses meta


100%|███████████████████████████████████████| 5000/5000 [08:41<00:00,  9.59it/s]


The label distribution for glasses meta is: 
 with singular results
llm_label
1    0.689121
2    0.163598
0    0.147280
Name: count, dtype: float64
The label distribution for glasses meta is: 
 with all results
llm_label
1.0    0.691185
2.0    0.161905
0.0    0.146910
Name: count, dtype: float64
Complete
glasses rokid


100%|███████████████████████████████████████| 3252/3252 [05:45<00:00,  9.40it/s]


The label distribution for glasses rokid is: 
 with singular results
llm_label
1    0.637820
2    0.186018
0    0.176163
Name: count, dtype: float64
The label distribution for glasses rokid is: 
 with all results
llm_label
1    0.637852
2    0.197875
0    0.164273
Name: count, dtype: float64
Complete
glasses snap


100%|███████████████████████████████████████| 2966/2966 [05:09<00:00,  9.58it/s]


The label distribution for glasses snap is: 
 with singular results
llm_label
1    0.598235
2    0.201561
0    0.200204
Name: count, dtype: float64
The label distribution for glasses snap is: 
 with all results
llm_label
1    0.605329
0    0.207210
2    0.187461
Name: count, dtype: float64
Complete
glasses vuzix


100%|███████████████████████████████████████| 1046/1046 [01:49<00:00,  9.58it/s]


The label distribution for glasses vuzix is: 
 with singular results
llm_label
1    0.594646
2    0.381453
0    0.023901
Name: count, dtype: float64
The label distribution for glasses vuzix is: 
 with all results
llm_label
1.0    0.586081
2.0    0.391026
0.0    0.022894
Name: count, dtype: float64
Complete
glasses xreal


100%|███████████████████████████████████████| 5000/5000 [08:41<00:00,  9.58it/s]


The label distribution for glasses xreal is: 
 with singular results
llm_label
1    0.637948
2    0.218193
0    0.143859
Name: count, dtype: float64
The label distribution for glasses xreal is: 
 with all results
llm_label
1.0    0.662888
2.0    0.205850
0.0    0.131261
Name: count, dtype: float64
Complete
glasses google


100%|███████████████████████████████████████| 5000/5000 [08:42<00:00,  9.58it/s]


The label distribution for glasses google is: 
 with singular results
llm_label
0    0.440927
1    0.437097
2    0.121976
Name: count, dtype: float64
The label distribution for glasses google is: 
 with all results
llm_label
1.0    0.461773
0.0    0.421073
2.0    0.117155
Name: count, dtype: float64
Complete
glasses apple


100%|███████████████████████████████████████| 5000/5000 [08:42<00:00,  9.57it/s]


The label distribution for glasses apple is: 
 with singular results
llm_label
1    0.552152
0    0.249049
2    0.198799
Name: count, dtype: float64
The label distribution for glasses apple is: 
 with all results
llm_label
1.0    0.559130
0.0    0.246358
2.0    0.194513
Name: count, dtype: float64
Complete
glasses lenovo


100%|███████████████████████████████████████| 1211/1211 [02:06<00:00,  9.57it/s]


The label distribution for glasses lenovo is: 
 with singular results
llm_label
1    0.878613
2    0.089182
0    0.032205
Name: count, dtype: float64
The label distribution for glasses lenovo is: 
 with all results
llm_label
1    0.883065
2    0.090726
0    0.026210
Name: count, dtype: float64
Complete


# Load in all glasses data 

In [52]:
# Read every per-tag glasses prediction file, stack them, stamp the rows with
# their query, and save the merged set.
glasses_predictions = glob.glob('../data/predictions/glasses/*')
glasses_frames = [
    pd.read_json(path, orient='records', lines=True)
    for path in glasses_predictions
]
all_glasses_preds = (
    pd.concat(glasses_frames)
    .reset_index(drop=True)
    .assign(query='glasses')
)
all_glasses_preds.to_json(
    '../data/predictions/final_preds/glasses.json',
    orient='records',
    lines=True,
)

# Data Quality Check 

In [149]:
# Spot-check: texts under the 'google' tag whose llm_label equals 1.
is_google_tag = all_glasses_preds['tag'] == 'google'
is_label_one = all_glasses_preds['llm_label'] == 1
all_glasses_preds.loc[is_google_tag & is_label_one, ['text']]

Unnamed: 0,text
22669,RT @steepler @bexhillmuseum @WollastonMuseum I did a Google glass search on it and that’s the most likely thing.
22673,"@GoogleARVR @peregrau1969 @unity Google augmented reality glasses project: These are four links about mixed reality technologies. Therefore, we ask engineers and workers to conduct research in order to take advantage of all technologies and innovations and create hologram projects &"
22674,RT @verge Google Glass Enterprise Edition is no more https://t.co/EiKr87RM72 https://t.co/zO3dkonuT9
22677,"@MKBHD The same basic idea of google glass, but covering the whole face..."
22679,"@tnatw Use of chat with Google glasses, for jury selection to read micro expressions. Then constantly redirecting chat throughout the trial, to cater questioning and objections to the flow of information. Smarter lawyers will be a must, parameters will change for every trial."
...,...
29915,Google Glass
29916,Google Glass.
29917,Google Glass.
29918,Google Glass


# Headsets 

In [4]:
# Headsets: run make_preds for every tag under the "headsets" query.
tags = [
    'apple', 'dpvr', 'google', 'hp', 'htc', 'magic_leap', 'meta',
    'microsoft', 'oppo', 'pico', 'samsung', 'sony', 'valve',
]
querys = ['headsets'] * len(tags)

for query, tag in zip(querys, tags):
    make_preds(query, tag)


100%|███████████████████████████████████████| 5000/5000 [08:40<00:00,  9.61it/s]


The label distribution for headsets apple is: 
 with singular results
llm_label
1    0.745528
0    0.128643
2    0.125829
Name: count, dtype: float64
The label distribution for headsets apple is: 
 with all results
llm_label
1.0    0.743890
0.0    0.128949
2.0    0.127161
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 2190/2190 [03:48<00:00,  9.60it/s]


The label distribution for headsets dpvr is: 
 with singular results
llm_label
1    0.616263
2    0.308817
0    0.074920
Name: count, dtype: float64
The label distribution for headsets dpvr is: 
 with all results
llm_label
1.0    0.649784
2.0    0.285826
0.0    0.064389
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:46<00:00,  9.50it/s]


The label distribution for headsets google is: 
 with singular results
llm_label
1    0.735681
0    0.215139
2    0.049180
Name: count, dtype: float64
The label distribution for headsets google is: 
 with all results
llm_label
1.0    0.740815
0.0    0.197526
2.0    0.061659
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 3882/3882 [06:44<00:00,  9.60it/s]


The label distribution for headsets hp is: 
 with singular results
llm_label
1    0.691772
0    0.191127
2    0.117101
Name: count, dtype: float64
The label distribution for headsets hp is: 
 with all results
llm_label
1    0.693060
0    0.192082
2    0.114858
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:40<00:00,  9.61it/s]


The label distribution for headsets htc is: 
 with singular results
llm_label
1    0.606856
2    0.232558
0    0.160585
Name: count, dtype: float64
The label distribution for headsets htc is: 
 with all results
llm_label
1.0    0.612249
2.0    0.230739
0.0    0.157012
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 3694/3694 [06:24<00:00,  9.60it/s]


The label distribution for headsets magic_leap is: 
 with singular results
llm_label
1    0.610344
2    0.272136
0    0.117520
Name: count, dtype: float64
The label distribution for headsets magic_leap is: 
 with all results
llm_label
1.0    0.610971
2.0    0.278311
0.0    0.110718
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:40<00:00,  9.60it/s]


The label distribution for headsets meta is: 
 with singular results
llm_label
1    0.595740
2    0.223629
0    0.180631
Name: count, dtype: float64
The label distribution for headsets meta is: 
 with all results
llm_label
1.0    0.600000
2.0    0.222113
0.0    0.177887
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:40<00:00,  9.60it/s]


The label distribution for headsets microsoft is: 
 with singular results
llm_label
1    0.843531
0    0.137212
2    0.019258
Name: count, dtype: float64
The label distribution for headsets microsoft is: 
 with all results
llm_label
1.0    0.849783
0.0    0.129647
2.0    0.020570
Name: count, dtype: float64
Complete


100%|█████████████████████████████████████████| 248/248 [00:25<00:00,  9.60it/s]


The label distribution for headsets oppo is: 
 with singular results
llm_label
1    0.927419
2    0.064516
0    0.008065
Name: count, dtype: float64
The label distribution for headsets oppo is: 
 with all results
llm_label
1.0    0.932331
2.0    0.060150
0.0    0.007519
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:55<00:00,  9.34it/s]


The label distribution for headsets pico is: 
 with singular results
llm_label
1    0.566647
2    0.251153
0    0.182201
Name: count, dtype: float64
The label distribution for headsets pico is: 
 with all results
llm_label
1.0    0.567860
2.0    0.253344
0.0    0.178796
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:45<00:00,  9.52it/s]


The label distribution for headsets samsung is: 
 with singular results
llm_label
1    0.836003
0    0.128955
2    0.035042
Name: count, dtype: float64
The label distribution for headsets samsung is: 
 with all results
llm_label
1.0    0.844440
0.0    0.122333
2.0    0.033226
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:42<00:00,  9.56it/s]


The label distribution for headsets sony is: 
 with singular results
llm_label
1    0.825869
0    0.118899
2    0.055232
Name: count, dtype: float64
The label distribution for headsets sony is: 
 with all results
llm_label
1.0    0.826512
0.0    0.118187
2.0    0.055300
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:39<00:00,  9.62it/s]


The label distribution for headsets valve is: 
 with singular results
llm_label
1    0.579739
0    0.210632
2    0.209629
Name: count, dtype: float64
The label distribution for headsets valve is: 
 with all results
llm_label
1.0    0.582754
0.0    0.209020
2.0    0.208226
Name: count, dtype: float64
Complete


# Categories 

### Unlike the other queries, the categories prompt needs adjusting: using Gemini, we'll tune the prompt to get the best results against the true labels


In [None]:
# Fetch the data for the 'categories' query through the project helper
# (what it reads is defined in utils, not shown here).
cat_data = load_in_data('categories')

In [None]:
# Unpack the five values the helper returns for the 'apple' tag of the
# categories data.
(
    df,
    df_unique_snippets,
    excluded_uid_from_unique,
    df_train,
    df_test,
) = grab_specific_tag_data_breakdown_test_and_train(cat_data, 'apple')

In [187]:
# List the distinct values in the 'source' column.
df['source'].unique()

array(['manual_export', 'reddit'], dtype=object)

In [172]:
# Persist the Apple test split; with to_csv defaults the index is written as
# an extra leading column.
df_test.to_csv('../data/categories_test_data/apple_test_data.csv')

In [193]:
# Step 1: run the Apple categories prompt over the test split with Gemini.
prompt = generate_prompt_categories('Apple')
results = run_llm_label_flow_gemini(
    sentiment_dict,
    prompt,
    list(sentiment_dict.keys()),
    df_test,
    label_name='llm_label',
)

100%|███████████████████████████████████████████| 50/50 [00:05<00:00,  9.58it/s]


In [194]:
# NOTE(review): this writes to the same path the df_test export uses, so the
# earlier file is overwritten with the labelled results — confirm intended.
results.to_csv('../data/categories_test_data/apple_test_data.csv')

In [190]:
# List the distinct values in the 'pageType' column (NaN included).
df['pageType'].unique()

array([nan, 'reddit'], dtype=object)

In [5]:
# Categories: run make_preds_categories for every tag under the "categories" query.
tags = [
    'amazon',
    'apple',
    'google',
    'meta',
    'microsoft',
]
querys = ['categories'] * len(tags)

for query, tag in zip(querys, tags):
    make_preds_categories(query, tag)

100%|███████████████████████████████████████| 5000/5000 [08:44<00:00,  9.54it/s]


The label distribution for categories amazon is: 
 with singular results
llm_label
1    0.753007
2    0.150361
0    0.096632
Name: count, dtype: float64
The label distribution for categories amazon is: 
 with all results
llm_label
1.0    0.762230
2.0    0.149395
0.0    0.088375
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:55<00:00,  9.34it/s]


The label distribution for categories apple is: 
 with singular results
llm_label
1    0.451678
2    0.340768
0    0.207555
Name: count, dtype: float64
The label distribution for categories apple is: 
 with all results
llm_label
1.0    0.453103
2.0    0.340869
0.0    0.206028
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:49<00:00,  9.44it/s]


The label distribution for categories google is: 
 with singular results
llm_label
1    0.657487
2    0.200804
0    0.141709
Name: count, dtype: float64
The label distribution for categories google is: 
 with all results
llm_label
1.0    0.66419
2.0    0.19981
0.0    0.13600
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:58<00:00,  9.28it/s]


The label distribution for categories meta is: 
 with singular results
llm_label
1    0.536008
2    0.294885
0    0.169107
Name: count, dtype: float64
The label distribution for categories meta is: 
 with all results
llm_label
1.0    0.537381
2.0    0.294699
0.0    0.167919
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:57<00:00,  9.31it/s]


The label distribution for categories microsoft is: 
 with singular results
llm_label
1    0.633628
0    0.194623
2    0.171750
Name: count, dtype: float64
The label distribution for categories microsoft is: 
 with all results
llm_label
1.0    0.646162
0.0    0.181751
2.0    0.172087
Name: count, dtype: float64
Complete


# Load in final preds data 

In [6]:
# Read every per-tag headsets prediction file, stack them, stamp the rows with
# their query, and save the merged set.
headsets_predictions = glob.glob('../data/predictions/headsets/*')
headsets_frames = [
    pd.read_json(path, orient='records', lines=True)
    for path in headsets_predictions
]
all_headsets_preds = (
    pd.concat(headsets_frames)
    .reset_index(drop=True)
    .assign(query='headsets')
)
all_headsets_preds.to_json(
    '../data/predictions/final_preds/headsets.json',
    orient='records',
    lines=True,
)

In [7]:
# Read every per-tag categories prediction file, stack them, stamp the rows
# with their query, and save the merged set.
categories_predictions = glob.glob('../data/predictions/categories/*')
categories_frames = [
    pd.read_json(path, orient='records', lines=True)
    for path in categories_predictions
]
all_cat_preds = (
    pd.concat(categories_frames)
    .reset_index(drop=True)
    .assign(query='categories')
)
all_cat_preds.to_json(
    '../data/predictions/final_preds/categories.json',
    orient='records',
    lines=True,
)

# Combine ALL predictions into one single dataframe 

In [8]:
# Merge the per-query files in final_preds into one combined file.
# The combined file is written into the same directory it is built from, so it
# is filtered out of the inputs; otherwise each re-run would read the previous
# all_preds.json back in and duplicate every row.
combined_path = '../data/predictions/final_preds/all_preds.json'
all_predictions = sorted(  # sorted so the row order is reproducible
    path
    for path in glob.glob('../data/predictions/final_preds/*')
    if os.path.basename(path) != os.path.basename(combined_path)
)
data = [pd.read_json(path, orient='records', lines=True) for path in all_predictions]
all_preds = pd.concat(data).reset_index(drop=True)
all_preds.to_json(combined_path, orient='records', lines=True)