In [175]:
import glob
import os
import ast
import pandas as pd
import google.cloud.aiplatform as aiplatform
%reload_ext autoreload
%autoreload 2
from utils import generate_prompt_categories,obtain_results_gemini_uid_dict,generate_prompt,generate_prompt_batch,run_llm_label_flow,load_in_data,grab_specific_tag_data_breakdown_test_and_train,chunk_dataframe_into_batches,run_llm_label_flow_gemini
from constants import query_tags,sentiment_dict
from sklearn.metrics import classification_report
pd.set_option('max_colwidth', 800)

In [211]:
test = [['apples','oranges','sugar_cookies'],['sugar_cookies','apples']]

In [205]:
test_df = pd.DataFrame()
test_df['values'] = test

In [212]:
list(test_df['values'].explode())

['apples', 'oranges', 'sugar_cookies']

In [2]:
output_format = "{'link':'sentiment'}"

In [195]:
# To add to utils 
def combine_results_data(all_data,exclude_ids,results_df):
    data_excluded = all_data.loc[all_data['uid'].isin(exclude_ids)].reset_index(drop=True)
    text_map = {text:label for text,label in zip(results_df['text'],results_df['llm_label'])}
    data_excluded['llm_label'] = data_excluded['text'].apply(lambda x: text_map[x] if x in text_map.keys() else None).reset_index(drop=True)
    data_excluded = data_excluded.loc[~data_excluded['llm_label'].isna()].reset_index(drop=True)
    all_results = pd.concat([results_df,data_excluded]).reset_index(drop=True)
    return all_results


def make_preds(query,tag):
    # Load in main data
    main_query_data = load_in_data(query)
    
    # Load in sampled data
    all_data, unique_text_data, excluded_uid_data, train_data, test_data = grab_specific_tag_data_breakdown_test_and_train(main_query_data,tag)
    
    # Condition for sample 
    if len(train_data)>5000:
        train_data = train_data.sample(n=5000,random_state=42)
        
    # Create prompt and make preds
    prompt = generate_prompt(query.title(), tag.title())
    results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), train_data, label_name='llm_label')
    
    print(f'The label distribution for {query} {tag} is: \n with singular results')
    print(results['llm_label'].value_counts()/len(results))
    
    # Combine text to label 
    all_results = combine_results_data(all_data,excluded_uid_data,results)
    
    print(f'The label distribution for {query} {tag} is: \n with all results')
    print(all_results['llm_label'].value_counts()/len(all_results))
    
    # Save data
    all_results.to_json(f'../data/predictions/{query}/{tag}.json',orient='records',lines=True)
    print('Complete')

def make_preds_categories(query,tag):
    # Load in main data
    main_query_data = load_in_data(query)
    
    # Load in sampled data
    all_data, unique_text_data, excluded_uid_data, train_data, test_data = grab_specific_tag_data_breakdown_test_and_train(main_query_data,tag)
    
    # Condition for sample 
    if len(train_data)>5000:
        train_data = train_data.sample(n=5000,random_state=42)
        
    # Create prompt and make preds
    prompt = generate_prompt_categories(tag.title())
    results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), train_data, label_name='llm_label')
    
    print(f'The label distribution for {query} {tag} is: \n with singular results')
    print(results['llm_label'].value_counts()/len(results))
    
    # Combine text to label 
    all_results = combine_results_data(all_data,excluded_uid_data,results)
    
    print(f'The label distribution for {query} {tag} is: \n with all results')
    print(all_results['llm_label'].value_counts()/len(all_results))
    
    # Save data
    all_results.to_json(f'../data/predictions/{query}/{tag}.json',orient='records',lines=True)
    print('Complete')
    

# Glasses 

In [74]:
glasses_data = load_in_data('glasses')

# Google Testing 

In [77]:
# 1. Testing reliability on test set 
google_glasses_test = pd.read_csv('../data/glasses_test_data/google_glasses_test.csv')
google_glasses_test = google_glasses_test.rename(columns={'Snippet':'text','Url':'uid'})

In [78]:
test_results = run_llm_label_flow_gemini(sentiment_dict, gg_prompt, list(sentiment_dict.keys()), google_glasses_test, label_name='llm_label')

100%|███████████████████████████████████████████| 50/50 [00:05<00:00,  9.57it/s]


## Google 

### Grab data

In [82]:
google_glasses_data, unique_google_glasses_data, excluded_uid_google_glasses, google_glasses_train, google_glasses_test = grab_specific_tag_data_breakdown_test_and_train(glasses_data,'google')

In [85]:
# For Batch processing if necassery
# google_glasses_batched = chunk_dataframe_into_batches(google_glasses_train,20)
google_glasses_train_sample = google_glasses_train.sample(n=5000,random_state=42).reset_index(drop=True)

### Make Predictions 

In [86]:
gg_prompt = generate_prompt('Glasses', 'google')
results = run_llm_label_flow_gemini(sentiment_dict, gg_prompt, list(sentiment_dict.keys()), google_glasses_train, label_name='llm_label')

100%|███████████████████████████████████████| 5334/5334 [09:18<00:00,  9.56it/s]


In [91]:
all_google_glass_predictions = combine_results_data(google_glasses_data,excluded_uid_google_glasses,results)

#### Compare sentiment breakdown 

In [94]:
# OG Train sample results
results['llm_label'].value_counts()/len(results)

llm_label
1    0.473147
0    0.393722
2    0.133132
Name: count, dtype: float64

In [95]:
# Combined duplicate text results
all_google_glass_predictions['llm_label'].value_counts()/len(all_google_glass_predictions)

llm_label
0.0    0.452991
1.0    0.411084
2.0    0.135925
Name: count, dtype: float64

In [100]:
all_google_glass_predictions.to_json('../data/predictions/glasses/google.json',orient='records',lines=True)

### Apple 

In [101]:
apple_glasses_data, unique_apple_glasses_data, excluded_uid_apple_glasses, apple_glasses_train, apple_glasses_test = grab_specific_tag_data_breakdown_test_and_train(glasses_data,'apple')

In [108]:
# Note not sampling down as it is to low 

In [109]:
prompt = generate_prompt('Glasses', 'Apple')
results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), apple_glasses_train, label_name='llm_label')

100%|███████████████████████████████████████| 4218/4218 [07:20<00:00,  9.58it/s]


In [110]:
all_apple_glass_predictions = combine_results_data(apple_glasses_data,excluded_uid_apple_glasses,results)

In [111]:
results['llm_label'].value_counts()/len(results)

llm_label
1    0.537367
2    0.241756
0    0.220878
Name: count, dtype: float64

In [112]:
all_apple_glass_predictions['llm_label'].value_counts()/len(all_apple_glass_predictions)

llm_label
1.0    0.493691
2.0    0.309046
0.0    0.197263
Name: count, dtype: float64

In [115]:
len(all_apple_glass_predictions)

5627

In [117]:
all_apple_glass_predictions.to_json('../data/predictions/glasses/apple.json',orient='records',lines=True)

# Creating pipeline workflow for fast processing 


args: query,tag

In [120]:
def make_preds(query,tag):
    # Load in main data
    main_query_data = load_in_data(query)
    
    # Load in sampled data
    all_data, unique_text_data, excluded_uid_data, train_data, test_data = grab_specific_tag_data_breakdown_test_and_train(main_query_data,tag)
    
    # Condition for sample 
    if len(train_data)>5000:
        train_data = train_data.sample(n=5000,random_state=42)
        
    # Create prompt and make preds
    prompt = generate_prompt(query.title(), tag.title())
    results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), train_data, label_name='llm_label')
    
    print(f'The label distribution for {query} {tag} is: \n with singular results')
    print(results['llm_label'].value_counts()/len(results))
    
    # Combine text to label 
    all_results = combine_results_data(all_data,excluded_uid_data,results)
    
    print(f'The label distribution for {query} {tag} is: \n with all results')
    print(all_results['llm_label'].value_counts()/len(all_results))
    
    # Save data
    all_results.to_json(f'../data/predictions/{query}/{tag}.json',orient='records',lines=True)
    print('Complete')
    

# Lenovo


In [121]:
make_preds('glasses','lenovo')

100%|███████████████████████████████████████| 1315/1315 [02:17<00:00,  9.55it/s]


The label distribution for glasses lenovo is: 
 with singular results
llm_label
1    0.860076
2    0.114068
0    0.025856
Name: count, dtype: float64
The label distribution for glasses lenovo is: 
 with all results
llm_label
1.0    0.858323
2.0    0.128296
0.0    0.013381
Name: count, dtype: float64
Complete


### Completing rest for glasses 

In [123]:
# Go do and make preds for all

tags = ['meta','rokid','snap','vuzix','xreal']
querys = ['glasses' for i in range(len(tags))]

for query,tag in zip(querys,tags):
    make_preds(query,tag)


100%|███████████████████████████████████████| 4641/4641 [08:04<00:00,  9.57it/s]


The label distribution for glasses meta is: 
 with singular results
llm_label
1    0.673716
0    0.176090
2    0.150194
Name: count, dtype: float64
The label distribution for glasses meta is: 
 with all results
llm_label
1.0    0.659540
0.0    0.177088
2.0    0.163372
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 3347/3347 [05:52<00:00,  9.50it/s]


The label distribution for glasses rokid is: 
 with singular results
llm_label
1    0.630051
2    0.202634
0    0.167315
Name: count, dtype: float64
The label distribution for glasses rokid is: 
 with all results
llm_label
1.0    0.499560
2.0    0.401621
0.0    0.098820
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 4206/4206 [07:20<00:00,  9.55it/s]


The label distribution for glasses snap is: 
 with singular results
llm_label
1    0.499049
2    0.474310
0    0.026641
Name: count, dtype: float64
The label distribution for glasses snap is: 
 with all results
llm_label
1.0    0.499397
2.0    0.476889
0.0    0.023714
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 1318/1318 [02:19<00:00,  9.41it/s]


The label distribution for glasses vuzix is: 
 with singular results
llm_label
2    0.532625
1    0.458270
0    0.009105
Name: count, dtype: float64
The label distribution for glasses vuzix is: 
 with all results
llm_label
2.0    0.630216
1.0    0.364029
0.0    0.005755
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 4257/4257 [07:41<00:00,  9.22it/s]


The label distribution for glasses xreal is: 
 with singular results
llm_label
1    0.662506
2    0.190768
0    0.146726
Name: count, dtype: float64
The label distribution for glasses xreal is: 
 with all results
llm_label
2.0    0.463628
1.0    0.450047
0.0    0.086325
Name: count, dtype: float64
Complete


# Load in all glasses data 

In [128]:
glasses_predictions = glob.glob('../data/predictions/glasses/*')
data = []
for path in glasses_predictions:
    df = pd.read_json(path,orient='records',lines=True)
    data.append(df)
all_glasses_preds = pd.concat(data).reset_index(drop=True)
all_glasses_preds['query'] = 'glasses'
all_glasses_preds.to_json('../data/predictions/final_preds/glasses.json',orient='records',lines=True)

# Data Quality Check 

In [149]:
all_glasses_preds.loc[(all_glasses_preds['tag']=='google') & (all_glasses_preds['llm_label']==1) ][['text']]

Unnamed: 0,text
22669,RT @steepler @bexhillmuseum @WollastonMuseum I did a Google glass search on it and that’s the most likely thing.
22673,"@GoogleARVR @peregrau1969 @unity Google augmented reality glasses project: These are four links about mixed reality technologies. Therefore, we ask engineers and workers to conduct research in order to take advantage of all technologies and innovations and create hologram projects &"
22674,RT @verge Google Glass Enterprise Edition is no more https://t.co/EiKr87RM72 https://t.co/zO3dkonuT9
22677,"@MKBHD The same basic idea of google glass, but covering the whole face..."
22679,"@tnatw Use of chat with Google glasses, for jury selection to read micro expressions. Then constantly redirecting chat throughout the trial, to cater questioning and objections to the flow of information. Smarter lawyers will be a must, parameters will change for every trial."
...,...
29915,Google Glass
29916,Google Glass.
29917,Google Glass.
29918,Google Glass


In [154]:
{'android':[('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchgenpop'), ('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchAndroidONLY', 'Android NOT Apple [GTM 2.0]'), ('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchAppleONLY', 'Apple NOT Android [GTM 2.0]'), ('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchApplebrand', 'Apple overlay (inclusive of Android) [GTM 2.0]'), ('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchAndroidbrand', 'Android overlay (inclusive of Apple) [GTM 2.0]')]}







"{'android’:[('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchgenpop'), ('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchAndroidONLY', 'Android NOT Apple [GTM 2.0]'), ('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchAppleONLY', 'Apple NOT Android [GTM 2.0]'), ('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchApplebrand', 'Apple overlay (inclusive of Android) [GTM 2.0]'), ('https://app.brandwatch.com/project/1998294717/explore/2001813276/measure/volume', 'GTM_prelaunchAndroidbrand', 'Android overlay (inclusive of Apple) [GTM 2.0]’)]}"

# Headsets 

In [153]:
tags = ['apple',
 'dpvr',
 'google',
 'hp',
 'htc',
 'magic_leap',
 'meta',
 'microsoft',
 'oppo',
 'pico',
 'samsung',
 'sony',
 'valve']
querys = ['headsets' for i in range(len(tags))]

for query,tag in zip(querys,tags):
    make_preds(query,tag)


100%|███████████████████████████████████████| 5000/5000 [08:42<00:00,  9.57it/s]


The label distribution for headsets apple is: 
 with singular results
llm_label
1    0.678593
2    0.182111
0    0.139296
Name: count, dtype: float64
The label distribution for headsets apple is: 
 with all results
llm_label
1.0    0.650879
2.0    0.223766
0.0    0.125356
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 2078/2078 [03:38<00:00,  9.50it/s]


The label distribution for headsets dpvr is: 
 with singular results
llm_label
1    0.625723
2    0.296243
0    0.078035
Name: count, dtype: float64
The label distribution for headsets dpvr is: 
 with all results
llm_label
1.0    0.646261
2.0    0.294957
0.0    0.058783
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:46<00:00,  9.49it/s]


The label distribution for headsets google is: 
 with singular results
llm_label
1    0.792434
0    0.168535
2    0.039031
Name: count, dtype: float64
The label distribution for headsets google is: 
 with all results
llm_label
1.0    0.759879
0.0    0.206023
2.0    0.034098
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 3870/3870 [06:47<00:00,  9.50it/s]


The label distribution for headsets hp is: 
 with singular results
llm_label
1    0.682958
0    0.182312
2    0.134730
Name: count, dtype: float64
The label distribution for headsets hp is: 
 with all results
llm_label
1.0    0.719380
0.0    0.153707
2.0    0.126914
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:45<00:00,  9.51it/s]


The label distribution for headsets htc is: 
 with singular results
llm_label
1    0.615708
2    0.246444
0    0.137848
Name: count, dtype: float64
The label distribution for headsets htc is: 
 with all results
llm_label
1.0    0.603147
2.0    0.275463
0.0    0.121390
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 3028/3028 [05:17<00:00,  9.53it/s]


The label distribution for headsets magic_leap is: 
 with singular results
llm_label
1    0.633719
2    0.250248
0    0.116033
Name: count, dtype: float64
The label distribution for headsets magic_leap is: 
 with all results
llm_label
1.0    0.586067
2.0    0.317569
0.0    0.096364
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:47<00:00,  9.48it/s]


The label distribution for headsets meta is: 
 with singular results
llm_label
1    0.583786
2    0.256286
0    0.159928
Name: count, dtype: float64
The label distribution for headsets meta is: 
 with all results
llm_label
1.0    0.554833
2.0    0.297922
0.0    0.147245
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:46<00:00,  9.50it/s]


The label distribution for headsets microsoft is: 
 with singular results
llm_label
1    0.832799
0    0.142141
2    0.025060
Name: count, dtype: float64
The label distribution for headsets microsoft is: 
 with all results
llm_label
1.0    0.820992
0.0    0.138927
2.0    0.040082
Name: count, dtype: float64
Complete


100%|█████████████████████████████████████████| 255/255 [00:26<00:00,  9.50it/s]


The label distribution for headsets oppo is: 
 with singular results
llm_label
1    0.925490
2    0.066667
0    0.007843
Name: count, dtype: float64
The label distribution for headsets oppo is: 
 with all results
llm_label
1.0    0.843243
2.0    0.151351
0.0    0.005405
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:45<00:00,  9.51it/s]


The label distribution for headsets pico is: 
 with singular results
llm_label
1    0.605084
2    0.259408
0    0.135508
Name: count, dtype: float64
The label distribution for headsets pico is: 
 with all results
llm_label
1.0    0.585269
2.0    0.297195
0.0    0.117536
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:44<00:00,  9.53it/s]


The label distribution for headsets samsung is: 
 with singular results
llm_label
1    0.776376
0    0.163363
2    0.060260
Name: count, dtype: float64
The label distribution for headsets samsung is: 
 with all results
llm_label
1.0    0.792044
0.0    0.150931
2.0    0.057026
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:52<00:00,  9.39it/s]


The label distribution for headsets sony is: 
 with singular results
llm_label
1    0.794584
0    0.114142
2    0.091274
Name: count, dtype: float64
The label distribution for headsets sony is: 
 with all results
llm_label
1.0    0.808416
0.0    0.100582
2.0    0.091002
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:48<00:00,  9.46it/s]


The label distribution for headsets valve is: 
 with singular results
llm_label
1    0.562575
2    0.253911
0    0.183514
Name: count, dtype: float64
The label distribution for headsets valve is: 
 with all results
llm_label
1.0    0.557583
2.0    0.277201
0.0    0.165216
Name: count, dtype: float64
Complete


# Categories 

### Unlike the other queries we need to adjust the prompt---using Gemini we'll optimize the prompt for best results based on true labels


In [191]:
cat_data = load_in_data('categories')

In [192]:
df,df_unique_snippets,excluded_uid_from_unique, df_train, df_test = grab_specific_tag_data_breakdown_test_and_train(cat_data,'apple')

In [187]:
df['source'].unique()

array(['manual_export', 'reddit'], dtype=object)

In [172]:
df_test.to_csv('../data/categories_test_data/apple_test_data.csv')

In [193]:
# Step 1 run results with gemini 
prompt = generate_prompt_categories('Apple')
results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), df_test, label_name='llm_label')

100%|███████████████████████████████████████████| 50/50 [00:05<00:00,  9.58it/s]


In [194]:
results.to_csv('../data/categories_test_data/apple_test_data.csv')

In [190]:
df['pageType'].unique()

array([nan, 'reddit'], dtype=object)

In [196]:
tags = ['amazon','apple','google','meta','microsoft']
querys = ['categories' for i in range(len(tags))]

for query,tag in zip(querys,tags):
    make_preds_categories(query,tag)

100%|███████████████████████████████████████| 5000/5000 [08:41<00:00,  9.58it/s]


The label distribution for categories amazon is: 
 with singular results
llm_label
1    0.768676
2    0.153215
0    0.078109
Name: count, dtype: float64
The label distribution for categories amazon is: 
 with all results
llm_label
1.0    0.767809
2.0    0.160416
0.0    0.071775
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:42<00:00,  9.58it/s]


The label distribution for categories apple is: 
 with singular results
llm_label
1    0.476697
2    0.353757
0    0.169546
Name: count, dtype: float64
The label distribution for categories apple is: 
 with all results
llm_label
1.0    0.448908
2.0    0.383416
0.0    0.167676
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:42<00:00,  9.57it/s]


The label distribution for categories google is: 
 with singular results
llm_label
1    0.641077
2    0.235732
0    0.123191
Name: count, dtype: float64
The label distribution for categories google is: 
 with all results
llm_label
1.0    0.629382
2.0    0.253571
0.0    0.117047
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:43<00:00,  9.55it/s]


The label distribution for categories meta is: 
 with singular results
llm_label
1    0.519864
2    0.333266
0    0.146870
Name: count, dtype: float64
The label distribution for categories meta is: 
 with all results
llm_label
1.0    0.495747
2.0    0.356742
0.0    0.147511
Name: count, dtype: float64
Complete


100%|███████████████████████████████████████| 5000/5000 [08:42<00:00,  9.57it/s]


The label distribution for categories microsoft is: 
 with singular results
llm_label
1    0.643287
2    0.196593
0    0.160120
Name: count, dtype: float64
The label distribution for categories microsoft is: 
 with all results
llm_label
1.0    0.634950
2.0    0.210292
0.0    0.154757
Name: count, dtype: float64
Complete


# Load in final preds data 

In [197]:
headsets_predictions = glob.glob('../data/predictions/headsets/*')
data = []
for path in headsets_predictions:
    df = pd.read_json(path,orient='records',lines=True)
    data.append(df)
all_headsets_preds = pd.concat(data).reset_index(drop=True)
all_headsets_preds['query'] = 'headsets'
all_headsets_preds.to_json('../data/predictions/final_preds/headsets.json',orient='records',lines=True)

In [199]:
categories_predictions = glob.glob('../data/predictions/categories/*')
data = []
for path in categories_predictions:
    df = pd.read_json(path,orient='records',lines=True)
    data.append(df)
all_cat_preds = pd.concat(data).reset_index(drop=True)
all_cat_preds['query'] = 'categories'
all_cat_preds.to_json('../data/predictions/final_preds/categories.json',orient='records',lines=True)

# Combine ALL predictions into one single dataframe 

In [200]:
all_predictions = glob.glob('../data/predictions/final_preds/*')
data = []
for path in all_predictions:
    df = pd.read_json(path,orient='records',lines=True)
    data.append(df)
all_preds = pd.concat(data).reset_index(drop=True)
all_preds.to_json('../data/predictions/final_preds/all_preds.json',orient='records',lines=True)