In [25]:
import glob
import os
import ast
import pandas as pd
import google.cloud.aiplatform as aiplatform
%reload_ext autoreload
%autoreload 2
from utils import generate_prompt_categories,obtain_results_gemini_uid_dict,generate_prompt,generate_prompt_batch,run_llm_label_flow,load_in_data,grab_specific_tag_data_breakdown_test_and_train,chunk_dataframe_into_batches,run_llm_label_flow_gemini
from constants import query_tags,sentiment_dict
from sklearn.metrics import classification_report
pd.set_option('max_colwidth', 800)

In [26]:
# To add to utils 
def combine_results_data(all_data,exclude_ids,results_df):
    data_excluded = all_data.loc[all_data['uid'].isin(exclude_ids)].reset_index(drop=True)
    text_map = {text:label for text,label in zip(results_df['text'],results_df['llm_label'])}
    data_excluded['llm_label'] = data_excluded['text'].apply(lambda x: text_map[x] if x in text_map.keys() else None).reset_index(drop=True)
    data_excluded = data_excluded.loc[~data_excluded['llm_label'].isna()].reset_index(drop=True)
    all_results = pd.concat([results_df,data_excluded]).reset_index(drop=True)
    return all_results


def make_preds(query,tag):
    # Load in main data
    main_query_data = load_in_data(query)
    
    # Load in sampled data
    all_data, unique_text_data, excluded_uid_data, train_data, test_data = grab_specific_tag_data_breakdown_test_and_train(main_query_data,tag)
    
    # Condition for sample 
    if len(train_data)>5000:
        train_data = train_data.sample(n=5000,random_state=42)
        
    # Create prompt and make preds
    prompt = generate_prompt(query.title(), tag.title())
    results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), train_data, label_name='llm_label')
    
    print(f'The label distribution for {query} {tag} is: \n with singular results')
    print(results['llm_label'].value_counts()/len(results))
    
    # Combine text to label 
    all_results = combine_results_data(all_data,excluded_uid_data,results)
    
    print(f'The label distribution for {query} {tag} is: \n with all results')
    print(all_results['llm_label'].value_counts()/len(all_results))
    
    # Save data
    all_results.to_json(f'../data/predictions/{query}/{tag}.json',orient='records',lines=True)
    print('Complete')

def make_preds_categories(query,tag):
    # Load in main data
    main_query_data = load_in_data(query)
    
    # Load in sampled data
    all_data, unique_text_data, excluded_uid_data, train_data, test_data = grab_specific_tag_data_breakdown_test_and_train(main_query_data,tag)
    
    # Condition for sample 
    if len(train_data)>5000:
        train_data = train_data.sample(n=5000,random_state=42)
        
    # Create prompt and make preds
    prompt = generate_prompt_categories(tag.title())
    results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), train_data, label_name='llm_label')
    
    print(f'The label distribution for {query} {tag} is: \n with singular results')
    print(results['llm_label'].value_counts()/len(results))
    
    # Combine text to label 
    all_results = combine_results_data(all_data,excluded_uid_data,results)
    
    print(f'The label distribution for {query} {tag} is: \n with all results')
    print(all_results['llm_label'].value_counts()/len(all_results))
    
    # Save data
    all_results.to_json(f'../data/predictions/{query}/{tag}.json',orient='records',lines=True)
    print('Complete')
    

# Creating pipeline workflow for fast processing 


args: query,tag

In [30]:
# Rename all files 
# Rename files in directory
import logging
import glob
def rename_files(tag):
    file_paths = glob.glob(f"../data/{tag}/*.csv")
    for file in file_paths:
        os.system(f" mv {file} {file.replace(tag.title()+'_','').lower()}")
        logging.info(f"Finished processing for {tag}")

In [31]:
def make_preds(query,tag):
    # Load in main data
    rename_files(tag)
    main_query_data = load_in_data(query)
    
    # Load in sampled data
    all_data, unique_text_data, excluded_uid_data, train_data, test_data = grab_specific_tag_data_breakdown_test_and_train(main_query_data,tag)
    
    # Condition for sample 
    if len(train_data)>5000:
        train_data = train_data.sample(n=5000,random_state=42)
        
    # Create prompt and make preds
    prompt = generate_prompt(query.title(), tag.title())
    results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), train_data, label_name='llm_label')
    
    print(f'The label distribution for {query} {tag} is: \n with singular results')
    print(results['llm_label'].value_counts()/len(results))
    
    # Combine text to label 
    all_results = combine_results_data(all_data,excluded_uid_data,results)
    
    print(f'The label distribution for {query} {tag} is: \n with all results')
    print(all_results['llm_label'].value_counts()/len(all_results))
    
    # Save data
    all_results.to_json(f'../data/predictions/{query}/{tag}.json',orient='records',lines=True)
    print('Complete')
    

In [None]:
glob.glob(f"../data/glasses/*.csv")[0].replace("Glasses_",'')

'../data/glasses/apple.csv'

In [29]:
# Rename all files 
# Rename files in directory
import logging
import glob
def rename_files(tag):
    file_paths = glob.glob(f"../data/{tag}/*.csv")
    for file in file_paths:
        os.system(f" mv {file} {file.replace(tag.title()+'_','').lower()}")
        logging.info(f"Finished processing for {tag}")

    


        

In [12]:
for file in glob.glob(f"../data/categories/*.csv"):
    os.system(f" mv {file} {file.replace('category','categories')} ")

In [19]:
data = pd.read_csv(f"../data/glasses/apple.csv",header=10)

# Glasses 

In [50]:
# Glasses 
# Go do and make preds for all

#tags = ['meta','rokid','snap','vuzix','xreal','google','apple','lenovo']
tags = ['xreal','google','apple','lenovo']
querys = ['glasses' for i in range(len(tags))]

for query,tag in zip(querys,tags):
    print(query,tag)
    make_preds(query,tag)

glasses meta
['/Users/pvacca/git/project-google-arxr-analytics/data/glasses/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/lenovo.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/snap.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/vuzix.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/rokid.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/meta.csv']


  df = pd.read_csv(path,header=10)
100%|██████████| 5000/5000 [08:55<00:00,  9.33it/s]


The label distribution for glasses meta is: 
 with singular results
llm_label
0    0.755779
1    0.162814
2    0.081407
Name: count, dtype: float64
The label distribution for glasses meta is: 
 with all results
llm_label
0.0    0.758194
1.0    0.161138
2.0    0.080667
Name: count, dtype: float64
Complete
glasses rokid
['/Users/pvacca/git/project-google-arxr-analytics/data/glasses/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/lenovo.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/snap.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/vuzix.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/rokid.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/meta.csv']


  df = pd.read_csv(path,header=10)
100%|██████████| 1279/1279 [02:16<00:00,  9.35it/s]


The label distribution for glasses rokid is: 
 with singular results
llm_label
0    0.752941
2    0.123922
1    0.123137
Name: count, dtype: float64
The label distribution for glasses rokid is: 
 with all results
llm_label
0.0    0.756744
2.0    0.165277
1.0    0.077979
Name: count, dtype: float64
Complete
glasses snap
['/Users/pvacca/git/project-google-arxr-analytics/data/glasses/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/lenovo.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/snap.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/vuzix.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/rokid.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/meta.csv']


  df = pd.read_csv(path,header=10)
100%|██████████| 440/440 [00:47<00:00,  9.33it/s]


The label distribution for glasses snap is: 
 with singular results
llm_label
0    0.779343
2    0.112676
1    0.107981
Name: count, dtype: float64
The label distribution for glasses snap is: 
 with all results
llm_label
0.0    0.785877
2.0    0.109339
1.0    0.104784
Name: count, dtype: float64
Complete
glasses vuzix
['/Users/pvacca/git/project-google-arxr-analytics/data/glasses/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/lenovo.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/snap.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/vuzix.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/rokid.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/meta.csv']


  df = pd.read_csv(path,header=10)
100%|██████████| 168/168 [00:17<00:00,  9.39it/s]


The label distribution for glasses vuzix is: 
 with singular results
llm_label
0    0.648810
2    0.244048
1    0.107143
Name: count, dtype: float64
The label distribution for glasses vuzix is: 
 with all results
llm_label
0.0    0.613260
2.0    0.226519
1.0    0.160221
Name: count, dtype: float64
Complete
glasses xreal
['/Users/pvacca/git/project-google-arxr-analytics/data/glasses/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/lenovo.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/snap.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/vuzix.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/rokid.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/glasses/meta.csv']


  df = pd.read_csv(path,header=10)


ValueError: a must be greater than 0 unless no samples are taken

# Load in all glasses data 

In [52]:
glasses_predictions = glob.glob('../data/predictions/glasses/*')
data = []
for path in glasses_predictions:
    df = pd.read_json(path,orient='records',lines=True)
    data.append(df)
all_glasses_preds = pd.concat(data).reset_index(drop=True)
all_glasses_preds['query'] = 'glasses'
all_glasses_preds.to_json('../data/predictions/final_preds/glasses.json',orient='records',lines=True)

# Data Quality Check 

In [149]:
all_glasses_preds.loc[(all_glasses_preds['tag']=='google') & (all_glasses_preds['llm_label']==1) ][['text']]

Unnamed: 0,text
22669,RT @steepler @bexhillmuseum @WollastonMuseum I did a Google glass search on it and that’s the most likely thing.
22673,"@GoogleARVR @peregrau1969 @unity Google augmented reality glasses project: These are four links about mixed reality technologies. Therefore, we ask engineers and workers to conduct research in order to take advantage of all technologies and innovations and create hologram projects &"
22674,RT @verge Google Glass Enterprise Edition is no more https://t.co/EiKr87RM72 https://t.co/zO3dkonuT9
22677,"@MKBHD The same basic idea of google glass, but covering the whole face..."
22679,"@tnatw Use of chat with Google glasses, for jury selection to read micro expressions. Then constantly redirecting chat throughout the trial, to cater questioning and objections to the flow of information. Smarter lawyers will be a must, parameters will change for every trial."
...,...
29915,Google Glass
29916,Google Glass.
29917,Google Glass.
29918,Google Glass


# Headsets 

In [51]:
tags = ['apple',
 'dpvr',
 'google',
 'hp',
 'htc',
 'magic_leap',
 'meta',
 'microsoft',
 'oppo',
 'pico',
 'samsung',
 'sony',
 'valve']
querys = ['headsets' for i in range(len(tags))]

for query,tag in zip(querys,tags):
    make_preds(query,tag)


['/Users/pvacca/git/project-google-arxr-analytics/data/headsets/magic_leap.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/oppo.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/sony.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/dpvr.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/hp.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/pico.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/meta.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/microsoft.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/samsung.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/headsets/htc.csv']


  df = pd.read_csv(path,header=10)
  df = pd.read_csv(path,header=10)
  df = pd.read_csv(path,header=10)
  df = pd.read_csv(path,header=10)
  df = pd.read_csv(path,header=10)
100%|██████████| 5000/5000 [09:01<00:00,  9.23it/s]


# Categories 

### Unlike the other queries we need to adjust the prompt---using Gemini we'll optimize the prompt for best results based on true labels


In [35]:
#rename_files('categories')
cat_data = load_in_data('categories')

['/Users/pvacca/git/project-google-arxr-analytics/data/categories/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/amazon.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/meta.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/microsoft.csv']


  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')


['/Users/pvacca/git/project-google-arxr-analytics/data/categories/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/amazon.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/meta.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/microsoft.csv']


  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')


['/Users/pvacca/git/project-google-arxr-analytics/data/categories/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/amazon.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/meta.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/microsoft.csv']


  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')
  df['tag'] = path.split('/')[-1].split('.')[0].replace(f'{query}_','')


In [36]:
df,df_unique_snippets,excluded_uid_from_unique, df_train, df_test = grab_specific_tag_data_breakdown_test_and_train(cat_data,'apple')

In [42]:
df['source'].unique()

array(['manual_export'], dtype=object)

In [39]:
df_test.to_csv('../data/categories_test_data/apple_test_data.csv')

In [45]:
# Step 1 run results with gemini 
prompt = generate_prompt_categories('Apple')
results = run_llm_label_flow_gemini(sentiment_dict, prompt, list(sentiment_dict.keys()), df_test, label_name='llm_label')

100%|██████████| 50/50 [00:05<00:00,  9.48it/s]


In [46]:
results.to_csv('../data/categories_test_data/apple_test_data.csv')

In [47]:
df['pageType'].unique()

KeyError: 'pageType'

In [48]:
tags = ['amazon','apple','google','meta','microsoft']
querys = ['categories' for i in range(len(tags))]

for query,tag in zip(querys,tags):
    make_preds_categories(query,tag)

['/Users/pvacca/git/project-google-arxr-analytics/data/categories/google.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/amazon.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/apple.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/meta.csv', '/Users/pvacca/git/project-google-arxr-analytics/data/categories/microsoft.csv']


  df = pd.read_csv(path,header=10)
  df = pd.read_csv(path,header=10)
  df = pd.read_csv(path,header=10)
  df = pd.read_csv(path,header=10)
100%|██████████| 4950/4950 [08:45<00:00,  9.42it/s]


The label distribution for categories amazon is: 
 with singular results
llm_label
0    0.621141
1    0.348199
2    0.030660
Name: count, dtype: float64
The label distribution for categories amazon is: 
 with all results
llm_label
0.0    0.616724
1.0    0.355295
2.0    0.027981
Name: count, dtype: float64


OSError: Cannot save file into a non-existent directory: '../data/predictions/categories'

# Load in final preds data 

In [6]:
headsets_predictions = glob.glob('../data/predictions/headsets/*')
data = []
for path in headsets_predictions:
    df = pd.read_json(path,orient='records',lines=True)
    data.append(df)
all_headsets_preds = pd.concat(data).reset_index(drop=True)
all_headsets_preds['query'] = 'headsets'
all_headsets_preds.to_json('../data/predictions/final_preds/headsets.json',orient='records',lines=True)

In [7]:
categories_predictions = glob.glob('../data/predictions/categories/*')
data = []
for path in categories_predictions:
    df = pd.read_json(path,orient='records',lines=True)
    data.append(df)
all_cat_preds = pd.concat(data).reset_index(drop=True)
all_cat_preds['query'] = 'categories'
all_cat_preds.to_json('../data/predictions/final_preds/categories.json',orient='records',lines=True)

# Combine ALL predictions into one single dataframe 

In [8]:
all_predictions = glob.glob('../data/predictions/final_preds/*')
data = []
for path in all_predictions:
    df = pd.read_json(path,orient='records',lines=True)
    data.append(df)
all_preds = pd.concat(data).reset_index(drop=True)
all_preds.to_json('../data/predictions/final_preds/all_preds.json',orient='records',lines=True)