Data processing:

In [None]:
import json
import time
import torch
from transformers import pipeline
import numpy as np
# Initialize the zero-shot classification pipeline.
# Use the GPU when one is visible and fall back to the CPU otherwise, so this
# cell does not fail outright on a machine without CUDA.
classifier = pipeline(
    "zero-shot-classification",
    model="facebook/bart-large-mnli",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)


In [1]:
#"./categories_3.txt"
# './unique_books_3.json'
def prepare_labels(file_name,num_labels):
  """Read candidate labels, one per line, from a text file.

  Args:
    file_name: Path of a text file holding one label per line.
    num_labels: Upper bound on how many labels to return, counted from the
      top of the file. Values larger than the number of labels in the file
      simply return all of them.

  Returns:
    A list of whitespace-stripped label strings in file order. Lines that
    are empty after stripping are skipped, so a blank or trailing empty line
    never turns into an empty-string label.
  """
  # Explicit UTF-8 keeps the result independent of the platform's locale.
  with open(file_name, "r", encoding="utf-8") as cate:
    stripped_lines = (line.strip() for line in cate)
    google_category_labels = [label for label in stripped_lines if label]

  # Slicing already clamps to the list length, so no explicit bounds check is needed.
  return google_category_labels[:num_labels]


def prepare_text_data(file_name):
  """Collect the non-empty "google_description" values from a JSON file.

  The file must contain a JSON object. Every truthy value of that object is
  queried with ``.get("google_description")``; falsy values (``None``, ``{}``)
  are skipped.

  Args:
    file_name: Path of the JSON file to read.

  Returns:
    A list, in file order, of the descriptions that are neither ``None`` nor
    empty.

  Side effects:
    Prints the number of top-level entries and the full list of extracted
    descriptions.
  """
  # JSON text is UTF-8 by convention; do not rely on the locale default.
  with open(file_name, 'r', encoding='utf-8') as f:
    data = json.load(f)

  # The file is fully parsed at this point, so the handle is released before
  # any filtering or printing happens.
  print(len(data))
  descriptions = [entry.get("google_description") for entry in data.values() if entry]
  google_text_data = [text for text in descriptions if text is not None and len(text)>0]
  print("the google text data is ",google_text_data)
  return google_text_data

def label_extraction(num_out,num_left,text_data,labels,classify_fn=None):
  """Iteratively drop the lowest-scoring candidate labels.

  Each round scores every remaining label against every text, sums the
  scores per label over all texts, and removes the ``num_out`` labels with
  the smallest totals. Rounds repeat while more than ``num_left`` labels
  remain and there are more than ``num_out`` labels left to choose from.

  Args:
    num_out: Number of labels removed per round; must be at least 1.
    num_left: The loop stops once the number of labels is no longer above
      this value.
    text_data: Texts passed to the classifier as its first argument.
    labels: Initial candidate labels. The input sequence is not modified.
    classify_fn: Optional callable invoked as
      ``classify_fn(text_data, candidate_labels=..., batch_size=128)`` and
      returning, per text, a mapping with ``'labels'`` and ``'scores'``
      entries. Defaults to the module-level ``classifier``.

  Returns:
    The surviving labels, ordered from highest to lowest total score as of
    the last round (or the original order if no round ran).

  Raises:
    ValueError: If ``num_out`` is smaller than 1. A value of 0 would make the
      ``[:-num_out]`` slice empty and discard every label after one round.
  """
  if num_out < 1:
    raise ValueError("num_out must be a positive integer")
  if classify_fn is None:
    classify_fn = classifier

  labels = list(labels)
  round_index = 0  # avoids shadowing the builtin round()
  while len(labels) > num_left and len(labels) > num_out:
    print("this is the ",round_index, " round.")
    time_start=time.time()

    scores = dict.fromkeys(labels, 0)
    results = classify_fn(text_data, candidate_labels=labels, batch_size=128)
    # NOTE(review): some pipeline versions appear to return a bare dict rather
    # than a one-element list for a single input; normalise so the
    # accumulation loop below sees a list either way.
    if isinstance(results, dict):
      results = [results]
    for result in results:
      for (label, score) in zip(result['labels'], result['scores']):
        scores[label] += score

    elapsed_time = time.time() - time_start
    print("------------------------------------------")
    print(f"Time has spent for {elapsed_time} seconds")

    # Totals aligned with the current order of `labels`.
    totals = np.array([scores[label] for label in labels])

    # Indices of totals sorted in descending order (highest scores first).
    indices = np.argsort(-totals)

    # Eliminate the least relevant categories: keep everything but the last num_out.
    labels = [labels[i] for i in indices[:-num_out]]
    # Print remaining labels after each elimination round
    print("Remaining Labels:", labels)
    round_index += 1

  # Final remaining labels after all eliminations
  print("Final Categories:", labels)
  return labels



from transformers import pipeline

# Summarization pipeline shared by all summarize_text calls; built on first use.
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared summarization pipeline, constructing it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline("summarization", model="facebook/bart-large-cnn")
    return _SUMMARIZER


def summarize_text(input_text):
    """Summarize ``input_text`` with the facebook/bart-large-cnn pipeline.

    Args:
        input_text: Text to summarize. Falsy input (``None``, ``""``) is not
            sent to the model.

    Returns:
        The ``'summary_text'`` of the first pipeline result, or the literal
        string ``"no_content"`` when ``input_text`` is falsy.
    """
    if not input_text:
        return "no_content"
    # Reuse one pipeline instead of constructing a new one on every call.
    summarizer = _get_summarizer()
    summary = summarizer(input_text, max_length=30, min_length=5, do_sample=False)
    return summary[0]['summary_text']


def classify_google(json_file_name,label_file_name,num_labels,num_out,num_left):
  """Run label elimination for one labels file and one JSON data file.

  Reads up to ``num_labels`` candidate labels with ``prepare_labels``, loads
  the descriptions with ``prepare_text_data``, and passes both to
  ``label_extraction`` together with ``num_out`` and ``num_left``.

  Returns:
    Whatever ``label_extraction`` returns.
  """
  candidate_labels = prepare_labels(label_file_name,num_labels)
  descriptions = prepare_text_data(json_file_name)
  # Optional pre-summarization step, currently disabled:
  #descriptions = [summarize_text(text) for text in descriptions]
  return label_extraction(num_out,num_left,descriptions,candidate_labels)


#"./categories_3.txt"
# './unique_books_3.json'

# Input paths for this run.
json_file_name = 'google_09_10.json'
label_file_name = "output_words_only.txt"

# Elimination settings: labels read from the file, labels dropped per round,
# and the count at which elimination stops.
num_labels = 210
num_out = 50
num_left = 10

final_labels = classify_google(
    json_file_name,
    label_file_name,
    num_labels,
    num_out,
    num_left,
)

  from .autonotebook import tqdm as notebook_tqdm


90
this is the  0  round.
------------------------------------------
Time has spent for 243.16902995109558 seconds
Remaining Labels: ['thinking', 'tale', 'sequel', 'host', 'attention', 'drama', 'power', 'position', 'areas', 'leader', 'danger', 'characters', 'investigation', 'company', 'guide', 'action', 'number', 'home', 'work', 'family', 'fans', 'media', 'issues', 'rights', 'movement', 'lessons', 'variety', 'style', 'lead', 'support', 'player', 'truths', 'voice', 'groups', 'capital', 'success', 'world', 'heart', 'record', 'lives', 'program', 'decisions', 'region', 'advice', 'division', 'wit', 'brand', 'classic', 'producer', 'state', 'states', 'old', 'women', 'league', 'young', 'process', 'friend', 'man', 'word', 'boy', 'community', 'franchise', 'face', 'championship', 'majority', 'legacy', 'class', 'episode', 'research', 'activities', 'comedy', 'records', 'age', 'secrets', 'game', 'history', 'appearances', 'love', 'leadership', 'development', 'life', 'players', 'track', 'language', 'l

Trend Prediction:

In [2]:
# One-shot prompt: task instruction, a single worked example, then the query
# whose answer is left open after the trailing "-> ".
instruction = "You're an expert in trend prediction, I want you to predict this year's trending word based on previous years'.\n"

examples = (
    "Here is one example: \n"
    "Trend Prediction: top 10 most popular categories in previous years: media, classic, power, attention, groups, success, variety, style, guide, areas -> media\n"
    "Example end, now answer the question:\n"
)

question = "Trend Prediction: top 10 most popular categories in previous years: media, power, areas, classic, thinking, attention, leader, groups, company, variety -> "

text = "".join((instruction, examples, question))

print(text)

You're an expert in trend prediction, I want you to predict this year's trending word based on previous years'.
Here is one example: 
Trend Prediction: top 10 most popular categories in previous years: media, classic, power, attention, groups, success, variety, style, guide, areas -> media
Example end, now answer the question:
Trend Prediction: top 10 most popular categories in previous years: media, power, areas, classic, thinking, attention, leader, groups, company, variety -> 


In [3]:
from transformers import pipeline
# Fill-mask pipeline used below to complete the prompt's final token.
unmasker = pipeline(task="fill-mask", model="roberta-base")

In [4]:

# Append the tokenizer's own mask token instead of a hard-coded "<mask>", so the
# prompt stays valid if the fill-mask model is swapped for one whose tokenizer
# uses a different mask string.
unmasker(text + unmasker.tokenizer.mask_token)

[{'score': 0.42050355672836304,
  'token': 433,
  'token_str': ' media',
  'sequence': "You're an expert in trend prediction, I want you to predict this year's trending word based on previous years'.\nHere is one example: \nTrend Prediction: top 10 most popular categories in previous years: media, classic, power, attention, groups, success, variety, style, guide, areas -> media\nExample end, now answer the question:\nTrend Prediction: top 10 most popular categories in previous years: media, power, areas, classic, thinking, attention, leader, groups, company, variety -> media"},
 {'score': 0.021001208573579788,
  'token': 2474,
  'token_str': ' marketing',
  'sequence': "You're an expert in trend prediction, I want you to predict this year's trending word based on previous years'.\nHere is one example: \nTrend Prediction: top 10 most popular categories in previous years: media, classic, power, attention, groups, success, variety, style, guide, areas -> media\nExample end, now answer the