<a href="https://colab.research.google.com/github/abigailhaddad/ChatGPT_with_Python_for_shiny_docs/blob/master/cumulative_words_and_tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports and Functions

In [32]:
!pip install plotly pandas einops transformers torch

from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import torch.nn.functional as F
import pandas as pd
import plotly.express as px
import string
from collections import Counter

def load_huggingface_model(model_name):
    """Load a causal language model and its tokenizer by name.

    Both objects are loaded with ``trust_remote_code=True``, which lets the
    model repository's own Python code run locally.

    Args:
        model_name: Identifier passed to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    shared_kwargs = {'trust_remote_code': True}
    # The tokenizer is fetched first, then the model weights.
    tokenizer = AutoTokenizer.from_pretrained(model_name, **shared_kwargs)
    model = AutoModelForCausalLM.from_pretrained(model_name, **shared_kwargs)
    return model, tokenizer

def encode_prompt(tokenizer, prompt):
    """Encode ``prompt`` with ``tokenizer.encode``, requesting PyTorch tensors.

    Args:
        tokenizer: Object exposing ``encode(text, return_tensors=...)``.
        prompt: Text to encode.

    Returns:
        Whatever ``tokenizer.encode`` returns for ``return_tensors='pt'``.
    """
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    return input_ids

def get_logits(model, input_ids):
    """Run one forward pass without gradient tracking and return its logits.

    Args:
        model: Callable model; invoked with ``return_dict=True`` and
            ``output_attentions=False``.
        input_ids: Encoded prompt passed straight to the model.

    Returns:
        The ``logits`` attribute of the model's output.
    """
    # Inference only: no_grad avoids building the autograd graph.
    with torch.no_grad():
        return model(input_ids, return_dict=True, output_attentions=False).logits

def generate_response(model, input_ids, max_length=50, pad_token_id=None):
    """Generate a continuation of ``input_ids`` without gradient tracking.

    The padding id no longer comes from a module-level ``tokenizer`` global,
    so the function works outside the notebook session that defined one.

    Args:
        model: Object exposing ``generate(input_ids, max_length=..., pad_token_id=...)``.
        input_ids: Encoded prompt passed straight to ``model.generate``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        pad_token_id: Padding id to forward. When ``None``, the model's own
            end-of-sequence id is used (``generation_config`` first, then
            ``config``); if neither defines one, ``None`` is forwarded.

    Returns:
        Whatever ``model.generate`` returns.
    """
    if pad_token_id is None:
        for config_name in ('generation_config', 'config'):
            eos_id = getattr(getattr(model, config_name, None), 'eos_token_id', None)
            # An EOS setting may be a list of ids; padding needs a single id.
            if isinstance(eos_id, (list, tuple)):
                eos_id = eos_id[0] if eos_id else None
            if eos_id is not None:
                pad_token_id = eos_id
                break

    with torch.no_grad():
        outputs = model.generate(input_ids, max_length=max_length, pad_token_id=pad_token_id)
    return outputs

def decode_response(tokenizer, response):
    """Decode the first sequence in ``response`` to text, dropping special tokens.

    Args:
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
        response: Indexable collection of sequences; only element 0 is decoded.

    Returns:
        The result of ``tokenizer.decode`` for the first sequence.
    """
    first_sequence = response[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

def process_logits(logits, tokenizer, top_k=50):
    """Return the ``top_k`` most probable tokens with their probabilities.

    Args:
        logits: Tensor of unnormalised scores with the vocabulary on the last
            axis. A 3-D tensor is treated as (batch, sequence, vocab) and only
            the final sequence position is used; a 1-D tensor is treated as a
            single row. Only the first batch row is reported.
        tokenizer: Object exposing ``decode(list_of_ids)``.
        top_k: Maximum number of tokens to return; clamped to the vocabulary
            size so an oversized value does not make ``torch.topk`` raise.

    Returns:
        A list of ``(token_text, probability)`` tuples, most probable first.
        Both elements are plain Python values, so the tuples are hashable.
    """
    if logits.dim() == 3:
        # Keep the last position only; indexing [0] on the 3-D tensor would
        # yield one row of ids per position instead of single token ids.
        logits = logits[:, -1, :]
    elif logits.dim() == 1:
        logits = logits.unsqueeze(0)

    probs = F.softmax(logits, dim=-1)
    k = min(top_k, probs.size(-1))
    top_probs, top_indices = torch.topk(probs, k)

    # Convert to plain ints/floats once so decode() gets ordinary token ids.
    token_ids = top_indices[0].tolist()
    top_tokens = [tokenizer.decode([token_id]) for token_id in token_ids]
    return list(zip(top_tokens, top_probs[0].tolist()))

def clean_text(text):
    """Return the set of lowercase words in ``text`` with ASCII punctuation removed.

    Punctuation characters (``string.punctuation``) are deleted rather than
    replaced by spaces, so ``"don't"`` becomes ``"dont"``. Words are split on
    whitespace.
    """
    punctuation_stripper = str.maketrans('', '', string.punctuation)
    normalized = text.translate(punctuation_stripper).lower()
    return set(normalized.split())

def collect_data(model, tokenizer, prompt, n_iterations, top_k_logits):
    """Query the model ``n_iterations`` times with ``prompt`` and record each run.

    Every iteration encodes the prompt, takes the logits of a forward pass,
    generates a response with ``generate_response``'s default length, and
    decodes it.

    Args:
        model: Model passed to ``get_logits`` and ``generate_response``.
        tokenizer: Tokenizer used for encoding, decoding and token lookup.
        prompt: Prompt text, repeated unchanged for every iteration.
        n_iterations: Number of records to produce.
        top_k_logits: ``top_k`` forwarded to ``process_logits``.

    Returns:
        A list of dicts with keys ``'Prompt'``, ``'Response'``,
        ``'Iteration'`` (1-based), ``'UniqueWords'`` (from ``clean_text``) and
        ``'Tokens'`` (the set of items returned by ``process_logits``).
    """
    records = []
    for iteration in range(1, n_iterations + 1):
        prompt_ids = encode_prompt(tokenizer, prompt)
        prompt_logits = get_logits(model, prompt_ids)
        generated = generate_response(model, prompt_ids)
        response_text = decode_response(tokenizer, generated)
        ranked_tokens = process_logits(prompt_logits, tokenizer, top_k=top_k_logits)
        record = {
            'Prompt': prompt,
            'Response': response_text,
            'Iteration': iteration,
            'UniqueWords': clean_text(response_text),
            'Tokens': set(ranked_tokens),
        }
        records.append(record)
    return records

def calculate_cumulative_unique_words(data):
    """Return the running count of distinct words across ``data``.

    Args:
        data: Iterable of rows, each with a ``'UniqueWords'`` iterable of words.

    Returns:
        A list with one entry per row: the number of distinct words seen in
        that row and all rows before it.
    """
    def _running_vocabulary_sizes():
        vocabulary = set()
        for record in data:
            vocabulary.update(record['UniqueWords'])
            yield len(vocabulary)

    return list(_running_vocabulary_sizes())

def calculate_cumulative_tokens_coverage(data):
    """Return, per row, the share of top tokens that have appeared in responses so far.

    The vocabulary of seen words is accumulated here from each row's
    ``'UniqueWords'`` instead of being read from an undefined outer name.

    Args:
        data: Iterable of rows, each with ``'UniqueWords'`` (iterable of
            words) and ``'Tokens'`` (collection of tokens). A token may be a
            plain string or a ``(token_text, probability)`` tuple.

    Returns:
        A list with one float per row: the number of distinct tokens matched
        so far divided by the number of tokens in that row, or ``0.0`` when
        the row has no tokens.
    """
    def _token_text(token):
        # Tokens may be (text, probability) pairs; the text is compared after
        # stripping whitespace and lowercasing, since the words are lowercase
        # and whitespace-split.
        text = token[0] if isinstance(token, tuple) else token
        return str(text).strip().lower()

    cumulative_tokens_covered = []
    all_words = set()
    all_covered_tokens = set()

    for row in data:
        # Include the current row's words before measuring its coverage.
        all_words.update(row['UniqueWords'])
        tokens = row['Tokens']
        all_covered_tokens.update(
            token for token in tokens if _token_text(token) in all_words
        )
        coverage = len(all_covered_tokens) / len(tokens) if tokens else 0.0
        cumulative_tokens_covered.append(coverage)

    return cumulative_tokens_covered


def create_plot_unique_words(df):
    """Build a line chart of ``CumulativeUniqueWords`` against ``Iteration``.

    Args:
        df: Data passed to ``px.line``; must provide the ``'Iteration'`` and
            ``'CumulativeUniqueWords'`` columns.

    Returns:
        The figure created by ``px.line`` (not shown here).
    """
    return px.line(
        df,
        x='Iteration',
        y='CumulativeUniqueWords',
        title='Cumulative Unique Words Over Iterations',
    )

def create_plot_tokens_coverage(df):
    """Build a line chart of ``CumulativeTokensCovered`` against ``Iteration``.

    Args:
        df: Data passed to ``px.line``; must provide the ``'Iteration'`` and
            ``'CumulativeTokensCovered'`` columns.

    Returns:
        The figure created by ``px.line`` (not shown here).
    """
    return px.line(
        df,
        x='Iteration',
        y='CumulativeTokensCovered',
        title='Cumulative Tokens Coverage Over Iterations',
    )




## Getting the Model

In [2]:
# Model identifier handed to load_huggingface_model.
model_name = "microsoft/phi-2"
# `model` and `tokenizer` become notebook-level globals used by the cells below.
model, tokenizer = load_huggingface_model(model_name)


tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/755 [00:00<?, ?B/s]

configuration_phi.py:   0%|          | 0.00/2.03k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- configuration_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi.py:   0%|          | 0.00/33.4k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:
- modeling_phi.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/24.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/577M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/69.0 [00:00<?, ?B/s]

## Running the Code

In [28]:
# Experiment settings for this run.
prompt = "What is the process of photosynthesis?"
n_iterations = 2
top_k_logits = 50  # not used anywhere in this cell
data = []

# Inline version of the generation loop: one record per iteration, without
# the logits / 'Tokens' field.
for i in range(n_iterations):
    input_ids = encode_prompt(tokenizer, prompt)
    print("here")  # progress marker: prompt encoded
    response_output = generate_response(model, input_ids)
    print("there")  # progress marker: generation finished
    response = decode_response(tokenizer, response_output)
    print("response")  # progress marker: response decoded
    unique_words = clean_text(response)

    # Iteration is stored 1-based.
    data.append({
            'Prompt': prompt,
            'Response': response,
            'Iteration': i + 1,
            'UniqueWords': unique_words,
        })

here
there
response
here
there
response


array([{'light', 'occurs', 'chemical', 'cells', 'plant', 'some', 'algae', 'photosynthesis', 'which', 'solution', 'process', 'bacteria', 'the', 'form', 'plants', 'of', 'is', 'glucose', 'chloroplasts', 'energy', 'what', 'by', 'in', 'convert', 'into', 'and', 'it'},
       {'light', 'occurs', 'chemical', 'cells', 'plant', 'some', 'algae', 'photosynthesis', 'which', 'solution', 'process', 'bacteria', 'the', 'form', 'plants', 'of', 'is', 'glucose', 'chloroplasts', 'energy', 'what', 'by', 'in', 'convert', 'into', 'and', 'it'}],
      dtype=object)

In [34]:
# One DataFrame row per record collected in `data`.
df=pd.DataFrame(data)
# Running count of distinct response words, one value per row of `data`.
cumulative_unique_words = calculate_cumulative_unique_words(data)
df['CumulativeUniqueWords'] = cumulative_unique_words
# Create and Show Plots
fig_words = create_plot_unique_words(df)
fig_words.show()

In [24]:
# Scratch cell repeating the first steps of process_logits by hand.
# NOTE(review): `logits` is not assigned in any cell visible here — presumably
# left over from an earlier get_logits call; confirm before re-running.
logits[0][0][0]  # bare expression; its value is not assigned or printed
top_k = 50
probs = F.softmax(logits, dim=-1)  # softmax over the last axis
top_probs, top_indices = torch.topk(probs, top_k)
# NOTE(review): if `logits` is (batch, seq, vocab), top_indices[0] is 2-D and
# each `idx` is a row of top_k ids rather than a single token id — confirm intent.
top_tokens = [tokenizer.decode([idx]) for idx in top_indices[0]]

tensor([[[3.3427e-01, 1.4789e-01, 4.3111e-02, 4.2719e-02, 3.5502e-02,
          3.2048e-02, 3.1531e-02, 2.5469e-02, 1.8263e-02, 1.7655e-02,
          1.1661e-02, 1.0918e-02, 1.0302e-02, 8.9681e-03, 8.7401e-03,
          8.7337e-03, 8.4764e-03, 7.9811e-03, 7.6864e-03, 6.6543e-03,
          6.2434e-03, 6.2003e-03, 5.6525e-03, 5.4469e-03, 5.3056e-03,
          4.9302e-03, 4.7400e-03, 4.4468e-03, 4.2118e-03, 4.0291e-03,
          3.3659e-03, 3.3146e-03, 3.0755e-03, 2.3332e-03, 1.9459e-03,
          1.8016e-03, 1.7752e-03, 1.7748e-03, 1.6478e-03, 1.5621e-03,
          1.4745e-03, 1.4336e-03, 1.2490e-03, 1.1509e-03, 1.1465e-03,
          1.0918e-03, 1.0691e-03, 9.9582e-04, 9.6323e-04, 8.6686e-04],
         [5.5273e-01, 7.2747e-02, 2.3721e-02, 1.2299e-02, 8.9810e-03,
          8.2914e-03, 3.2131e-03, 2.2791e-03, 2.1655e-03, 2.1507e-03,
          2.1275e-03, 1.8822e-03, 1.7277e-03, 1.6439e-03, 1.5787e-03,
          1.5133e-03, 1.1791e-03, 1.1363e-03, 1.0844e-03, 1.0644e-03,
          1.0614e-0