In [1]:
from datetime import datetime
import os, sys
import logging
# logging.basicConfig(level=logging.INFO)
# log = logging.getLogger(os.path.basename(__file__))

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer, BitsAndBytesConfig
from sentence_transformers import SentenceTransformer
import pyarrow.parquet as pq
from pinecone import Pinecone, PodSpec

### Lab on Milestone 2:

In [2]:
DATA_FILE = 'data/train-1-of-2.parquet'

PINECONE_API_KEY = "4eb3035e-8ef3-41cb-b108-28b8b7e250b9"
PINECONE_INDEX_NAME = "rag4fin"

GEN_MODEL = '/home/cdsw/models/Mistral-7B-Instruct-v0.2'
EMB_MODEL = '/home/cdsw/models/all-mpnet-base-v2'

In [3]:
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, model_max_length=8192)
tokenizer.pad_token = tokenizer.eos_token

In [4]:
chat = [
  {"role": "user", "content": "Hello, how are you?"},
  {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
  {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

tokenizer.apply_chat_template(chat, tokenize=False)

"<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s>[INST] I'd like to show off how chat templating works! [/INST]"

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16
)

gen_model = AutoModelForCausalLM.from_pretrained(
    GEN_MODEL,
    quantization_config=bnb_config,
    use_cache=True, # False when training
    # do_sample=True,
    device_map='auto'
)

In [6]:
gen_config = gen_model.generation_config
gen_config.pad_token_id=tokenizer.eos_token_id
gen_config

GenerationConfig {
  "bos_token_id": 1,
  "eos_token_id": 2,
  "pad_token_id": 2
}

In [7]:
class Retriever:
    def __init__(self):
        logging.info('Loading embedding model %s ... ' % EMB_MODEL)
        self.model = SentenceTransformer(EMB_MODEL)
        pc = Pinecone(api_key=PINECONE_API_KEY)
        self.index = pc.Index(PINECONE_INDEX_NAME)
        # self.texts = self.load_texts()
        self.df = pq.read_table(DATA_FILE).to_pandas()

    # def load_texts(self):
    #     with open(DATA_FILE) as f:
    #         lines = f.read().split('\n')
    #     return lines

    def text_id(self, user_input, top_k=3):
        query = self.model.encode([user_input])[0].tolist()
        results = self.index.query(vector=query, top_k=top_k, include_values=True)['matches']
        return [int(r['id'].split('-')[1]) for r in results]

    def get_texts(self, user_input, top_k=3):
        text_ids = self.text_id(user_input, top_k)
        titles, texts = self.df['title'], self.df['text']
        return [(id, titles[id], texts[id]) for id in text_ids]

In [8]:
retriever = Retriever()
retriever.df

Unnamed: 0,title,text,url
0,BRIEF-Bigger Capital Fund Reports An 8 Pct Pas...,"January 2, 2018 / 9:31 PM / Updated 8 minutes ...",https://www.reuters.com/article/brief-bigger-c...
1,Global Markets: Asia shares reach decade top o...,NEW YORK (Reuters) - European stocks closed lo...,https://in.reuters.com/article/global-markets/...
2,Donald Trump is the only person in Washington ...,Fears of a government shutdown coursed through...,https://www.cnbc.com/2018/01/18/donald-trump-t...
3,Actor Casey Affleck withdraws as 2018 Oscar pr...,03 PM / Updated 19 minutes ago Actor Casey Af...,https://www.reuters.com/article/us-oscars-case...
4,EU mulls new link between budget and civic rights,"January 22, 2018 / 7:23 PM / Updated 2 hours a...",https://uk.reuters.com/article/uk-eu-poland-bu...
...,...,...,...
153116,Billie Jean King: International Women's Day sh...,"Following the widespread adoption of the ""Me T...",https://www.cnbc.com/2018/03/08/billie-jean-ki...
153117,Lamb Weston Announces Details of Fiscal 2018 T...,"EAGLE, Idaho--(BUSINESS WIRE)-- Lamb Weston Ho...",http://www.cnbc.com/2018/03/16/business-wire-l...
153118,China says political education for students no...,"March 16, 2018 / 5:03 AM / Updated 2 hours ago...",https://uk.reuters.com/article/uk-china-parlia...
153119,"Blue Cross, Lyft, Walgreens and CVS partner to...","Blue Cross, Lyft, Walgreens and CVS partner to...",https://www.cnbc.com/2018/03/14/blue-cross-lyf...


In [9]:
query = 'What is the revenue of Microsoft in game?'
texts = retriever.get_texts(query, top_k=5)

for id, title, text in texts:
    print(id, title)
    print(text[:100])
    print('=' * 30)

66310 Microsoft beats on bottom line
Microsoft beats on bottom line 1 Hour Ago 02:26 02:26 | 8 Hrs Ago 03:14 03:14 | 10:36 AM ET Tue, 30 
83809 Microsoft Second Quarter Earnings: Xbox, Cortana, and Aquisitions | Fortune
By Jonathan Vanian 8:54 PM EST 
Microsoft’s latest quarterly sales results beat Wall Street’s expect
97787 BRIEF-CI Games Prelim FY Revenue At About 100.5 Million Zlotys
Feb 16 (Reuters) - CI GAMES SA:
* PRELIM FY REVENUE AT ABOUT 100.5 MILLION ZLOTYS Source text for Ei
75550 Microsoft reports better-than-expected quarterly revenue, profit
January 31, 2018 / 9:19 PM / Updated 21 minutes ago Microsoft's cloud computing business grows, stoc
85162 BRIEF-Gold Town Games January Net Revenue SEK 1.3 Mln
Feb 19 (Reuters) - Gold Town Games Ab:
* NET REVENUE IN JANUARY AT SEK 1.3 MILLION Source text for E


In [10]:
prompt_template = '%s\n%s Please answer with no more than 100 words.'
id, title, text = texts[0]

In [11]:
prompt = prompt_template % (text, query)

print('Tokenizing ...')
input_ids = tokenizer([prompt], return_tensors="pt", truncation=True).to('cuda')
streamer = TextStreamer(tokenizer, skip_prompt=True)
print('Start inference:', str(datetime.now()))
_ = gen_model.generate(
    **input_ids,
    streamer=streamer,
    max_new_tokens=200,
    do_sample=False,
    # top_p=0.9,
    # temperature=0.1,
    # early_stopping=True,
    generation_config=gen_config,
)
print('Completed:', str(datetime.now()))

Tokenizing ...
Start inference: 2024-04-14 09:43:48.169931
Microsoft's gaming revenue comes primarily from the sales of its Xbox consoles and games, as well as from its subscription service, Xbox Live. In the most recent quarter, gaming revenue was $3.2 billion, up 32% year over year. This growth was driven by strong sales of Xbox One consoles and the continued growth of Xbox Live subscriptions.</s>
Completed: 2024-04-14 09:43:54.371825


#### Stop criteria

In [12]:
query = 'What do you think about Microsoft and Nvidia?'
texts = retriever.get_texts(query, top_k=3)

for id, title, text in texts:
    print(id, title)
    print(text[:100])
    print('=' * 30)

90569 Nvidia’s Cryptic Road Ahead
0 COMMENTS Nvidia NVDA 6.69% may have only high-quality problems these days, but it still needs to s
37761 Nvidia rallies on driverless car partnership announcement
Nvidia rallies on driverless car partnership announcement 4 Hours Ago The "Squawk on the Street" cre
10590 Why Ethereum's Price Is Still a Boon for Nvidia and AMD | Fortune
Ethereum How Ethereum Is Boosting Nvidia and AMD Sebastian Kastner, lead engineer at HydroMiner GmbH


In [13]:
prompt_template = '%s\n%s Please answer with no more than 150 words.'
text = '\n'.join([text for _, _, text in texts])
prompt = prompt_template % (text, query)
print(prompt[-500:])

rency mining. Higher GPU prices also lower profitability for miners. 
“We are raising estimates meaningfully for both graphics vendors, but sustainability is a long-term issue especially for AMD,” Morgan Stanley analysts wrote Tuesday. 
Ethereum prices were relatively flat at $1,000 while Bitcoin rose to $11,200, up 4%, on Tuesday. Ethereum rose close to $1,400 earlier this year. SPONSORED FINANCIAL CONTENT 
What do you think about Microsoft and Nvidia? Please answer with no more than 150 words.


In [14]:
tokenizer = AutoTokenizer.from_pretrained(GEN_MODEL, model_max_length=8192)
tokenizer.pad_token = tokenizer.eos_token

In [15]:
from transformers.generation.logits_process import LogitsProcessor, LogitsProcessorList

class EosLogitsProcessor(LogitsProcessor):
    def __init__(self):
        self.stop_rate = None
        self.newline_id = 13
        
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        eos = tokenizer.eos_token_id
        if scores[0].argmax().item() != self.newline_id and self.stop_rate is None:
            self.stop_rate = 1.0
        if self.stop_rate is not None:
            if scores[0].argmax().item() == self.newline_id:
                self.stop_rate *= 1.1
            scores[:, eos] = scores[:, eos] * self.stop_rate

        # values, indexes = scores.topk(3)
        # for val, idx in zip(values.view(-1), indexes.view(-1)):
        #     tok = tokenizer.convert_ids_to_tokens(idx.item())
        #     logit = val.item()
        #     print('%s(%s)' % (tok, logit), end=' ')
        # print(scores[0, eos], self.stop_rate)
        
        return scores
    
logits_processor = LogitsProcessorList()
logits_processor.append(EosLogitsProcessor())

In [16]:
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()
streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = gen_model.generate(
    input_ids,
    streamer=streamer,
    logits_processor=logits_processor,
    max_new_tokens=500,
    do_sample=False,
)

# outputs = gen_model.generate(
#     input_ids,
#     logits_processor=logits_processor,
#     max_new_tokens=200,
#     do_sample=False,
# )
# res = tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0][len(prompt):]
# print(res)


Microsoft and Nvidia have a long-standing partnership, with Microsoft using Nvidia GPUs in its Azure cloud platform and Nvidia using Microsoft’s Azure for its own AI research. The partnership was further strengthened in 2017 when Microsoft announced that it would be using Nvidia GPUs in its new data centers, and Nvidia announced that it would be using Microsoft’s Azure for its own AI research. The partnership is expected to continue to grow in the future, with both companies investing in AI and machine learning technologies. Microsoft and Nvidia are also working together on autonomous vehicles, with Nvidia providing the AI computing platform and Microsoft providing the mapping and location data. The partnership is a win-win for both companies, with Microsoft gaining access to Nvidia’s advanced GPUs and Nvidia gaining access to Microsoft’s vast data and cloud resources.</s>


### Lab on Milestone 3:

In [5]:
from transformers import AutoTokenizer, AutoModel

CHAT_MODEL = '/home/cdsw/models/selfrag_llama2_7b'
EMB_MODEL = '/home/cdsw/models/contriever-msmarco'

#### Facebook Contriever and Embedding

In [18]:
tokenizer = AutoTokenizer.from_pretrained(EMB_MODEL)
model = AutoModel.from_pretrained(EMB_MODEL)

In [19]:
type(tokenizer), type(model), model

(transformers.models.bert.tokenization_bert_fast.BertTokenizerFast,
 transformers.models.bert.modeling_bert.BertModel,
 BertModel(
   (embeddings): BertEmbeddings(
     (word_embeddings): Embedding(30522, 768, padding_idx=0)
     (position_embeddings): Embedding(512, 768)
     (token_type_embeddings): Embedding(2, 768)
     (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
     (dropout): Dropout(p=0.1, inplace=False)
   )
   (encoder): BertEncoder(
     (layer): ModuleList(
       (0-11): 12 x BertLayer(
         (attention): BertAttention(
           (self): BertSelfAttention(
             (query): Linear(in_features=768, out_features=768, bias=True)
             (key): Linear(in_features=768, out_features=768, bias=True)
             (value): Linear(in_features=768, out_features=768, bias=True)
             (dropout): Dropout(p=0.1, inplace=False)
           )
           (output): BertSelfOutput(
             (dense): Linear(in_features=768, out_features=768, bias=T

In [20]:
sentences = [
    "Where was Marie Curie born?",
    "Maria Sklodowska, later known as Marie Curie, was born on November 7, 1867.",
    "Born in Paris on 15 May 1859, Pierre Curie was the son of Eugène Curie, a doctor of French Catholic origin from Alsace."
]

# Apply tokenizer
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

# Compute token embeddings
outputs = model(**inputs)

In [21]:
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1066,  0.0164,  0.0547,  ..., -0.0065, -0.0631, -0.0280],
         [ 0.0192, -0.0602,  0.0502,  ...,  0.1185, -0.0204, -0.0111],
         [ 0.1103, -0.0207,  0.0044,  ...,  0.0599, -0.0432,  0.0183],
         ...,
         [ 0.0305, -0.2994, -0.0895,  ...,  0.2829,  0.0329,  0.0951],
         [ 0.0298, -0.3350, -0.0862,  ...,  0.2983,  0.0351,  0.0803],
         [ 0.0299, -0.2928, -0.0951,  ...,  0.2991,  0.0265,  0.0623]],

        [[ 0.0799,  0.0201,  0.0418,  ...,  0.0752,  0.0130,  0.0336],
         [-0.0250,  0.0173,  0.0594,  ...,  0.1173, -0.1175,  0.0543],
         [ 0.0280,  0.0259, -0.0916,  ..., -0.1142, -0.0608,  0.1254],
         ...,
         [ 0.0315, -0.1623, -0.0734,  ...,  0.1735, -0.0453,  0.0775],
         [ 0.0674, -0.1745, -0.0764,  ...,  0.1819, -0.0478,  0.0687],
         [ 0.0397, -0.1430, -0.0823,  ...,  0.1684, -0.0414,  0.0401]],

        [[ 0.0364, -0.0647,  0.0651,  ...,  0.0903,  

In [22]:
mask = inputs['attention_mask']
# type(mask), mask, ~mask[..., None]
mask[..., None].shape, mask[..., None][0]

(torch.Size([3, 30, 1]),
 tensor([[1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0],
         [0]]))

In [23]:
# Mean pooling
def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings

In [24]:
embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
embeddings

tensor([[ 0.0161,  0.0055,  0.0199,  ...,  0.0372, -0.0831, -0.0112],
        [ 0.0037,  0.0346, -0.0131,  ...,  0.0247, -0.1021, -0.0303],
        [-0.0146, -0.0235, -0.0338,  ...,  0.0277, -0.0025, -0.0092]],
       grad_fn=<DivBackward0>)

#### Generation

In [3]:
def format_prompt(input, paragraph=None):
    prompt = "### Instruction:\n{0}\n\n### Response:\n".format(input)
    if paragraph is not None:
        prompt += "[Retrieval]<paragraph>{0}</paragraph>".format(paragraph)
    return prompt

In [26]:
queries = [
    "Leave odd one out: twitter, instagram, whatsapp.",
    "Can you tell me the difference between llamas and alpacas?"
]

prompt = format_prompt(queries[0])
print(prompt)

### Instruction:
Leave odd one out: twitter, instagram, whatsapp.

### Response:



In [6]:
tokenizer = AutoTokenizer.from_pretrained(CHAT_MODEL, model_max_length=8192)
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
chat_model = AutoModelForCausalLM.from_pretrained(
    CHAT_MODEL,
    quantization_config=bnb_config,
    use_cache=True, # False when training
    # do_sample=True,
    device_map='auto'
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
chat_config = chat_model.generation_config
chat_config.pad_token_id=tokenizer.eos_token_id
chat_config

GenerationConfig {
  "bos_token_id": 1,
  "do_sample": true,
  "eos_token_id": 2,
  "max_length": 4096,
  "pad_token_id": 2,
  "temperature": 0.6,
  "top_p": 0.9
}

In [29]:
prompt = format_prompt(queries[1])

print('Tokenizing ...')
input_ids = tokenizer([prompt], return_tensors="pt", truncation=True).to('cuda')
streamer = TextStreamer(tokenizer, skip_prompt=True)
print('Start inference:', str(datetime.now()))
_ = chat_model.generate(
    **input_ids,
    streamer=streamer,
    max_new_tokens=200,
    do_sample=False,
    # top_p=0.9,
    # temperature=0.1,
    # early_stopping=True,
    generation_config=gen_config,
)
print('Completed:', str(datetime.now()))

Tokenizing ...
Start inference: 2024-04-14 09:45:10.511304
Sure![Retrieval]<paragraph>[Irrelevant]Here are some key differences between llamas and alpacas:

1.Appearance:2.3.4.5.[Utility:5]</s>
Completed: 2024-04-14 09:45:13.606554


In [14]:
def generate_with_paragraph(prompt, paragraph=None, max_new_tokens=200):
    prompt = format_prompt(prompt, paragraph)
    print(prompt)
    print('\nTokenizing ...\n')
    input_ids = tokenizer([prompt], return_tensors="pt", truncation=True).to('cuda')
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    print('Start inference:', str(datetime.now()))
    _ = chat_model.generate(
        **input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        # top_p=0.9,
        # temperature=0.1,
        # early_stopping=True,
        generation_config=chat_config,
    )
    print('Completed:', str(datetime.now()))

In [31]:
generate_with_paragraph(
    "Can you tell me the difference between llamas and alpacas?",
    "The alpaca (Lama pacos) is a species of South American camelid mammal. It is similar to, and often confused with, the llama. Alpacas are considerably smaller than llamas, and unlike llamas, they were not bred to be working animals, but were bred specifically for their fiber."
)

### Instruction:
Can you tell me the difference between llamas and alpacas?

### Response:
[Retrieval]<paragraph>The alpaca (Lama pacos) is a species of South American camelid mammal. It is similar to, and often confused with, the llama. Alpacas are considerably smaller than llamas, and unlike llamas, they were not bred to be working animals, but were bred specifically for their fiber.</paragraph>

Tokenizing ...

Start inference: 2024-04-14 09:45:13.693801
[Relevant]Alpacas are considerably smaller than llamas, and unlike llamas, they were not bred to be working animals, but were bred specifically for their fiber.[Fully supported][Utility:5]</s>
Completed: 2024-04-14 09:45:17.029711


In [32]:
generate_with_paragraph(
    'Does Cloudera CDP Base 7.1.7 support the REPL command?',
    'If you want to use REPL commands to replicate Hive ACID tables between CDP Private Cloud Base clusters, ensure that your source cluster is on CDP Private Cloud Base 7.1.8 or a higher version.'
)

### Instruction:
Does Cloudera CDP Base 7.1.7 support the REPL command?

### Response:
[Retrieval]<paragraph>If you want to use REPL commands to replicate Hive ACID tables between CDP Private Cloud Base clusters, ensure that your source cluster is on CDP Private Cloud Base 7.1.8 or a higher version.</paragraph>

Tokenizing ...

Start inference: 2024-04-14 09:45:17.034353
[Relevant]Yes, Cloudera CDP Base 7.1.7 supports the REPL command.[No support / Contradictory][Utility:5]</s>
Completed: 2024-04-14 09:45:19.391306


In [33]:
generate_with_paragraph(
    'What is the difference between Cloudera CDP Base 7.1.7 and 7.1.9?'
)

### Instruction:
What is the difference between Cloudera CDP Base 7.1.7 and 7.1.9?

### Response:


Tokenizing ...

Start inference: 2024-04-14 09:45:19.395783
Here are some of the key differences:

1.[Retrieval]<paragraph>[Irrelevant]This includes the Apache Hadoop framework, Apache Spark, Apache Kafka, and other open-source technologies.

2.3.[Utility:5]</s>
Completed: 2024-04-14 09:45:22.770310


In [16]:
query = 'What is the difference between Cloudera CDP Base 7.1.7 and 7.1.9?'
paragraph = '''What's new in Cloudera Runtime 7.1.9
Understand the functionalities and improvements to features of components in Cloudera Runtime 7.1.9.
Open Data Lakehouse, powered by Apache Iceberg
CDP Private Cloud Base 7.1.9 delivers the hybrid Open Data Lakehouse providing the following benefits:
Open architecture
Cloudera’s Open Data Lakehouse, powered by Apache Iceberg is 100% open—open source, open standards based, and with wide community adoption. It can store multiple data formats and enables multiple engines to work on the same data.
Ease of adoption
By integrating Iceberg right into the Shared Data Experience (SDX) and Apache Ozone, Cloudera offers the easiest path to deploying a lakehouse. Additional capabilities like schema evolution, hidden partition, and more simplify data management for large datasets.'''

In [19]:
generate_with_paragraph(query, paragraph)

### Instruction:
What is Jim Brown's occupation?

### Response:
[Retrieval]<paragraph>Jim Ed Brown  James Edward Brown (April 1, 1934 – June 11, 2015) was an American country singer-songwriter who achieved fame in the 1950s with his two sisters as a member of the Browns. He later had a successful solo career from 1965 to 1974, followed by a string of major duet hits with fellow country music vocalist Helen Cornelius, through 1981. Brown was also the host of the "Country Music Greats Radio Show", a syndicated country music program from Nashville, Tennessee.</paragraph>

Tokenizing ...

Start inference: 2024-04-18 14:52:52.443217
[Relevant]Jim Brown is a former professional American football player and current actor.[Fully supported][Utility:5]</s>
Completed: 2024-04-18 14:52:54.647934


In [18]:
query = "What is Jim Brown's occupation?"
paragraph = '''Jim Ed Brown  James Edward Brown (April 1, 1934 – June 11, 2015) was an American country singer-songwriter who achieved fame in the 1950s with his two sisters as a member of the Browns. He later had a successful solo career from 1965 to 1974, followed by a string of major duet hits with fellow country music vocalist Helen Cornelius, through 1981. Brown was also the host of the "Country Music Greats Radio Show", a syndicated country music program from Nashville, Tennessee.'''

In [17]:
generate_with_paragraph(query, paragraph)

### Instruction:
What is the difference between Cloudera CDP Base 7.1.7 and 7.1.9?

### Response:
[Retrieval]<paragraph>What's new in Cloudera Runtime 7.1.9
Understand the functionalities and improvements to features of components in Cloudera Runtime 7.1.9.
Open Data Lakehouse, powered by Apache Iceberg
CDP Private Cloud Base 7.1.9 delivers the hybrid Open Data Lakehouse providing the following benefits:
Open architecture
Cloudera’s Open Data Lakehouse, powered by Apache Iceberg is 100% open—open source, open standards based, and with wide community adoption. It can store multiple data formats and enables multiple engines to work on the same data.
Ease of adoption
By integrating Iceberg right into the Shared Data Experience (SDX) and Apache Ozone, Cloudera offers the easiest path to deploying a lakehouse. Additional capabilities like schema evolution, hidden partition, and more simplify data management for large datasets.</paragraph>

Tokenizing ...

Start inference: 2024-04-18 14:51:2