In [2]:
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import random

# Predefined list of meaningful sentences
sentences = [
    "The cat sat on the mat.",
    "A dog chased the cat.",
    "The sun rises in the east.",
    "She read a book near the cat.",
    "She played with a red ball sun.",
]

# Function to generate text for each document
def generate_text(sentence_list, num_sentences=1):
    """Generate text by randomly selecting sentences."""
    return " ".join(random.choices(sentence_list, k=num_sentences))

# Generate sample data
num_documents = 5  # Number of documents
data = {
    "id": list(range(1, num_documents + 1)),  # Unique IDs for each document
    "title": [f"Doc{i}" for i in range(1, num_documents + 1)],  # Titles
    "text": [generate_text(sentences) for _ in range(num_documents)]  # Texts
}

# Create a DataFrame
df = pd.DataFrame(data)
df


Unnamed: 0,id,title,text
0,1,Doc1,She played with a red ball sun.
1,2,Doc2,A dog chased the cat.
2,3,Doc3,The sun rises in the east.
3,4,Doc4,A dog chased the cat.
4,5,Doc5,She played with a red ball sun.


In [3]:
# Ensure text length is less than 7 words (truncate if necessary)
df['text'] = df['text'].apply(lambda x: " ".join(x.split()[:6]))

# Convert DataFrame to PyArrow Table
table = pa.Table.from_pandas(df)

# Write the table to a Parquet file
pq.write_table(table, "test.parquet")

print("test.parquet file has been generated successfully.")

test.parquet file has been generated successfully.
