Parte 1 - Carregar e preparar os dados

In [None]:
# Connect to Google Drive: mount it at /content/drive so that its files can be
# read and written through regular filesystem paths by the cells below.

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import html
import json
import re

import pandas as pd

In [None]:
# Accumulates one {"title", "content"} record per valid line of the dataset
json_new = []

# Parse the complementary dataset, stored as JSON Lines (one JSON object per line)
with open(r'/content/drive/MyDrive/Colab Notebooks/trn.json', 'r', encoding='utf-8') as file:
    for line in file:
        try:
            item = json.loads(line)
            # Keep only the two fields used by the rest of the notebook
            json_new.append({"title": item["title"], "content": item["content"]})
        except json.JSONDecodeError as e:
            # Malformed line (e.g. a truncated record): report it and keep going
            print(f"Skipping invalid line: {line.strip()} due to error: {e}")
        except (KeyError, TypeError) as e:
            # Valid JSON that is not an object carrying both fields; skipping it
            # avoids aborting the whole load because of a single bad record
            print(f"Skipping incomplete record: {line.strip()} due to error: {e!r}")

df = pd.DataFrame(json_new)
print(df.head())

Skipping invalid line: {"uid": "B005VLMEN4", "title": "", "content": "The GLX 104 series has a destinct copter sytle look with an lightweight ABS thermoplastic resign shell, plush interior and a unique shield and base plate making it one of a kind. A removable neck curtain has also been added for all riding conditions and better comfort in warm and cool climates. It is available in 8 different color options in s due to error: Unterminated string starting at: line 1 column 47 (char 46)
                                               title  \
0                        Girls Ballet Tutu Neon Pink   
1                           Adult Ballet Tutu Yellow   
2  The Way Things Work: An Illustrated Encycloped...   
3                                      Mog's Kittens   
4                              Misty of Chincoteague   

                                             content  
0  High quality 3 layer ballet tutu. 12 inches in...  
1                                                     
2       

In [None]:
# Dimensions of df as (rows, columns)
df.shape

(1499649, 2)

In [None]:
# Preview the first 50 rows of df
df.head(50)

Unnamed: 0,title,content
0,Girls Ballet Tutu Neon Pink,High quality 3 layer ballet tutu. 12 inches in...
1,Adult Ballet Tutu Yellow,
2,The Way Things Work: An Illustrated Encycloped...,
3,Mog's Kittens,Judith Kerr&#8217;s best&#8211;selling adventu...
4,Misty of Chincoteague,
5,Hilda Boswell's treasury of children's stories...,
6,The Simple Truths of Service: Inspired by John...,
7,Girls Ballet Tutu Neon Blue,Dance tutu for girls ages 2-8 years. Perfect f...
8,Evaluating Research in Academic Journals - A P...,
9,Dr. Seuss ABC (Dr.Seuss Classic Collection) (S...,


In [None]:
# Drop rows whose 'title' or 'content' is missing, not a string, or blank.
# .str.strip() yields NaN for null/non-string values; fillna('') turns those into
# empty strings so the same comparison removes them. A bare `!= ''` check would
# keep NaN rows, which later break the string-based cleaning.
has_title = df['title'].str.strip().fillna('') != ''
has_content = df['content'].str.strip().fillna('') != ''
df = df[has_title & has_content]

print('Removida as linhas com valores nulos em title ou content')

# Remove duplicated (title, content) pairs, keeping the first occurrence
df = df.drop_duplicates(subset=['title', 'content'])
print('Removida entradas duplicadas')

# Text normalization: decode HTML entities, lowercase and strip punctuation
def clean_text(text):
    """Return a lowercase, punctuation-free, single-spaced version of ``text``.

    Steps, in order:
      1. Decode HTML entities (``&#8217;``, ``&amp;`` ...) so they are treated as
         the characters they stand for instead of leaving their digits behind.
      2. Lowercase the text.
      3. Delete apostrophes so contractions/possessives stay one word
         ("mog's" -> "mogs").
      4. Replace every other non-word, non-space character with a space so the
         tokens it separated are not glued together ("2-8" -> "2 8").
      5. Collapse runs of whitespace and trim both ends.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string (possibly empty).
    """
    text = html.unescape(text).lower()
    # Straight and typographic apostrophes are removed without leaving a gap
    text = re.sub("['\u2019]", '', text)
    # Remaining punctuation acts as a token separator
    text = re.sub(r'[^\w\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()
print('Removido os caracteres especiais')

# Minimum number of whitespace-separated words a description must exceed to be kept
MIN_CONTENT_WORDS = 5

# Apply the cleaning function to both text columns.
# assign() builds a new DataFrame instead of writing columns into a frame that
# came from boolean filtering, which can raise pandas' SettingWithCopyWarning.
df = df.assign(
    title=df['title'].apply(clean_text),
    content=df['content'].apply(clean_text),
)
print('Aplicada função de limpeza')

# Drop descriptions that are too short (MIN_CONTENT_WORDS words or fewer)
df = df[df['content'].str.split().str.len() > MIN_CONTENT_WORDS]
print('Removida descrições muito curtas')


Removida as linhas com valores nulos em title ou content
Removida entradas duplicadas
Removido os caracteres especiais
Aplicada função de limpeza
Removida descrições muito curtas


In [None]:
# Dimensions of df as (rows, columns)
df.shape

(955366, 2)

In [None]:
# Preview the first 50 rows of df
df.head(50)

Unnamed: 0,title,content
0,girls ballet tutu neon pink,high quality 3 layer ballet tutu 12 inches in ...
3,mogs kittens,judith kerr8217s best8211selling adventures of...
7,girls ballet tutu neon blue,dance tutu for girls ages 28 years perfect for...
12,the prophet,in a distant timeless place a mysterious proph...
16,autumn story brambly hedge,the most researchcrammed fantasy ever set befo...
17,spirit ledmoving by grace in the holy spirits ...,you can flow effortlessly and powerfully in th...
18,the very bad bunny beginner series,by marilyn sadler illustrated by roger bollen
22,nice for mice,jill barklem was born in epping in 1951 after ...
23,the book of revelation,american baptist pastor bible teacher and writ...
28,in the shadow of man,an instant animal classic timeapart from its...


In [None]:
import numpy as np

# Split the DataFrame into 4 consecutive parts of (nearly) equal size.
# The row positions are split rather than the DataFrame itself: handing a DataFrame
# to np.array_split goes through DataFrame.swapaxes, which recent pandas deprecates.
row_chunks = np.array_split(np.arange(len(df)), 4)
df_split = [df.iloc[rows] for rows in row_chunks]

# Save each part to its own JSON Lines file (one record per line, UTF-8 kept as-is)
for i, df_part in enumerate(df_split, start=1):
    output_filename = f'/content/drive/MyDrive/Colab Notebooks/json_parte_{i}.json'
    df_part.to_json(output_filename, orient='records', lines=True, force_ascii=False)
    print(f"Parte {i} dos dados foi salva em '{output_filename}'.")


Parte 1 dos dados foi salva em '/content/drive/MyDrive/Colab Notebooks/json_parte_1.json'.
Parte 2 dos dados foi salva em '/content/drive/MyDrive/Colab Notebooks/json_parte_2.json'.
Parte 3 dos dados foi salva em '/content/drive/MyDrive/Colab Notebooks/json_parte_3.json'.
Parte 4 dos dados foi salva em '/content/drive/MyDrive/Colab Notebooks/json_parte_4.json'.


In [None]:
# Dimensions of df as (rows, columns).
# NOTE(review): the recorded output reports 4 columns and a row count that differs
# from the cleaned frame, yet no visible cell adds columns to df or reloads it.
# A cell may be missing or may have been run out of order -- confirm.
df.shape

(238842, 4)

In [None]:

# Preview the first rows of df.
# NOTE(review): the recorded output shows 'prompt' and 'response' columns that no
# visible cell creates -- confirm where they come from.
df.head()

Unnamed: 0,title,content,prompt,response
0,girls ballet tutu neon pink,high quality 3 layer ballet tutu 12 inches in ...,Qual a descrição em 50 caracteres do produto: ...,high quality 3 layer ballet tutu 12 inches in ...
3,mogs kittens,judith kerr8217s best8211selling adventures of...,Qual a descrição em 50 caracteres do produto: ...,judith kerr8217s best8211selling adventures of...
7,girls ballet tutu neon blue,dance tutu for girls ages 28 years perfect for...,Qual a descrição em 50 caracteres do produto: ...,dance tutu for girls ages 28 years perfect for...
12,the prophet,in a distant timeless place a mysterious proph...,Qual a descrição em 50 caracteres do produto: ...,in a distant timeless place a mysterious proph...
16,autumn story brambly hedge,the most researchcrammed fantasy ever set befo...,Qual a descrição em 50 caracteres do produto: ...,the most researchcrammed fantasy ever set befo...
