In [30]:
import os
import pandas as pd
from langchain_community.document_loaders import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [22]:
df = pd.read_csv(r'C:\Users\Diane Alves\Desktop\BACK-DOMROCK\src\data\chat_data.csv')

In [23]:
print(df.columns)

Index(['submission_date', 'reviewer_id', 'product_id', 'product_name',
       'product_brand', 'site_category_lv1', 'site_category_lv2',
       'review_title', 'overall_rating', 'recommend_to_a_friend',
       'review_text', 'reviewer_birth_year', 'reviewer_gender',
       'reviewer_state'],
      dtype='object')


In [24]:
# define as colunas do csv que serão usadas
useful_data = ['product_name', 'product_brand', 'site_category_lv1', 'site_category_lv2', 'overall_rating', 'review_text', 'reviewer_gender', 'submission_date', 'reviewer_birth_year']

In [25]:
# carrega o csv
def load_data(file_path):
    # lê o arquivo csv
    df = pd.read_csv(file_path)
    
    # exclui as colunas que não serão usadas
    df_reduced = df.drop(columns=[col for col in df.columns if col not in useful_data])

    # limpa e normaliza o texto
    for column in useful_data:
        df_reduced[column] = df_reduced[column].apply(lambda x: normalize_text(str(x)))
        df_reduced[column] = df_reduced[column].apply(lambda x: remove_stopwords(str(x)))

    # define o nome e o caminho do arquivo de saída
    result_file_name = 'data_processed.csv'  # nome do arquivo de saída
    new_file_path = os.path.join('data', result_file_name)  # caminho para salvar o arquivo de saída na pasta 'data'
    
    # salva os dados em csv
    df_reduced.to_csv(new_file_path, index=False)

    return new_file_path

In [26]:
# define o nome e o caminho do arquivo de saída
result_file_name = 'data_processed.csv'  # nome do arquivo de saída
new_file_path = os.path.join('data', result_file_name)  # caminho para salvar o arquivo de saída na pasta 

In [27]:
# Retorna o caminho do arquivo gerado
print(f'Arquivo salvo em: {new_file_path}')

Arquivo salvo em: data\data_processed.csv


In [None]:
def load_and_chunk(file_path):
    # gera os documentos com o csv processado
    loader = CSVLoader(file_path=file_path, encoding='utf-8', csv_args={
        'delimiter': ',', 'quotechar': '"', 'fieldnames': useful_data
    }) # classe do langchain para manipular os dados
    # delimiter define o que separa as colunas do csv (no caso, uma vírgula)
    # quotechar define o caractere que envolve strings, permitindo que uma string tenha vírgulas sem ser divididas em colunas
    # fieldnames são os nomes das colunas que serão carregadas

    docs = loader.load() # retorna os documentos gerados

    # chunkeniza e adiciona overlap nos dados
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=150)
    # text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1050, chunk_overlap=250)
    splits = text_splitter.split_documents(docs)

    return splits

In [None]:
# Caminho do arquivo processado (assumindo que você já gerou o arquivo com a função anterior)
processed_file_path = r'C:\Users\Diane Alves\Desktop\BACK-DOMROCK\src\data\chat_data.csv'

# Chama a função e armazena os splits retornados
splits = load_and_chunk(processed_file_path)

# Exibe os resultados
print(splits)

[Document(metadata={'source': 'C:\\Users\\Diane Alves\\Desktop\\BACK-DOMROCK\\src\\data\\chat_data.csv', 'row': 0}, page_content='product_name: submission_date\nproduct_brand: reviewer_id\nsite_category_lv1: product_id\nsite_category_lv2: product_name\noverall_rating: product_brand\nreview_text: site_category_lv1\nreviewer_gender: site_category_lv2\nsubmission_date: review_title\nreviewer_birth_year: overall_rating\nNone: recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state'), Document(metadata={'source': 'C:\\Users\\Diane Alves\\Desktop\\BACK-DOMROCK\\src\\data\\chat_data.csv', 'row': 1}, page_content='product_name: 2018-01-01 00:11:28\nproduct_brand: d0fb1ca69422530334178f5c8624aa7a99da47907c44de0243719b15d50623ce\nsite_category_lv1: 132532965\nsite_category_lv2: Notebook Asus Vivobook Max X541NA-GO472T Intel Celeron Quad Core 4GB 500GB Tela LED 15,6" Windows - 10 Branco\noverall_rating: \nreview_text: Informática\nreviewer_gender: Notebook\nsubmission

In [46]:
# Exibe apenas os primeiros 5 splits
for i, split in enumerate(splits[:5]):
    print(f"Split {i + 1}:\n{split}\n")

Split 1:
page_content='product_name: submission_date
product_brand: reviewer_id
site_category_lv1: product_id
site_category_lv2: product_name
overall_rating: product_brand
review_text: site_category_lv1
reviewer_gender: site_category_lv2
submission_date: review_title
reviewer_birth_year: overall_rating
None: recommend_to_a_friend,review_text,reviewer_birth_year,reviewer_gender,reviewer_state' metadata={'source': 'C:\\Users\\Diane Alves\\Desktop\\BACK-DOMROCK\\src\\data\\chat_data.csv', 'row': 0}

Split 2:
page_content='product_name: 2018-01-01 00:11:28
product_brand: d0fb1ca69422530334178f5c8624aa7a99da47907c44de0243719b15d50623ce
site_category_lv1: 132532965
site_category_lv2: Notebook Asus Vivobook Max X541NA-GO472T Intel Celeron Quad Core 4GB 500GB Tela LED 15,6" Windows - 10 Branco
overall_rating: 
review_text: Informática
reviewer_gender: Notebook
submission_date: Bom
reviewer_birth_year: 4
None: Yes,Estou contente com a compra entrega rápida o único problema com as Americanas é s