# LSTM

In [1]:
import pandas as pd
import os 
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
pd.set_option('display.max_columns', None)
current_dir = os.getcwd()

In [3]:
import pandas as pd
from collections import defaultdict
import math
import os

# Parámetros
chunk_size = 100_000
total_data_path = os.path.join(current_dir, '../data/raw/total_data.csv')
target_rows = 1_000_000
min_items = 10
store_ids = ['CA_1', 'CA_2', 'CA_3', 'CA_4', 'TX_1', 'TX_2', 'TX_3', 'WI_1', 'WI_2', 'WI_3']

# Paso 1: Obtener todas las fechas únicas
all_dates = set()
for chunk in pd.read_csv(total_data_path, chunksize=chunk_size, usecols=['date']):
    all_dates.update(chunk['date'].unique())
all_dates = sorted(list(all_dates))
D = len(all_dates)
print(f"Total de fechas únicas: {D}")

# Paso 2 y 3: Procesar cada tienda
all_filtered_data = []

for store_id_selected in store_ids:
    print(f"\nProcesando tienda: {store_id_selected}")
    item_date_counts = defaultdict(set)

    # Paso 2: Identificar los artículos que aparecen en todas las fechas para esta tienda
    for chunk in pd.read_csv(total_data_path, chunksize=chunk_size, usecols=['item_id', 'date', 'store_id']):
        chunk['store_id'] = chunk['store_id'].astype(str)
        chunk_filtrado = chunk[chunk['store_id'] == store_id_selected]
        for item, date in zip(chunk_filtrado['item_id'], chunk_filtrado['date']):
            item_date_counts[item].add(date)

    # Filtrar productos que están en todas las fechas
    items_in_all_dates = [item for item, dates in item_date_counts.items() if len(dates) == D]
    N = min(min_items, len(items_in_all_dates))
    selected_items = items_in_all_dates[:N]

    print(f"Seleccionados {N} productos que aparecen en todas las fechas.")

    if N == 0:
        print(f"Ningún producto válido en {store_id_selected}. Se omite esta tienda.")
        continue

    # Paso 3: Filtrar el dataset para estos productos y esta tienda
    filtered_data = []
    for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
        chunk['store_id'] = chunk['store_id'].astype(str)
        chunk_filtrado = chunk[
            (chunk['item_id'].isin(selected_items)) &
            (chunk['date'].isin(all_dates)) &
            (chunk['store_id'] == store_id_selected)
        ]
        filtered_data.append(chunk_filtrado)

    final_data_store = pd.concat(filtered_data, ignore_index=True)
    print(f"Filas obtenidas: {len(final_data_store):,}")
    all_filtered_data.append(final_data_store)

# Concatenar datos de todas las tiendas
final_data = pd.concat(all_filtered_data, ignore_index=True)

# Limitar tamaño si excede el objetivo
if len(final_data) > target_rows:
    final_data = final_data.sample(n=target_rows, random_state=42)
    print(f"\nEl dataset ha sido reducido a {target_rows} filas.")

# Verificación final
print(f"\nDataset final combinado:")
print(f"Filas totales: {len(final_data):,}")
print(f"Items únicos: {final_data['item_id'].nunique()}")
print(f"Tiendas únicas: {final_data['store_id'].nunique()}")
print(f"Fechas únicas: {final_data['date'].nunique()}")

Total de fechas únicas: 1941

Procesando tienda: CA_1
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: CA_2
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: CA_3
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: CA_4
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: TX_1
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: TX_2
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: TX_3
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: WI_1
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: WI_2
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Procesando tienda: WI_3
Seleccionados 10 productos que aparecen en todas las fechas.


  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_size):
  for chunk in pd.read_csv(total_data_path, chunksize=chunk_si

  → Filas obtenidas: 19,410

Dataset final combinado:
Filas totales: 194,100
Items únicos: 10
Tiendas únicas: 10
Fechas únicas: 1941


In [4]:
final_data_path = os.path.join(current_dir, '../data/raw/Dataframe_Final_Data_LSTM.csv')

final_data.to_csv(final_data_path, index=False)