In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#Rutas
ruta_datos = '/content/drive/MyDrive/Laboratorio_3/data/sell-in.txt'
ruta_productos = '/content/drive/MyDrive/Laboratorio_3/data/tb_productos.txt'
ruta_stocks = '/content/drive/MyDrive/Laboratorio_3/data/tb_stocks.txt'
ruta_predecir = '/content/drive/MyDrive/Laboratorio_3/data/product_id_apredecir201912.txt'

#Leer archivos
df = pd.read_csv(ruta_datos, sep='\t', decimal='.')
df_productos = pd.read_csv(ruta_productos, sep='\t', decimal='.')
df_stocks = pd.read_csv(ruta_stocks, sep='\t', decimal='.')
df_predecir = pd.read_csv(ruta_predecir, sep='\t')

In [5]:
#Convertir 'periodo' de formato YYYYMM en df
df['periodo'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')

#Crear columna periodo_ym
df['periodo_ym'] = df['periodo'].dt.strftime('%Y%m').astype(int)

# Regresi√≥n Lineal

In [6]:
# Cargar dataset de sell-in
ruta_datos = '/content/drive/MyDrive/Laboratorio_3/data/sell-in.txt'
df = pd.read_csv(ruta_datos, sep='\t', decimal='.')

# Convertir campo 'periodo' a datetime y crear campo periodo_ym
df['periodo'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')
df['periodo_ym'] = df['periodo'].dt.strftime('%Y%m').astype(int)


In [7]:
# Sumar tn por product_id y periodo
df_grouped = df.groupby(['product_id', 'periodo_ym'])['tn'].sum().reset_index()

# Pivotear para que cada fila tenga un producto y columnas de meses
df_pivot = df_grouped.pivot(index='product_id', columns='periodo_ym', values='tn')

# Asegurarse que las columnas (periodos) est√©n ordenadas cronol√≥gicamente
periodos = sorted(df_pivot.columns.tolist())

# Generar variables lag tn, tn_1, ..., tn_11
for lag in range(12):
    df_pivot[f'tn_{lag}'] = df_pivot[periodos].shift(lag, axis=1).iloc[:, -1]

# Calcular campo target (clase): tn en periodo +2 (mes futuro)
df_pivot['clase'] = df_pivot[periodos].shift(-2, axis=1).iloc[:, -1]

# Seleccionar registros del periodo 201812 como base
periodo_train = 201812
X = df_pivot.loc[:, [f'tn_{i}' for i in range(12)]].loc[df_pivot[periodo_train].notna()]
y = df_pivot['clase'].loc[X.index]




In [8]:
# Lista de registros m√°gicos
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

# Selecci√≥n de features y clase
features = [f'tn_{i}' for i in range(12)]
X_magicos = df_pivot.loc[df_pivot.index.isin(magicos), features]
y_magicos = df_pivot.loc[df_pivot.index.isin(magicos), 'clase']

# Eliminar registros con NaN
validos = X_magicos.dropna().index.intersection(y_magicos.dropna().index)
X_train = X_magicos.loc[validos]
y_train = y_magicos.loc[validos]

print(f"üßô‚Äç‚ôÇÔ∏è Registros m√°gicos v√°lidos para entrenamiento: {X_train.shape[0]}")


üßô‚Äç‚ôÇÔ∏è Registros m√°gicos v√°lidos para entrenamiento: 0


In [9]:
# Revisar cu√°ntos m√°gicos tienen datos completos
completos = []
for pid in magicos:
    fila = df_pivot.loc[pid, features + ['clase']]
    if fila.notna().all():
        completos.append(pid)

print(f"üéØ Registros m√°gicos realmente completos: {len(completos)}")
print(completos)


üéØ Registros m√°gicos realmente completos: 0
[]


In [None]:
# Entrenar modelo
modelo = LinearRegression()
modelo.fit(X_train, y_train)

# Mostrar coeficientes
coeficientes = pd.DataFrame({
    'coeficiente': ['intercept'] + list(X_train.columns),
    'valor': [modelo.intercept_] + list(modelo.coef_)
})
print(coeficientes)


ValueError: Found array with 0 sample(s) (shape=(0, 12)) while a minimum of 1 is required by LinearRegression.

In [None]:
# Dataset con datos completos de tn a tn_11
features = ['tn_'+str(i) for i in range(12)]
df_features = df_pivot[features]
df_features = df_features.dropna()

# Predicci√≥n con modelo entrenado
predicciones = modelo.predict(df_features)
df_resultado = pd.DataFrame({'product_id': df_features.index, 'tn_pred': predicciones})


In [None]:
# Calcular promedio de los registros incompletos
faltantes = df_pivot[features].isna().any(axis=1)
media_pred = df_pivot.loc[faltantes, periodo_train + 1].mean()

# Crear df con completados + promedio para incompletos
df_incompletos = pd.DataFrame({'product_id': df_pivot.index[faltantes], 'tn_pred': media_pred})
df_final = pd.concat([df_resultado, df_incompletos], axis=0).sort_values('product_id').reset_index(drop=True)


In [None]:
output_path = '/content/drive/MyDrive/Laboratorio_3/prediccion_regresion_lineal.csv'
df_final.to_csv(output_path, index=False, sep=',', decimal='.')
print(f"‚úÖ Exportado a {output_path} con {len(df_final)} registros.")


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# ====================
# üì• CARGA DE DATOS
# ====================
ruta_datos = '/content/drive/MyDrive/Laboratorio_3/data/sell-in.txt'
df = pd.read_csv(ruta_datos, sep='\t', decimal='.')

df['periodo'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')
df['periodo_ym'] = df['periodo'].dt.strftime('%Y%m').astype(int)

# ============================
# üß± TRANSFORMACI√ìN A MATRIZ
# ============================
df_grouped = df.groupby(['product_id', 'periodo_ym'])['tn'].sum().reset_index()
df_pivot = df_grouped.pivot(index='product_id', columns='periodo_ym', values='tn')
df_pivot = df_pivot.sort_index(axis=1)  # ordenar por periodo

# ============================
# üîÅ CREAR LAGS tn_0 a tn_11
# ============================
periodos = df_pivot.columns.tolist()

for i in range(12):
    df_pivot[f'tn_{i}'] = df_pivot.shift(i, axis=1).iloc[:, -1]

# ============================
# üéØ CREAR TARGET: tn del mes +2
# ============================
df_pivot['clase'] = df_pivot.shift(-2, axis=1).iloc[:, -1]

# ============================
# üßô‚Äç‚ôÇÔ∏è USAR REGISTROS M√ÅGICOS
# ============================
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20008, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

features = [f'tn_{i}' for i in range(12)]

# Validar cu√°les tienen info completa
completos = []
for pid in magicos:
    fila = df_pivot.loc[pid, features + ['clase']]
    if fila.notna().all():
        completos.append(pid)

print(f"üéØ Registros m√°gicos realmente completos: {len(completos)}")
print(completos)

# ============================
# üìä ENTRENAR REGRESI√ìN LINEAL
# ============================
if len(completos) > 0:
    X_train = df_pivot.loc[completos, features]
    y_train = df_pivot.loc[completos, 'clase']

    modelo = LinearRegression()
    modelo.fit(X_train, y_train)

    coeficientes = pd.DataFrame({
        'coeficiente': ['intercept'] + features,
        'valor': [modelo.intercept_] + list(modelo.coef_)
    })
    print("\nüìà Coeficientes del modelo:")
    print(coeficientes)
else:
    print("‚ùå No hay registros m√°gicos con datos completos para entrenar.")


üéØ Registros m√°gicos realmente completos: 0
[]
‚ùå No hay registros m√°gicos con datos completos para entrenar.


In [None]:
print("Columnas disponibles (periodo):")
print([c for c in df_pivot.columns if isinstance(c, int)])


Columnas disponibles (periodo):
[201701, 201702, 201703, 201704, 201705, 201706, 201707, 201708, 201709, 201710, 201711, 201712, 201801, 201802, 201803, 201804, 201805, 201806, 201807, 201808, 201809, 201810, 201811, 201812, 201901, 201902, 201903, 201904, 201905, 201906, 201907, 201908, 201909, 201910, 201911, 201912]


In [None]:
pid = 20002  # prob√° con otros si quer√©s
print(df_pivot.loc[pid])


periodo_ym
201701     550.15707
201702     505.88633
201703     834.73521
201704     522.35365
201705     843.43785
201706     968.15756
201707     845.39319
201708     619.71078
201709    1065.34529
201710     857.45269
201711     750.41853
201712     820.58984
201801     984.80167
201802     712.00087
201803     966.86044
201804     999.20934
201805    1103.39191
201806    1033.82845
201807     977.40239
201808    1161.88430
201809     954.23575
201810    1378.49032
201811    1766.81068
201812    1009.45458
201901    1266.78751
201902    1043.01349
201903    1083.62552
201904    1287.62346
201905    1034.98927
201906     928.36431
201907    1066.44999
201908     813.78215
201909    1090.18771
201910    1979.53635
201911    1423.57739
201912    1087.30855
tn_0      1087.30855
tn_1      1087.30855
tn_2      1087.30855
tn_3      1087.30855
tn_4      1087.30855
tn_5      1087.30855
tn_6      1087.30855
tn_7      1087.30855
tn_8      1087.30855
tn_9      1087.30855
tn_10     1087.30855
tn

In [None]:
# ================================
# üîç Detectar registros con historia completa
# ================================
features = [f'tn_{i}' for i in range(12)]

# Recalcular lags y clase (por si se reinicia)
df_grouped = df.groupby(['product_id', 'periodo_ym'])['tn'].sum().reset_index()
df_pivot = df_grouped.pivot(index='product_id', columns='periodo_ym', values='tn').sort_index(axis=1)
periodos = df_pivot.columns.tolist()

for i in range(12):
    df_pivot[f'tn_{i}'] = df_pivot.shift(i, axis=1).iloc[:, -1]

df_pivot['clase'] = df_pivot.shift(-2, axis=1).iloc[:, -1]

# Filtrar registros completos
df_entrenamiento = df_pivot[features + ['clase']].dropna()

print(f"‚úÖ Nuevos registros completos detectados: {df_entrenamiento.shape[0]}")


‚úÖ Nuevos registros completos detectados: 0


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# =========================
# CARGA DE DATOS
# =========================
ruta_datos = '/content/drive/MyDrive/Laboratorio_3/data/sell-in.txt'
df = pd.read_csv(ruta_datos, sep='\t', decimal='.')
df['periodo'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')
df['periodo_ym'] = df['periodo'].dt.strftime('%Y%m').astype(int)

# =========================
# Par√°metros
# =========================
periodo_base = 201812
periodos_lag = [periodo_base - i for i in range(12)]
periodo_clase = 201902

# =========================
# Generar matriz por producto
# =========================
df_grouped = df.groupby(['product_id', 'periodo_ym'])['tn'].sum().reset_index()
df_pivot = df_grouped.pivot(index='product_id', columns='periodo_ym', values='tn')

# Filtrar solo los per√≠odos relevantes
periodos_usar = periodos_lag + [periodo_clase]
df_subset = df_pivot[periodos_usar].dropna()

# Renombrar columnas para usar como features
df_subset = df_subset.rename(columns={k: f'tn_{i}' for i, k in enumerate(periodos_lag)})
df_subset = df_subset.rename(columns={periodo_clase: 'clase'})

# =========================
# Entrenar modelo
# =========================
features = [f'tn_{i}' for i in range(12)]
X_train = df_subset[features]
y_train = df_subset['clase']

modelo = LinearRegression()
modelo.fit(X_train, y_train)

# Mostrar coeficientes
coeficientes = pd.DataFrame({
    'coeficiente': ['intercept'] + features,
    'valor': [modelo.intercept_] + list(modelo.coef_)
})
print("üìà Coeficientes del modelo:")
print(coeficientes)


üìà Coeficientes del modelo:
   coeficiente     valor
0    intercept  2.001493
1         tn_0  0.281376
2         tn_1  0.143025
3         tn_2  0.178707
4         tn_3 -0.029380
5         tn_4 -0.185506
6         tn_5 -0.044007
7         tn_6  0.065518
8         tn_7  0.038292
9         tn_8  0.179474
10        tn_9  0.031348
11       tn_10  0.111347
12       tn_11  0.096130


In [None]:
# =========================
# Cargar lista de productos a predecir
# =========================
ruta_predecir = '/content/drive/MyDrive/Laboratorio_3/data/product_id_apredecir201912.txt'
df_apredecir = pd.read_csv(ruta_predecir, sep='\t')

# =========================
# Extraer √∫ltimos 12 meses (hasta 201912 hacia atr√°s)
# =========================
periodo_pred = 201912
lags_pred = [periodo_pred - i for i in range(12)]  # de tn_0 a tn_11

df_features = df_pivot[lags_pred].copy()
df_features.columns = [f'tn_{i}' for i in range(12)]  # renombrar como features

# Unir con lista de product_id
df_pred = df_apredecir.merge(df_features, on='product_id', how='left')

# Separar registros completos e incompletos
completos = df_pred[features].notna().all(axis=1)
incompletos = ~completos

# =========================
# Predecir con modelo
# =========================
df_pred['tn_pred'] = np.nan
df_pred.loc[completos, 'tn_pred'] = modelo.predict(df_pred.loc[completos, features])

# Fallback: promedio de predichos completos
media_fallback = df_pred.loc[completos, 'tn_pred'].mean()
df_pred['tn_pred'] = df_pred['tn_pred'].fillna(media_fallback)

# =========================
# Exportar resultados
# =========================
output_path = '/content/drive/MyDrive/Laboratorio_3/regresion-prediccion-febrero2020.csv'
df_pred[['product_id', 'tn_pred']].to_csv(output_path, index=False, sep=',', decimal='.')

print(f"‚úÖ Exportado correctamente: {len(df_pred)} registros")
print(f"üìç Ruta: {output_path}")


‚úÖ Exportado correctamente: 780 registros
üìç Ruta: /content/drive/MyDrive/Laboratorio_3/regresion-prediccion-febrero2020.csv


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# =========================================
# üîÅ Recalcular lags y clase desde cero
# =========================================
features = [f'tn_{i}' for i in range(12)]

# Sumar tn por producto y periodo
df_grouped = df.groupby(['product_id', 'periodo_ym'])['tn'].sum().reset_index()

# Pivotear a formato ancho: filas = producto, columnas = meses
df_pivot = df_grouped.pivot(index='product_id', columns='periodo_ym', values='tn').sort_index(axis=1)

# Extraer los nombres de los per√≠odos ordenados (columna m√°s vieja a m√°s nueva)
periodos = df_pivot.columns.tolist()

# A partir de la columna 201801, generar lags hacia atr√°s (tn_0 es 201812)
for i in range(12):
    df_pivot[f'tn_{i}'] = df_pivot.iloc[:, -3 - i]  # 201812 = -3 ‚Üí tn_0

# Crear campo clase: tn del per√≠odo 201902 (es la √∫ltima columna)
df_pivot['clase'] = df_pivot.iloc[:, -1]  # tn en 201902

# =========================================
# ‚úÖ Filtrar registros con datos completos
# =========================================
columnas_necesarias = features + ['clase']
df_entrenamiento = df_pivot[columnas_necesarias].dropna()

print(f"‚úÖ Registros completos disponibles para entrenamiento: {df_entrenamiento.shape[0]}")

# =========================================
# ü§ñ Entrenar regresi√≥n lineal
# =========================================
if df_entrenamiento.shape[0] > 0:
    X_train = df_entrenamiento[features]
    y_train = df_entrenamiento['clase']

    modelo = LinearRegression()
    modelo.fit(X_train, y_train)

    # Mostrar coeficientes
    coeficientes = pd.DataFrame({
        'coeficiente': ['intercept'] + features,
        'valor': [modelo.intercept_] + list(modelo.coef_)
    })

    print("\nüìã Coeficientes del modelo:")
    print(coeficientes)
else:
    print("‚ùå No hay registros completos para entrenar.")


‚úÖ Registros completos disponibles para entrenamiento: 952

üìã Coeficientes del modelo:
   coeficiente     valor
0    intercept  0.000000
1         tn_0  0.083333
2         tn_1  0.083333
3         tn_2  0.083333
4         tn_3  0.083333
5         tn_4  0.083333
6         tn_5  0.083333
7         tn_6  0.083333
8         tn_7  0.083333
9         tn_8  0.083333
10        tn_9  0.083333
11       tn_10  0.083333
12       tn_11  0.083333


In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# ================================
# üîç Detectar registros con historia completa
# ================================

# Definir los 12 features
features = [f'tn_{i}' for i in range(12)]

# Aseguramos recalcular bien el pivot por si cambi√≥ el dataset original
df_grouped = df.groupby(['product_id', 'periodo_ym'])['tn'].sum().reset_index()

# Crear la tabla pivote ordenada por periodo
df_pivot = df_grouped.pivot(index='product_id', columns='periodo_ym', values='tn').sort_index(axis=1)

# Definir el periodo final de an√°lisis
periodo_final = 201812
periodos = df_pivot.columns.tolist()

# Detectar la posici√≥n de 201812 y calcular los 12 meses anteriores
pos_final = periodos.index(periodo_final)
lags = periodos[pos_final - 11: pos_final + 1]  # tn_11 a tn_0 (inclusive)
clase_periodo = periodos[pos_final + 2]         # tn de periodo +2 => 201902

# Confirmar
print(f"‚úÖ Usando lags de: {lags[0]} a {lags[-1]}")
print(f"üéØ Clase objetivo (mes +2): {clase_periodo}")

# Crear columnas tn_0 a tn_11
for i, periodo in enumerate(reversed(lags)):
    df_pivot[f'tn_{i}'] = df_pivot[periodo]

# Crear clase
df_pivot['clase'] = df_pivot[clase_periodo]

# Dataset de entrenamiento final
df_entrenamiento = df_pivot[features + ['clase']].dropna()

print(f"‚úÖ Registros completos disponibles para entrenamiento: {df_entrenamiento.shape[0]}")

# ================================
# ü§ñ Entrenar modelo
# ================================

X_train = df_entrenamiento[features]
y_train = df_entrenamiento['clase']

modelo = LinearRegression()
modelo.fit(X_train, y_train)

# ================================
# üìä Mostrar coeficientes
# ================================

coeficientes = pd.DataFrame({
    'coeficiente': ['intercept'] + features,
    'valor': [modelo.intercept_] + list(modelo.coef_)
})

print("\nüìÑ Coeficientes del modelo:")
print(coeficientes)


‚úÖ Usando lags de: 201801 a 201812
üéØ Clase objetivo (mes +2): 201902
‚úÖ Registros completos disponibles para entrenamiento: 755

üìÑ Coeficientes del modelo:
   coeficiente     valor
0    intercept  2.001493
1         tn_0  0.281376
2         tn_1  0.143025
3         tn_2  0.178707
4         tn_3 -0.029380
5         tn_4 -0.185506
6         tn_5 -0.044007
7         tn_6  0.065518
8         tn_7  0.038292
9         tn_8  0.179474
10        tn_9  0.031348
11       tn_10  0.111347
12       tn_11  0.096130


In [None]:
# ============================
# üìÇ Cargar dataset
# ============================
import pandas as pd
from sklearn.linear_model import LinearRegression

df = pd.read_csv('/content/drive/MyDrive/Laboratorio_3/dataset_entrenamiento.csv')

# ============================
# üîÅ Recalcular lags y clase
# ============================
df_grouped = df.groupby(['product_id', 'periodo_ym'])['tn'].sum().reset_index()
df_pivot = df_grouped.pivot(index='product_id', columns='periodo_ym', values='tn').sort_index(axis=1)

# Lags
for i in range(12):
    df_pivot[f'tn_{i}'] = df_pivot.shift(i, axis=1).iloc[:, -1]

# Clase = tn del per√≠odo objetivo (mes +2 ‚Üí 201902)
df_pivot['clase'] = df_pivot.shift(-2, axis=1).iloc[:, -1]

# ============================
# üéØ Filtrar registros completos (para entrenamiento)
# ============================
features = [f'tn_{i}' for i in range(12)]
df_entrenamiento = df_pivot[features + ['clase']].dropna()

print(f"‚úÖ Registros completos disponibles para entrenamiento: {df_entrenamiento.shape[0]}")

# ============================
# üß† Entrenar modelo
# ============================
X_train = df_entrenamiento[features]
y_train = df_entrenamiento['clase']

modelo = LinearRegression()
modelo.fit(X_train, y_train)

# Coeficientes
coeficientes = pd.DataFrame({
    'coeficiente': ['intercept'] + features,
    'valor': [modelo.intercept_] + list(modelo.coef_)
})
print("\nüìÑ Coeficientes del modelo:")
print(coeficientes)

# ============================
# üîÆ Predecir para 201812
# ============================
# Usar los registros que tengan lags desde 201801 a 201812
periodos_lags = list(sorted(df_pivot.columns[:12]))  # 201801‚Äì201812
validos = df_pivot[features].dropna()
X_pred = validos[features]

df_pred = validos[['product_id']].copy()
df_pred['tn_pred'] = modelo.predict(X_pred)

# ============================
# üíæ Exportar resultados
# ============================
output_path = '/content/drive/MyDrive/Laboratorio_3/regresion-lineal.csv'
df_pred[['product_id', 'tn_pred']].to_csv(output_path, index=False, sep=',', decimal='.')


print(f"\n‚úÖ Exportado correctamente: {len(df_pred)} registros")
print(f"üìç Ruta: {output_path}")



FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/Laboratorio_3/dataset_entrenamiento.csv'

In [None]:
# =========================
# ‚öôÔ∏è Preparar el dataset
# =========================

# Creamos las features: tn_0 a tn_11 (de 201812 hacia atr√°s)
lags = [f"tn_{i}" for i in range(12)]
df_lags = df.copy()

for i in range(12):
    df_lags[f"tn_{i}"] = df_lags.groupby("product_id")["tn"].shift(i)

# Creamos la variable objetivo: clase = tn en periodo +2
df_lags["clase"] = df_lags.groupby("product_id")["tn"].shift(-2)

# Filtramos registros del per√≠odo base: 201812
df_train = df_lags[df_lags["periodo"] == 201812].copy()

# Eliminamos registros con NaN
df_train = df_train.dropna(subset=lags + ["clase"])

# =========================
# üìà Entrenar modelo
# =========================

from sklearn.linear_model import LinearRegression

X = df_train[lags]
y = df_train["clase"]

modelo = LinearRegression()
modelo.fit(X, y)

# Mostramos coeficientes
import pandas as pd

coefs = pd.DataFrame({
    "coeficiente": ["intercept"] + lags,
    "valor": [modelo.intercept_] + list(modelo.coef_)
})
print("üìã Coeficientes del modelo:")
print(coefs)

# =========================
# üìä Predicci√≥n
# =========================

# Aplicamos el modelo sobre los 780 registros a predecir
df_pred = df_lags[df_lags["periodo"] == 201912].copy()
df_pred = df_pred.dropna(subset=lags)

X_pred = df_pred[lags]
df_pred["tn_pred"] = modelo.predict(X_pred)

# =========================
# üíæ Exportar resultados
# =========================

output_path = '/content/drive/MyDrive/Laboratorio_3/regresion-prediccion-febrero2020.csv'
df_pred[['product_id', 'tn_pred']].to_csv(output_path, index=False, sep=';', decimal='.')

print(f"\n‚úÖ Exportado correctamente: {len(df_pred)} registros")
print(f"üìç Ruta: {output_path}")


ValueError: Found array with 0 sample(s) (shape=(0, 12)) while a minimum of 1 is required by LinearRegression.

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# =========================
# CARGA DE DATOS
# =========================
ruta_datos = '/content/drive/MyDrive/Laboratorio_3/data/sell-in.txt'
df = pd.read_csv(ruta_datos, sep='\t', decimal='.')
df['periodo'] = pd.to_datetime(df['periodo'].astype(str), format='%Y%m')
df['periodo_ym'] = df['periodo'].dt.strftime('%Y%m').astype(int)

# =========================
# Filtrado de 33 productos m√°gicos
# =========================
magicos = [
    20002, 20003, 20006, 20010, 20011, 20018, 20019, 20021,
    20026, 20028, 20035, 20039, 20042, 20044, 20045, 20046, 20049,
    20051, 20052, 20053, 20055, 20061, 20001, 20017, 20086, 20180,
    20193, 20320, 20532, 20612, 20637, 20807, 20838
]

# =========================
# Armar matriz producto vs mes
# =========================
df_grouped = df.groupby(['product_id', 'periodo_ym'])['tn'].sum().reset_index()
df_pivot = df_grouped.pivot(index='product_id', columns='periodo_ym', values='tn')

# =========================
# Entrenamiento sobre m√°gicos (usando periodo base 201812 hacia atr√°s)
# =========================
periodo_base = 201812
periodo_clase = 201902
lags = [periodo_base - i for i in range(12)]  # tn_0 a tn_11
cols_usar = lags + [periodo_clase]

df_subset = df_pivot.loc[magicos, cols_usar].dropna()
df_subset = df_subset.rename(columns={k: f'tn_{i}' for i, k in enumerate(lags)})
df_subset = df_subset.rename(columns={periodo_clase: 'clase'})

features = [f'tn_{i}' for i in range(12)]
X_train = df_subset[features]
y_train = df_subset['clase']

modelo = LinearRegression()
modelo.fit(X_train, y_train)

# Mostrar coeficientes
coeficientes = pd.DataFrame({
    'coeficiente': ['intercept'] + features,
    'valor': [modelo.intercept_] + list(modelo.coef_)
})
print("üìà Coeficientes del modelo:")
print(coeficientes)

# =========================
# Predicci√≥n para productos de 201912
# =========================
ruta_predecir = '/content/drive/MyDrive/Laboratorio_3/data/product_id_apredecir201912.txt'
df_apredecir = pd.read_csv(ruta_predecir, sep='\t')

periodo_pred = 201812
lags_pred = [periodo_pred - i for i in range(12)]

df_features = df_pivot[lags_pred].copy()
df_features.columns = [f'tn_{i}' for i in range(12)]

df_pred = df_apredecir.merge(df_features, on='product_id', how='left')

completos = df_pred[features].notna().all(axis=1)
incompletos = ~completos

df_pred['tn_pred'] = np.nan
df_pred.loc[completos, 'tn_pred'] = modelo.predict(df_pred.loc[completos, features])

media_fallback = df_pred.loc[completos, 'tn_pred'].mean()
df_pred['tn_pred'] = df_pred['tn_pred'].fillna(media_fallback)

# =========================
# Exportar resultados
# =========================
output_path = '/content/drive/MyDrive/Laboratorio_3/regresion2-prediccion-febrero2020.csv'
df_pred[['product_id', 'tn_pred']].to_csv(output_path, index=False, sep=',', decimal='.')

print(f"‚úÖ Exportado correctamente: {len(df_pred)} registros")
print(f"üìç Ruta: {output_path}")


üìà Coeficientes del modelo:
   coeficiente     valor
0    intercept  1.063187
1         tn_0  0.003748
2         tn_1  0.251661
3         tn_2  0.167069
4         tn_3 -0.042321
5         tn_4 -0.172354
6         tn_5 -0.005037
7         tn_6  0.136016
8         tn_7  0.037226
9         tn_8  0.143196
10        tn_9  0.110305
11       tn_10  0.114931
12       tn_11  0.070633
‚úÖ Exportado correctamente: 780 registros
üìç Ruta: /content/drive/MyDrive/Laboratorio_3/regresion2-prediccion-febrero2020.csv
