In [1]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
df1 = pd.read_csv("train_ready.csv", sep=",")

In [3]:
df = df1.copy(deep=False)

df

Unnamed: 0,ID,id_season,family,fabric,color_name,length_type,silhouette_type,print_type,moment,num_stores,num_sizes,price,num_week_iso,weekly_demand
0,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,1.0,69.0
1,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,2.0,112.0
2,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,3.0,135.0
3,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,4.0,99.0
4,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,5.0,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81615,12767.0,87.0,Coats,WOVEN,NEGRO,Long,Straight,Sin Estampado,FORMAL WORK,599.0,7.0,159.99,47.0,82.0
81616,12767.0,87.0,Coats,WOVEN,NEGRO,Long,Straight,Sin Estampado,FORMAL WORK,599.0,7.0,159.99,48.0,324.0
81617,12767.0,87.0,Coats,WOVEN,NEGRO,Long,Straight,Sin Estampado,FORMAL WORK,599.0,7.0,159.99,49.0,694.0
81618,12767.0,87.0,Coats,WOVEN,NEGRO,Long,Straight,Sin Estampado,FORMAL WORK,599.0,7.0,159.99,50.0,441.0


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [5]:
X = df.drop(columns=["weekly_demand"])
y = df["weekly_demand"]

cat = X.select_dtypes(["object","category"]).columns
num = X.select_dtypes("number").columns

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", "passthrough", num)
])

model = Pipeline([
    ("prep", pre),
    ("lin", LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(y_pred[:5])


y_pred = model.predict(X_test)
avg_12 = y_pred[:12].mean()
print(avg_12)

[ 253.16301042 1540.19735945  659.26183618 1353.79109723 2991.3682669 ]
1315.5784446472312


In [6]:
!pip install lightgbm



In [7]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

X = df.drop(columns=["weekly_demand", "ID"])
y = df["weekly_demand"]

cat = X.select_dtypes(["object","category"]).columns
num = X.select_dtypes("number").columns

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", "passthrough", num)
])

model = Pipeline([
    ("prep", pre),
    ("lgbm", LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        random_state=42,
        n_jobs=-1
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("MSE:", mean_squared_error(y_test, y_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 739
[LightGBM] [Info] Number of data points in the train set: 65296, number of used features: 186
[LightGBM] [Info] Start training from score 1208.456536
MSE: 732332.6470430971




In [8]:
# training on everything

from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pandas as pd

X = df.drop(columns=["weekly_demand", "ID"])
y = df["weekly_demand"]

cat = X.select_dtypes(["object","category"]).columns
num = X.select_dtypes("number").columns

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", "passthrough", num)
])

model = Pipeline([
    ("prep", pre),
    ("lgbm", LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        random_state=42,
        n_jobs=-1
    ))
])

# fit on ALL training data
model.fit(X, y)

# apply to test.csv
test = pd.read_csv("test_ready.csv")
X_test = test.drop(columns=["ID"])
y_pred = model.predict(X_test)

sub = pd.DataFrame({
    "ID": test["ID"],
    "demand": y_pred
})

sub = sub.groupby("ID", as_index=False, sort=False)["demand"].sum()


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 751
[LightGBM] [Info] Number of data points in the train set: 81620, number of used features: 192
[LightGBM] [Info] Start training from score 1203.458049




In [9]:
sub['ID'] = sub['ID'].astype(str)

In [10]:
(sub < 0).any()

TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
sub[sub < 0] = 0

In [11]:
sub.to_csv("submission.csv", sep=",", index=False)

In [None]:
sub.dtypes

In [None]:
(sub < 0).any()

In [None]:
(sub == 0).any()

In [None]:
sub.isna().any()

In [None]:
sub['demand'] = sub['demand'].replace(0, 1000)