In [83]:
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
import seaborn as sns
import matplotlib.pyplot as plt


In [84]:
df1 = pd.read_csv("train_ready.csv", sep=",")

In [85]:
df = df1.copy(deep=False)

df

Unnamed: 0,ID,id_season,family,fabric,color_name,length_type,silhouette_type,print_type,moment,num_stores,num_sizes,price,num_week_iso,weekly_demand
0,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,1.0,69.0
1,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,2.0,112.0
2,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,3.0,135.0
3,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,4.0,99.0
4,1.0,86.0,Dresses,WOVEN,AMARILLO,Long,Evase,Sin Estampado,TIME OFF,152.0,5.0,35.99,5.0,74.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81615,12767.0,87.0,Coats,WOVEN,NEGRO,Long,Straight,Sin Estampado,FORMAL WORK,599.0,7.0,159.99,47.0,82.0
81616,12767.0,87.0,Coats,WOVEN,NEGRO,Long,Straight,Sin Estampado,FORMAL WORK,599.0,7.0,159.99,48.0,324.0
81617,12767.0,87.0,Coats,WOVEN,NEGRO,Long,Straight,Sin Estampado,FORMAL WORK,599.0,7.0,159.99,49.0,694.0
81618,12767.0,87.0,Coats,WOVEN,NEGRO,Long,Straight,Sin Estampado,FORMAL WORK,599.0,7.0,159.99,50.0,441.0


In [86]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

In [87]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

X = df.drop(columns=["weekly_demand"])
y = df["weekly_demand"]

cat = X.select_dtypes(["object","category"]).columns
num = X.select_dtypes("number").columns

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", "passthrough", num)
])

model = Pipeline([
    ("prep", pre),
    ("lin", LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("mse:", mse)
print("rmse:", rmse)
print("mae:", mae)
print("r2:", r2)

mse: 1228122.733710968
rmse: 1108.2069904629586
mae: 684.3001803297099
r2: 0.45895803241179745


In [88]:
!pip install lightgbm



In [89]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

X = df.drop(columns=["weekly_demand","ID"])
y = df["weekly_demand"]

cat = X.select_dtypes(["object","category"]).columns
num = X.select_dtypes("number").columns

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", "passthrough", num)
])

model = Pipeline([
    ("prep", pre),
    ("lgbm", LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        random_state=42,
        n_jobs=-1
    ))
])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test,y_pred)
r2 = r2_score(y_test,y_pred)

print("mse:",mse)
print("rmse:",rmse)
print("mae:",mae)
print("r2:",r2)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 739
[LightGBM] [Info] Number of data points in the train set: 65296, number of used features: 186
[LightGBM] [Info] Start training from score 1208.456536
mse: 732332.6470430971
rmse: 855.7643642049469
mae: 502.6272703834896
r2: 0.6773753262525936




In [90]:
# training on everything

from lightgbm import LGBMRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pandas as pd

X = df.drop(columns=["weekly_demand", "ID"])
y = df["weekly_demand"]

cat = X.select_dtypes(["object","category"]).columns
num = X.select_dtypes("number").columns

pre = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), cat),
    ("num", "passthrough", num)
])

model = Pipeline([
    ("prep", pre),
    ("lgbm", LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        random_state=42,
        n_jobs=-1
    ))
])

# fit on all training data
model.fit(X, y)

# apply to test.csv
test = pd.read_csv("test_ready.csv")
X_test = test.drop(columns=["ID"])
y_pred = model.predict(X_test)

sub = pd.DataFrame({
    "ID": test["ID"],
    "Production": y_pred
})

sub = sub.groupby("ID", as_index=False, sort=False)["Production"].sum()


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004340 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 751
[LightGBM] [Info] Number of data points in the train set: 81620, number of used features: 192
[LightGBM] [Info] Start training from score 1203.458049




In [91]:
sub[sub < 0] = 0

In [92]:
sub["Production"] = sub["Production"] * 1.225

In [93]:
sub.to_csv("submission.csv", sep=",", index=False)

In [94]:
data = {
    "Model": ["Linear Regression", "LightGBM"],
    "MSE": [1225594.8193081915, 732332.6470430971],
    "RMSE": [1107.0658604203236, 855.7643642049469],
    "MAE": [684.1056616683791, 502.6272703834896],
    "R2": [0.4600716896586101, 0.6773753262525936]
}

df_results = pd.DataFrame(data)
df_results

Unnamed: 0,Model,MSE,RMSE,MAE,R2
0,Linear Regression,1225595.0,1107.06586,684.105662,0.460072
1,LightGBM,732332.6,855.764364,502.62727,0.677375
