<a href="https://colab.research.google.com/github/animeqvin/Kumis/blob/main/Business_kumis_pred.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

In [43]:
def load_data():
    df = pd.read_excel('Kumis_Sales_combined.xlsx')
    return df

In [44]:
def preprocess_data(df):

    if not pd.api.types.is_datetime64_any_dtype(df['Date']):
        df['Date'] = pd.to_datetime(df['Date'], origin='1899-12-30', unit='D')


    df['Year'] = df['Date'].dt.year
    df['Month'] = df['Date'].dt.month
    df['DayOfMonth'] = df['Date'].dt.day
    df['DayOfYear'] = df['Date'].dt.dayofyear


    le = LabelEncoder()
    df['DayOfWeek_Encoded'] = le.fit_transform(df['Day_of_Week'])


    df['Sales_Lag1'] = df['Kumis_Sales_Volume_L'].shift(1)
    df['Sales_Lag2'] = df['Kumis_Sales_Volume_L'].shift(2)


    df = df.ffill()


    features = ['Sales_Lag1', 'Sales_Lag2', 'Avg_Daily_Temp_C', 'Holiday_KZ',
                'Year', 'Month', 'DayOfMonth', 'DayOfWeek_Encoded', 'DayOfYear']
    target = 'Kumis_Sales_Volume_L'

    return df[features], df[target], le


In [45]:
def calculate_accuracy(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    mean_actual = np.mean(y_true)
    percentage_error = (mae / mean_actual) * 100
    percentage_accuracy = 100 - percentage_error

    print("\n" + "="*50)
    print("MODEL PERFORMANCE METRICS")
    print("="*50)
    print(f"Mean Absolute Error (MAE): {mae:.2f} liters")
    print(f"Root Mean Squared Error (RMSE): {rmse:.2f} liters")
    print(f"R-squared (R²) Score: {r2:.4f}")
    print(f"Percentage Error: {percentage_error:.2f}%")
    print(f"Percentage Accuracy: {percentage_accuracy:.2f}%")
    print("="*50)


    return {
        'MAE': mae,
        'RMSE': rmse,
        'R2': r2,
        'Percentage_Error': percentage_error,
        'Percentage_Accuracy': percentage_accuracy
    }

In [46]:
def train_model(X, y):

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


    trend_model = LinearRegression()
    trend_model.fit(X_train[['Year']], y_train)
    y_trend_pred = trend_model.predict(X_train[['Year']])
    residuals = y_train - y_trend_pred


    param_grid = {
        'n_estimators': [50, 100],
        'max_depth': [3, 5, 7],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2]
    }
    rf = RandomForestRegressor(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train.drop('Year', axis=1), residuals)
    best_rf_model = grid_search.best_estimator_


    y_trend_test_pred = trend_model.predict(X_test[['Year']])
    residual_test_pred = best_rf_model.predict(X_test.drop('Year', axis=1))
    y_pred = y_trend_test_pred + residual_test_pred

    accuracy = calculate_accuracy(y_test, y_pred)

    return trend_model, best_rf_model, accuracy

In [47]:
def predict_sales(trend_model, rf_model, sales_lag1, sales_lag2, temp, holiday, date, day_of_week, label_encoder):
    date = pd.to_datetime(date)
    input_data = {
        'Sales_Lag1': sales_lag1,
        'Sales_Lag2': sales_lag2,
        'Avg_Daily_Temp_C': temp,
        'Holiday_KZ': holiday,
        'Year': date.year,
        'Month': date.month,
        'DayOfMonth': date.day,
        'DayOfWeek_Encoded': label_encoder.transform([day_of_week])[0],
        'DayOfYear': date.timetuple().tm_yday
    }
    input_df = pd.DataFrame([input_data])

    trend_pred = trend_model.predict(input_df[['Year']])[0]
    residual_pred = rf_model.predict(input_df.drop('Year', axis=1))[0]
    return trend_pred + residual_pred

In [48]:
if __name__ == "__main__":

    df = load_data()
    X, y, label_encoder = preprocess_data(df)


    trend_model, rf_model, accuracy_metrics = train_model(X, y)


    prediction_date = '2024-05-09'
    day_of_week = 'Monday'
    sales_lag1 = 27.0
    sales_lag2 = 15.0
    temp = 19.0
    holiday = 1

    predicted_sales = predict_sales(
        trend_model, rf_model,
        sales_lag1, sales_lag2, temp, holiday,
        prediction_date, day_of_week, label_encoder
    )

    print(f"\nPredicted Kumis Sales for {prediction_date}: {predicted_sales:.1f} liters")


MODEL PERFORMANCE METRICS
Mean Absolute Error (MAE): 1.16 liters
Root Mean Squared Error (RMSE): 1.37 liters
R-squared (R²) Score: 0.5348
Percentage Error: 14.99%
Percentage Accuracy: 85.01%

Predicted Kumis Sales for 2024-05-09: 14.6 liters
