In [64]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score


In [65]:
# Load the dataset
data = pd.read_excel('C:/Users/ASUS/OneDrive/Desktop/Drug_Data_Featured.xlsx')  # Replace with your file path

# Save a copy of the original columns before encoding
original_columns = data[['Disease Category', 'Drug Category', 'Drug Name', 'Dosage','Retail Price', 'Purchase Price', 'Sales','Date','Month','Lag_1','Lag_2','Mean Sale','CV','Buffer Percentage','Buffer Stock']]

# One-Hot Encoding for categorical columns
data_encoded = pd.get_dummies(data, columns=['Disease Category', 'Drug Category', 'Drug Name', 'Dosage'], drop_first=True)

# Features and target
X = data_encoded.drop(columns=["Sales"])  # Exclude target column
y = data_encoded["Sales"]


In [71]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Get the indices of the test set
test_indices = X_test.index

# Reset indices to avoid alignment issues
X_test.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

# Filter the original dataset to match the test set
original_test_data = original_columns.iloc[test_indices].copy()

original_test_data.reset_index(drop=True, inplace=True)

# Copy test data for future prediction
X_test_future = X_test.copy()


In [72]:
# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)




In [73]:
# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R² Score: {r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")

from sklearn.model_selection import cross_val_score, KFold

# Perform 5-Fold Cross Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Use negative MSE as the scoring metric
cv_mse = cross_val_score(rf_model, X_test, y_test, cv=kf, scoring='neg_mean_squared_error')
cv_r2 = cross_val_score(rf_model, X_test, y_test, cv=kf, scoring='r2')

# Take the mean and standard deviation of the cross-validation scores
mean_mse = -cv_mse.mean()
std_mse = cv_mse.std()
mean_r2 = cv_r2.mean()
std_r2 = cv_r2.std()

print(f"Cross-Validation Mean MSE: {mean_mse}")
print(f"Cross-Validation MSE Std. Dev: {std_mse}")
print(f"Cross-Validation Mean R²: {mean_r2}")
print(f"Cross-Validation R² Std. Dev: {std_r2}")





Mean Squared Error: 644293.8329909696
Mean Absolute Error: 305.17893674609724
Root Mean Squared Error (RMSE): 802.6791594348078
R² Score: 0.9554989705344631
Mean Absolute Percentage Error (MAPE): 58.71%
Cross-Validation Mean MSE: 1116498.582287946
Cross-Validation MSE Std. Dev: 484451.23791422095
Cross-Validation Mean R²: 0.9243206656946455
Cross-Validation R² Std. Dev: 0.026642327866721792


In [75]:
# Predict for the next 6 months
future_predictions = []

for month in range(1, 7):  # Predict for 6 months
    # Predict sales for the current month
    predictions = rf_model.predict(X_test_future)
    future_predictions.append(predictions)

    # Update lag features for the next iteration
    X_test_future["Lag_2"] = X_test_future["Lag_1"]
    X_test_future["Lag_1"] = predictions

# Combine predictions into a DataFrame
future_sales = pd.DataFrame(future_predictions).T
future_sales.columns = [f"Month_{i}_Sales" for i in range(1, 7)]

# Ensure the number of rows matches X_test
assert future_sales.shape[0] == X_test.shape[0], "Mismatch in rows between future_sales and X_test!"


In [76]:
# Ensure indices are aligned between future_sales and original test data
future_sales.reset_index(drop=True, inplace=True)

# Merge predictions with original columns
predicted_data = pd.concat([original_columns, future_sales], axis=1)

# Preview the first few rows of the combined DataFrame
print(predicted_data.head())

# Save the final dataset to an Excel file
output_file_path = "C:/Users/ASUS/OneDrive/Desktop/rf_predicted_sales_final.xlsx"
predicted_data.to_excel(output_file_path, index=False)

print(f"Predicted sales have been saved successfully to '{output_file_path}'!")

  Disease Category Drug Category              Drug Name Dosage  Retail Price  \
0   Cardiovascular       AMLODIP            AMLODAC 5MG    5MG        283.60   
1   Cardiovascular       AMLODIP            AMLODAC 5MG    5MG       3100.70   
2   Cardiovascular       AMLODIP            AMLONG 10MG   10MG      59050.80   
3   Cardiovascular       AMLODIP  AMLONG 2. 2. 2. 2.5MG  2.5MG     136632.10   
4   Cardiovascular       AMLODIP             AMLONG 5MG    5MG     358590.16   

   Purchase Price    Sales  Date  Month  Lag_1  ...  Mean Sale         CV  \
0           325.6     54.0  2.24      1    NaN  ...     443.90  41.140192   
1          2701.6    307.0  2.24      1   54.0  ...     443.90  41.140192   
2         50814.4   1396.0  2.24      1    NaN  ...    1532.40  12.067615   
3        117638.4   6127.0  2.24      1    NaN  ...    7115.20   8.837141   
4        353840.8  16855.0  2.24      1    NaN  ...   17535.96   2.237007   

   Buffer Percentage  Buffer Stock  Month_1_Sales  Month