## DATA SCRAPING AND IMPORTING

In [None]:
# Attach Google Drive to the Colab runtime; a later cell reads the dataset from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install faker

Collecting faker
  Downloading Faker-28.4.1-py3-none-any.whl.metadata (15 kB)
Downloading Faker-28.4.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-28.4.1


In [None]:
import json
import random
from faker import Faker

# Initialize Faker
fake = Faker()

# Seed both random sources so the synthetic dataset is the same on every run.
# NOTE(review): date_this_decade() is presumably bounded by today's date, so the
# "Date" column may still differ when the notebook is re-run on a later day.
SEED = 42
random.seed(SEED)  # drives the random.randint / random.choice / random.uniform calls
Faker.seed(SEED)   # drives the Faker providers (state_abbr, date_this_decade)

# Define the number of rows
num_rows = 200000


def _make_row(row_id):
    """Build one synthetic store-sales record whose "ID" field is ``row_id``."""
    return {
        "ID": row_id,
        "Store_id": random.randint(1, 1000),
        "Store_Type": random.choice(["Supermarket", "Grocery", "Hypermarket", "Convenience"]),
        "Location_Type": random.choice(["Urban", "Suburban", "Rural"]),
        "Region_Code": fake.state_abbr(),
        "Date": fake.date_this_decade().strftime("%Y-%m-%d"),
        "Holiday": random.choice([True, False]),
        "Discount": random.choice([True, False]),
        "#Order": random.randint(1, 1000),
        "Sales": round(random.uniform(10, 1000), 2)
    }


# Generate the data: one record per ID in 1..num_rows
data = [_make_row(i) for i in range(1, num_rows + 1)]

# Save the data to a JSON file
with open("store_data.json", "w") as json_file:
    json.dump(data, json_file, indent=4)

print(f"{num_rows} rows of data have been generated and saved to 'store_data.json'")


200000 rows of data have been generated and saved to 'store_data.json'


In [None]:
import pandas as pd

# Load the generated records. The relative path is the same one the generation
# cell writes to, so the two cells agree regardless of the working directory.
# pd.read_json already returns a DataFrame, so no extra conversion is needed.
json_file_path = "store_data.json"
df = pd.read_json(json_file_path)

# Save DataFrame to Excel file (index=False keeps the row index out of the sheet)
excel_file_path = "output.xlsx"
df.to_excel(excel_file_path, index=False)

print(f"JSON data has been converted to {excel_file_path}")


JSON data has been converted to output.xlsx


## DATA PREPROCESSING

In [None]:
import pandas as pd
import numpy as np

# Excel export of the dataset, read from the mounted Google Drive.
# NOTE(review): no cell visible here writes output.xlsx into this Drive folder —
# presumably it was copied there manually; confirm.
DATASET_PATH = "/content/drive/MyDrive/Colab Notebooks/DATASET/output.xlsx"

df = pd.read_excel(DATASET_PATH)

In [None]:
# Peek at the first two rows to check the columns that were loaded.
df.head(n=2)

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,1,709,Grocery,Suburban,NM,2022-10-31,True,True,661,750.92
1,2,988,Convenience,Rural,MP,2023-10-05,False,True,201,854.04


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(200000, 10)

In [None]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Unnamed: 0,0
ID,0
Store_id,0
Store_Type,0
Location_Type,0
Region_Code,0
Date,0
Holiday,0
Discount,0
#Order,0
Sales,0


**Store_Type encoding: {'Supermarket': 1, 'Grocery': 2, 'Hypermarket': 3, 'Convenience': 4}**

In [None]:
# Integer code assigned to each store-type label.
store_type_codes = {'Supermarket': 1, 'Grocery': 2, 'Hypermarket': 3, 'Convenience': 4}

# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Encode only while the column still holds the text labels.
if not pd.api.types.is_numeric_dtype(df['Store_Type']):
    df['Store_Type'] = df['Store_Type'].map(store_type_codes)

# Fail loudly on an unexpected label instead of silently carrying NaN forward.
assert df['Store_Type'].notna().all(), "Store_Type has values missing from store_type_codes"

In [None]:
# Integer code assigned to each location-type label.
location_type_codes = {'Urban': 1, 'Suburban': 2, 'Rural': 3}

# Series.map turns every value that is not a key of the dict into NaN, so
# re-running this cell on the already-encoded column would wipe it out.
# Encode only while the column still holds the text labels.
if not pd.api.types.is_numeric_dtype(df['Location_Type']):
    df['Location_Type'] = df['Location_Type'].map(location_type_codes)

# Fail loudly on an unexpected label instead of silently carrying NaN forward.
assert df['Location_Type'].notna().all(), "Location_Type has values missing from location_type_codes"

# True/False -> 1/0. astype(int) yields the same encoding as mapping
# {True: 1, False: 0} and leaves an already-encoded column unchanged on re-run.
df['Holiday'] = df['Holiday'].astype(int)

In [None]:
# Re-inspect the first two rows after encoding.
df.head(n=2)

Unnamed: 0,ID,Store_id,Store_Type,Location_Type,Region_Code,Date,Holiday,Discount,#Order,Sales
0,1,709,2,2,NM,2022-10-31,1,True,661,750.92
1,2,988,4,3,MP,2023-10-05,0,True,201,854.04


In [None]:
# Model inputs: the store-type, location-type, holiday and discount columns.
feature_columns = ['Store_Type', 'Location_Type', 'Holiday', 'Discount']
x = pd.DataFrame(df[feature_columns])

# Target: the order count, kept as a single-column DataFrame.
target_column = '#Order'
y = pd.DataFrame(df[target_column])

In [None]:
# Check the first two rows of the feature frame.
x.head(n=2)

Unnamed: 0,Store_Type,Location_Type,Holiday,Discount
0,2,2,1,True
1,4,3,0,True


## ML ALGORITHM

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import lightgbm as lgb

# LightGBM regressor constructed with no arguments (library defaults),
# fitted on the training split.
model = lgb.LGBMRegressor()
model.fit(X=x_train, y=y_train)

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.014819 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13
[LightGBM] [Info] Number of data points in the train set: 160000, number of used features: 4
[LightGBM] [Info] Start training from score 501.193425


In [None]:
# Predict order counts for the held-out rows and collect them in a one-column frame.
y_prediction = model.predict(x_test)
predicted_orders = y_prediction.flatten()
data = pd.DataFrame({"predicted orders": predicted_orders})
print(data.head(n=2))

   predicted orders
0        496.723905
1        508.435793


In [None]:
# Display the predictions frame (the notebook shows a truncated view of the rows).
data

Unnamed: 0,predicted orders
0,496.723905
1,508.435793
2,498.375799
3,494.301466
4,492.033273
...,...
39995,500.973706
39996,498.073408
39997,498.673737
39998,494.301466
