In [13]:
# Attach Google Drive to the Colab filesystem so the CSV inputs stored on
# Drive can be read with ordinary file paths.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)






Mounted at /content/drive


In [21]:
#importing numpy

import numpy as np
from datetime import datetime

In [22]:
# Data directory setup and loading (pandas is imported here for the analysis).

import pandas as pd
from pathlib import Path

# Input folder on the mounted Drive.
# NOTE(review): the folder name ends with a trailing space — confirm it
# matches the actual Drive folder name exactly.
DATA_DIR = Path("/content/drive/MyDrive/Personal projects/Walmart ")

# Outputs are written to a sibling folder next to the input folder; it is
# created on first run and reused afterwards.
OUT_DIR = DATA_DIR.parent / "Walmart_outputs"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# train, test and features are read with their "Date" column parsed as
# datetimes; the files are read in the order listed.
train, test, features = (
    pd.read_csv(DATA_DIR / filename, parse_dates=["Date"])
    for filename in ("train.csv", "test.csv", "features.csv")
)

# stores.csv is read without any date parsing.
stores = pd.read_csv(DATA_DIR / "stores.csv")

In [23]:
#Data Exploration

# Strip stray whitespace from the headers of every loaded frame so the column
# lookups below (and the later merges) match on the exact names.
for frame in (train, test, stores, features):
  frame.columns = frame.columns.str.strip()

# Coerce Weekly_Sales to numeric; any non-numeric entry becomes NaN.
train["Weekly_Sales"] = pd.to_numeric(train["Weekly_Sales"], errors="coerce")

# One shared list of markdown columns, used by both loops below so the names
# cannot drift apart (a misspelt name is silently skipped by the
# `in features.columns` guard and would leave that column uncleaned).
MARKDOWN_COLS = ["MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5"]
NUMERIC_FEATURE_COLS = ["Temperature","Fuel_Price","CPI","Unemployment", *MARKDOWN_COLS]

# Coerce the numeric feature columns that are present; bad values become NaN.
for c in NUMERIC_FEATURE_COLS:
  if c in features.columns:
    features[c] = pd.to_numeric(features[c], errors="coerce")

# Missing markdown values are replaced with 0 in every MarkDown column
# that is present.
for c in MARKDOWN_COLS:
  if c in features.columns:
    features[c] = features[c].fillna(0)




In [27]:
#time features for grouping

# Calendar columns derived from "Date"; added to both train and features.
TIME_COLS = ["Year","Month","Month_Name","Week","Quarter"]

for frame in (train,features):
  frame["Year"] = frame["Date"].dt.year
  frame["Month"] = frame["Date"].dt.month
  frame["Month_Name"] = frame["Date"].dt.month_name()
  frame["Week"] = frame["Date"].dt.isocalendar().week.astype(int)
  frame["Quarter"] = frame["Date"].dt.quarter


#Merging sales with external features and store attributes

# The merge runs once, after the loop has finished. The derived time columns
# are dropped from the features side: both frames carry them, and merging
# overlapping non-key columns would produce Year_x/Year_y, Month_x/Month_y, ...
# instead of a single Year, Month, ... column in final_data.
merged = train.merge(
    features.drop(columns=TIME_COLS),
    on=["Store","Date","IsHoliday"],
    how="left",
)
final_data = merged.merge(stores, on="Store", how="left")


# BI

# Weekly sales normalised by the store's "Size" column.
final_data["SalesperSqft"] = final_data["Weekly_Sales"]/final_data["Size"]

# Row-wise sum of whichever MarkDown columns exist (NaN counted as 0);
# falls back to a constant 0 when none of them are present.
md_cols = [c for c in ["MarkDown1","MarkDown2","MarkDown3","MarkDown4","MarkDown5"] if c in final_data.columns]
final_data["Total_Markdowns"] = final_data[md_cols].fillna(0).sum(axis=1) if md_cols else 0

# Readable label for the boolean IsHoliday flag, stored as a categorical.
final_data["Holiday_Period"] = final_data["IsHoliday"].map({True : "Holiday",False:"Regular"}).astype("category")


In [30]:
#Exporting analysis ready dataset

# Two files are written to OUT_DIR: the full cleaned dataset, and a small
# column/description table (data dictionary) for quick reference.

final_csv = OUT_DIR/ "walmart_sales_clean.csv"
final_data.to_csv(final_csv, index = False)

# Keys are meant to be the column names of the exported dataset. The
# sales-per-square-foot entry uses "SalesperSqft", the name the column is
# actually created under in final_data.
data_dictionary = {
    "Store": "Store number (1-45)",
    "Dept": "Department number within store",
    "Date": "Week ending date (datetime)",
    "Weekly_Sales": "Weekly sales for store-department",
    "IsHoliday": "Holiday week flag (True/False)",
    "Temperature": "Avg temperature in the week",
    "Fuel_Price": "Regional fuel price",
    "MarkDown1-5": "Anonymized promotional markdowns",
    "CPI": "Consumer Price Index",
    "Unemployment": "Regional unemployment rate",
    "Type": "Store type (A/B/C)",
    "Size": "Store size (sq ft)",
    "Year": "Year extracted from Date",
    "Month": "Month number (1-12)",
    "Month_Name": "Month name",
    "Week": "ISO week number",
    "Quarter": "Quarter (1-4)",
    "SalesperSqft": "Weekly_Sales / Size",
    "Total_Markdowns": "Sum of MarkDown1..5",
    "Holiday_Period": "Holiday vs Regular label",
}

# One row per dictionary entry, written without the index column.
(
    pd.DataFrame(list(data_dictionary.items()), columns=["Column","Description"])
    .to_csv(OUT_DIR / "data_dictionary.csv", index=False)
)