In [1]:
pip install pandas




In [2]:
import pandas as pd

# Load the CSV
df = pd.read_csv("data/walmart-sales-dataset-of-45stores.csv")

# Preview the data
df.head()

df.columns

Index(['Store', 'Date', 'Weekly_Sales', 'Holiday_Flag', 'Temperature',
       'Fuel_Price', 'CPI', 'Unemployment'],
      dtype='object')

In [3]:
import pandas as pd

# Convert Date and sort
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df = df.sort_values(['Store', 'Date'])

# Filter for one store (start with Store 1)
store_df = df[df['Store'] == 1].copy()

# Set index for time-series
store_df.set_index('Date', inplace=True)

# Optional: Smooth sales a bit
store_df['Weekly_Sales'] = store_df['Weekly_Sales'].rolling(2).mean().fillna(method='bfill')

# Select features
features = ['Weekly_Sales', 'Holiday_Flag', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment']
data = store_df[features]

data.head()

  store_df['Weekly_Sales'] = store_df['Weekly_Sales'].rolling(2).mean().fillna(method='bfill')


Unnamed: 0_level_0,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-02-05,1642824.17,0,42.31,2.572,211.096358,8.106
2010-02-12,1642824.17,1,38.51,2.548,211.24217,8.106
2010-02-19,1626962.805,0,39.93,2.514,211.289143,8.106
2010-02-26,1510847.88,0,46.63,2.561,211.319643,8.106
2010-03-05,1482267.135,0,46.5,2.625,211.350143,8.106


In [4]:
pip install scikit-learn




In [5]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(data)

# Sequence creator
def create_sequences(data, seq_length=4):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i+seq_length])
        y.append(data[i+seq_length, 0])  # Weekly_Sales is target
    return np.array(X), np.array(y)

SEQ_LEN = 4  # Use 4 weeks to predict the next
X, y = create_sequences(scaled_data, SEQ_LEN)

X.shape, y.shape

((139, 4, 6), (139,))

In [6]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [7]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential()
model.add(LSTM(64, input_shape=(X.shape[1], X.shape[2])))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')
model.summary()

  super().__init__(**kwargs)


In [8]:
history = model.fit(X, y, epochs=50, batch_size=16, validation_split=0.2)

Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 319ms/step - loss: 0.0824 - val_loss: 0.0065
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.0273 - val_loss: 0.0273
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.0320 - val_loss: 0.0092
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0251 - val_loss: 0.0051
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.0231 - val_loss: 0.0052
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.0245 - val_loss: 0.0075
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0267 - val_loss: 0.0067
Epoch 8/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.0165 - val_loss: 0.0053
Epoch 9/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [9]:
# Use last SEQ_LEN points to predict next N weeks
def forecast_future_sales(model, last_sequence, N, scaler):
    forecast = []
    seq = last_sequence.copy()

    for _ in range(N):
        pred = model.predict(seq[np.newaxis, :, :])[0, 0]
        forecast.append(pred)

        # Create new row keeping non-target features from previous step
        new_row = seq[-1].copy()
        new_row[0] = pred  # Replace Weekly_Sales only

        # Update sequence
        seq = np.concatenate((seq[1:], [new_row]), axis=0)

    # Inverse transform (only Weekly_Sales column)
    forecast_scaled = np.array(forecast).reshape(-1, 1)
    inverse = scaler.inverse_transform(
        np.hstack([forecast_scaled, np.zeros((N, len(features)-1))])
    )[:, 0]
    
    return inverse

last_seq = X[-1]
predicted_sales = forecast_future_sales(model, last_seq, N=7, scaler=scaler)
print(predicted_sales)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 459ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1628260.35445315 1642353.92030333 1649329.1131968  1654910.35877847
 1649667.08927325 1646783.05111282 1644793.58386465]


In [10]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
import joblib

# Convert date
df['Date'] = pd.to_datetime(df['Date'], dayfirst=True)
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['Is_Weekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)

# Simulate realistic weather features
np.random.seed(42)
df['Humidity'] = np.random.normal(loc=60, scale=10, size=len(df)).clip(20, 100)
df['Rainfall'] = np.random.exponential(scale=3.0, size=len(df)).clip(0, 50)


# Features and target
features = [
    'Temperature',
    'Fuel_Price',
    'CPI',
    'Unemployment',
    'Holiday_Flag',
    'Month',
    'DayOfWeek',
    'Is_Weekend'
]

X = df[features]
y = df['Weekly_Sales']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"✅ RMSE: ₹{rmse:,.2f}")
print(f"✅ R² Score: {r2:.3f}")

✅ RMSE: ₹464,650.42
✅ R² Score: 0.330


In [14]:
# Save model for Streamlit app
joblib.dump(model, 'weather_model.pkl')

['weather_model.pkl']

In [15]:
import requests
import pandas as pd

api_key = ""
location = "Dallas,TX"
url = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{location}/next7days?unitGroup=metric&include=days&key={api_key}&contentType=json"

response = requests.get(url)
data = response.json()

# Extract 7-day forecast into DataFrame
forecast = pd.DataFrame(data['days'])[['datetime', 'temp']]
forecast.columns = ['Date', 'Temperature']

forecast['Date'] = pd.to_datetime(forecast['Date'])
forecast

Unnamed: 0,Date,Temperature
0,2025-07-01,29.2
1,2025-07-02,30.4
2,2025-07-03,30.0
3,2025-07-04,29.4
4,2025-07-05,30.6
5,2025-07-06,31.1
6,2025-07-07,30.1
7,2025-07-08,30.2


In [18]:
import requests
import pandas as pd

def get_country_holidays(year, country_code):
    API_KEY = ""
    url = f"https://calendarific.com/api/v2/holidays?&api_key={API_KEY}&country={country_code}&year={year}"
    response = requests.get(url)
    if response.status_code != 200:
        print(f"Failed for {country_code} {year}")
        return []
    
    holidays = response.json().get('response', {}).get('holidays', [])
    dates = [h['date']['iso'] for h in holidays if h['type'][0] in ['National holiday', 'Religious', 'Observance']]
    return dates

In [19]:
def get_all_walmart_holidays():
    countries = ['US', 'MX', 'CA', 'CL', 'IN', 'ZA', 'AR', 'BR']
    all_dates = set()

    for year in [2010, 2011, 2012]:
        for country in countries:
            try:
                print(f"Fetching {country} holidays for {year}")
                dates = get_country_holidays(year, country)
                all_dates.update(pd.to_datetime(dates))
            except Exception as e:
                print(f"Error for {country}-{year}: {e}")
    
    return pd.to_datetime(list(all_dates))

In [20]:
df['Date'] = pd.to_datetime(df['Date'])

# Get holiday dates
holiday_dates = get_all_walmart_holidays()

# Create Is_Festival feature
df['Is_Festival'] = df['Date'].isin(holiday_dates).astype(int)

# Save updated file
df.to_csv("data/walmart_data_with_festivals.csv", index=False)
print("✅ Festival feature added and file saved.")

Fetching US holidays for 2010
Fetching MX holidays for 2010
Fetching CA holidays for 2010
Fetching CL holidays for 2010
Error for CL-2010: unconverted data remains when parsing with format "%Y-%m-%d": "T07:28:24-04:00", at position 8. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.
Fetching IN holidays for 2010
Fetching ZA holidays for 2010
Fetching AR holidays for 2010
Fetching BR holidays for 2010
Fetching US holidays for 2011
Fetching MX holidays for 2011
Fetching CA holidays for 2011
Fetching CL holidays for 2011
Error for CL-2011: unconverted data remains when parsing with format "%Y-%m-%d": "T13:16:30-04:00", at position 7. You might want to try:
    - passing `format` if your strings hav

In [22]:
import pandas as pd

df = pd.read_csv("data/walmart_data_with_festivals.csv")

# Convert Date column to datetime format
df['Date'] = pd.to_datetime(df['Date'])

# Drop any missing values (optional, depending on your dataset)
df = df.dropna()

In [23]:
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['Is_Weekend'] = df['DayOfWeek'].apply(lambda x: 1 if x >= 5 else 0)

In [34]:
from sklearn.model_selection import train_test_split

features = ['Is_Festival', 'Month', 'DayOfWeek', 'Is_Weekend', 'Fuel_Price', 'CPI', 'Unemployment', 'Temperature']
target = 'Weekly_Sales'

X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [35]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

rmse = rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"✅ RMSE: ₹{rmse:,.2f}")
print(f"🎯 R² Score: {r2:.4f}")

✅ RMSE: ₹470,692.51
🎯 R² Score: 0.3123


In [36]:
import joblib

joblib.dump(model, "festival_model.pkl")
print("✅ Model saved as 'festival_model.pkl'")

✅ Model saved as 'festival_model.pkl'
