In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

### ***-: UPLOADING '.csv' FILE FOR USE :-***

In [None]:
from google.colab import files

# Ask for a CSV through the Colab upload dialog. files.upload() returns a
# dict keyed by the name of each uploaded file.
print("Please upload your CSV file:")
uploaded = files.upload()

# An empty dict means the dialog was cancelled; fail with a clear message
# instead of the bare StopIteration that next(iter({})) would raise.
if not uploaded:
    raise ValueError("No file was uploaded; re-run this cell and select a CSV file.")

# NOTE: despite its name, `sales_data` holds the *file name* of the first
# uploaded file, not its contents.
sales_data = next(iter(uploaded))

### ***-: LOADING '.csv' INTO A DATAFRAME :-***

In [None]:
# Read the file that was actually uploaded (its name is stored in `sales_data`
# by the upload cell) rather than a hard-coded path, so an upload saved under
# any other name is not ignored or replaced by a stale local file.
df = pd.read_csv(sales_data)

### ***-: BASIC DATA VERIFICATION & VALIDATION :-***

In [None]:
# Checking DataFrame shape: (number of rows, number of columns) :-

df.shape

In [None]:
# Checking the top 5 rows to eyeball column names and sample values :-

df.head()

In [None]:
# Checking missing values: count of NaN entries per column :-

df.isna().sum()

In [None]:
# Checking structure: column dtypes, non-null counts and memory usage :-

df.info()

In [None]:
# Statistical summary (count, mean, std, min, quartiles, max) of the numeric columns :-

df.describe()

### ***-: DATA CLEANING & PREPROCESSING :-***

In [None]:
#1. Handle Missing Sales Values :-

# Fill gaps in Sales by linear interpolation between neighbouring rows,
# then re-count the missing values left in each column.
sales_filled = df["Sales"].interpolate(method="linear")
df["Sales"] = sales_filled
df.isna().sum()

In [None]:
#2. Handle Negative Sales Values :-

# Floor Sales at zero with the vectorized Series.clip instead of a
# per-element Python lambda; NaN entries pass through unchanged either way.
df["Sales"] = df["Sales"].clip(lower=0)

# Re-check the summary statistics (the minimum of Sales should now be >= 0).
df.describe()

In [None]:
#3. Set Date as Time Index :-

# Parse the Date column to datetimes and promote it to the index in a single
# chained expression, then preview the re-indexed frame.
df = df.assign(Date=pd.to_datetime(df["Date"])).set_index("Date")
df.head()

In [None]:
#4. Ensure Daily Frequency :-

# asfreq("D") inserts an all-NaN row for every calendar day that is absent
# from the data.
df = df.asfreq("D")

# The earlier interpolation ran before these rows existed, so fill the newly
# introduced Sales gaps the same way. Otherwise they stay NaN, are silently
# skipped by the monthly sums, and blank out every 30-day rolling window
# that touches them.
df["Sales"] = df["Sales"].interpolate(method="linear")

# Confirm what is still missing after the reindex.
df.isna().sum()

In [None]:
#5. Quick Sanity Plot :-

# Single-panel line chart of the cleaned daily series, drawn through the
# explicit Axes interface.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df.index, df["Sales"])
ax.set_title("Daily Sales After Cleaning")
ax.set_xlabel("Date")
ax.set_ylabel("Sales")
plt.show()

### ***-: CONCLUSION :-***

**Data Cleaning & Time Series Preparation**

The raw sales data contained missing values and negative sales entries,
which were addressed using business logic and time-series best practices.
- Missing sales values were interpolated to preserve continuity
- Negative sales values were floored at zero (treated as returns or data-entry errors)
- Date column was converted to a datetime index
- Daily frequency was enforced for time series consistency

These steps ensured the dataset was suitable for reliable time series analysis
and forecasting.

### ***-: EXPLORATORY TIME SERIES ANALYSIS :-***

In [None]:
#1. Overall Sales Trend (Daily) :-

# Line chart of daily sales over the full date range.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df.index, df["Sales"])
ax.set_title("Daily Sales Trend")
ax.set_xlabel("Date")
ax.set_ylabel("Sales")
plt.show()

In [None]:
#2. Monthly Sales Aggregation :-

# Sum daily sales into month-end buckets.
monthly_sales = df["Sales"].resample("M").sum()

# Plot the monthly totals against their month-end dates.
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(monthly_sales.index, monthly_sales)
ax.set_title("Monthly Sales Trend")
ax.set_xlabel("Date")
ax.set_ylabel("Total Sales")
plt.show()

In [None]:
#3. Year-over-Year Sales Comparison :-

# Total sales per calendar year (year-end labelled buckets).
yearly_sales = df["Sales"].resample("Y").sum()

fig, ax = plt.subplots(figsize=(6, 4))
yearly_sales.plot(kind="bar", ax=ax)
# Label each bar with its year only; the default tick labels are the full
# year-end timestamps, which are hard to read.
ax.set_xticklabels(yearly_sales.index.year, rotation=0)
ax.set_title("Yearly Sales Comparison")
ax.set_xlabel("Year")
ax.set_ylabel("Total Sales")
plt.show()

# Only the final expression of a cell is rendered, so the table is placed
# last; as a bare expression in the middle of the cell it was never shown.
yearly_sales

In [None]:
#4. Seasonal Pattern by Month :-

# Tag each row with its calendar month (1-12), taken from the datetime index.
df["Month"] = df.index.month

# Average daily sales for each calendar month, pooled across all years.
monthly_pattern = df.groupby("Month")["Sales"].mean()

fig, ax = plt.subplots(figsize=(8, 4))
sns.lineplot(x=monthly_pattern.index, y=monthly_pattern.values, ax=ax)
ax.set_title("Average Sales by Month (Seasonality)")
ax.set_xlabel("Month")
ax.set_ylabel("Average Sales")
plt.show()

In [None]:
#5. Rolling Average :-

# 30-row rolling mean of Sales; the first 29 rows have no full window and are NaN.
df["Rolling_30"] = df["Sales"].rolling(window=30).mean()

# Overlay the smoothed series on the faded raw daily series.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(df["Sales"], alpha=0.4, label="Daily Sales")
ax.plot(df["Rolling_30"], color="red", label="30-Day Rolling Average")
ax.legend()
ax.set_title("Sales Trend with Rolling Average")
plt.show()

### ***-: CONCLUSION :-***

**Time Series EDA Summary**

Exploratory time series analysis revealed:
- A clear upward trend in sales over time
- Strong seasonal patterns with recurring monthly peaks
- Increasing yearly sales totals indicating business growth
- Short-term volatility that smooths out over longer rolling windows

These patterns confirm that the dataset is suitable for time series
forecasting and that seasonality must be considered in modeling.

### ***-: TIME SERIES DECOMPOSITION :-***

In [None]:
#1. Import Decomposition Tool :-

from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
#2. Decompose Monthly Sales :-

# Rebuild the monthly totals so this cell does not depend on the EDA section.
monthly_sales = df["Sales"].resample("M").sum()

# Additive decomposition with a 12-observation (yearly) seasonal period.
decompose_options = {"model": "additive", "period": 12}
decomposition = seasonal_decompose(monthly_sales, **decompose_options)

In [None]:
#3. Visualize Decomposition :-

# DecomposeResult.plot() creates and returns its own figure, so calling
# plt.figure(figsize=...) beforehand only leaves an empty extra figure and
# never resizes the real one. Resize and tidy the returned figure instead.
fig = decomposition.plot()
fig.set_size_inches(12, 8)
fig.tight_layout()
plt.show()

**Understand Each Component :**

🔹 Trend --
- Long-term business growth
- Used for planning & targets

🔹 Seasonality --
- Repeating yearly pattern
- Helps inventory & staffing decisions

🔹 Residual --
- Random noise
- Cannot be predicted reliably

In [None]:
#5. Check Strength of Seasonality :-

# Share of (seasonal + residual) variance carried by the seasonal component:
# values near 1 mean seasonality dominates the noise, values near 0 the opposite.
seasonal_var = np.var(decomposition.seasonal)
# The residual is NaN where the trend could not be estimated; drop those rows first.
resid_var = np.var(decomposition.resid.dropna())

seasonal_strength = seasonal_var / (seasonal_var + resid_var)

seasonal_strength

### ***-: CONCLUSION :-***

**Time Series Decomposition Summary**

Sales were decomposed into trend, seasonal, and residual components
using an additive time series model.
Key observations:
- The trend component shows consistent long-term growth
- The seasonal component reveals strong recurring yearly patterns
- Residuals represent random fluctuations not explained by trend or seasonality

This decomposition confirms that sales are driven by both growth and
seasonality, validating the use of seasonal forecasting techniques.

### ***-: FORECASTING MODELS :-***

In [None]:
#1. Prepare Monthly Time Series :-

# Sum daily sales into month-end buckets; this is the series every forecasting
# model below is trained and evaluated on.
monthly_sales = df["Sales"].resample("M").sum()
# Preview the first few monthly totals.
monthly_sales.head()

In [None]:
#2. Train–Test Split :-

# Chronological split: the final 6 months are held out for evaluation and
# everything before them is used for fitting.
train, test = monthly_sales.iloc[:-6], monthly_sales.iloc[-6:]

train.tail(), test

In [None]:
#3. Baseline Model 1: Naive Forecast :-

# Repeat the last training observation once for every test month.
last_observed = train.iloc[-1]
naive_forecast = [last_observed for _ in range(len(test))]

# Root-mean-squared error of the flat forecast against the held-out months.
mse_naive = mean_squared_error(test, naive_forecast)
rmse_naive = np.sqrt(mse_naive)
rmse_naive

In [None]:
#4. Baseline Model 2: Moving Average Forecast :-

# Mean of the last `window` training months, repeated across the test horizon.
window = 3
moving_avg_value = train.iloc[-window:].mean()
moving_avg_forecast = [moving_avg_value for _ in range(len(test))]

# Root-mean-squared error against the held-out months.
mse_ma = mean_squared_error(test, moving_avg_forecast)
rmse_ma = np.sqrt(mse_ma)
rmse_ma

In [None]:
#5. Regression-Based Time Forecasting :-

# Regress monthly sales on a simple integer time step t = 0, 1, 2, ...
time_index = np.arange(len(monthly_sales))
monthly_df = pd.DataFrame({"Sales": monthly_sales.values, "t": time_index})

# Same chronological split as the baselines: the last 6 rows are the test set.
train_df, test_df = monthly_df.iloc[:-6], monthly_df.iloc[-6:]

# Fit a straight-line trend on the training rows (fit() returns the estimator).
lr = LinearRegression().fit(train_df[["t"]], train_df["Sales"])

# Extrapolate the fitted line over the held-out time steps and score it.
reg_forecast = lr.predict(test_df[["t"]])

mse_lr = mean_squared_error(test_df["Sales"], reg_forecast)
rmse_lr = np.sqrt(mse_lr)
rmse_lr

In [None]:
#6. Compare Model Performance :-

# Collect the RMSE of every model under its display name, then lay the
# scores out as a two-column table.
rmse_by_model = {
    "Naive": rmse_naive,
    "Moving Average": rmse_ma,
    "Linear Regression": rmse_lr,
}

pd.DataFrame({
    "Model": list(rmse_by_model.keys()),
    "RMSE": list(rmse_by_model.values()),
})

In [None]:
#7. Visual Comparison of Forecasts :-

fig, ax = plt.subplots(figsize=(10, 4))

# History and held-out actuals.
ax.plot(train.index, train, label="Train")
ax.plot(test.index, test, label="Actual", marker="o")

# One line per model, all drawn over the test dates.
labelled_forecasts = (
    ("Naive Forecast", naive_forecast),
    ("Moving Avg Forecast", moving_avg_forecast),
    ("Regression Forecast", reg_forecast),
)
for series_label, forecast_values in labelled_forecasts:
    ax.plot(test.index, forecast_values, label=series_label)

ax.legend()
ax.set_title("Forecast Comparison (Last 6 Months)")
plt.show()

### ***-: CONCLUSION :-***

**Forecasting Model Comparison**

Multiple baseline forecasting approaches were evaluated:
- Naive forecast provided a minimum benchmark
- Moving average smoothed recent volatility
- Linear regression captured the long-term sales trend

Among these, linear regression demonstrated the lowest error and
best alignment with observed sales, making it suitable for short-term
planning and forecasting.

### ***-: FUTURE FORECAST :-***

In [None]:
#1. Create Future Time Index (6 Months Ahead) :-

forecast_horizon = 6

# Continue the integer time step from where the historical data stops.
last_time_index = monthly_df["t"].iloc[-1]
first_future_t = last_time_index + 1
future_t = np.arange(first_future_t, first_future_t + forecast_horizon)

# Single-column frame in the same layout the regression was fitted on.
future_df = pd.DataFrame({"t": future_t})
future_df

In [None]:
#2. Generate Future Sales Forecast :-

# Extrapolate with the `lr` model fitted in the regression cell. That model
# was trained on the training rows only, so the last 6 observed months do not
# influence these predictions.
future_sales_forecast = lr.predict(future_df[["t"]])
future_sales_forecast

In [None]:
#3. Create Future Date Index :-

# The monthly series is labelled by month-end dates, so the first forecast
# date is the month-end following the last observed one.
# NOTE(review): recent pandas releases deprecate the "M" alias in favour of
# "ME" - confirm against the pandas version this notebook runs on.
last_date = monthly_sales.index[-1]
future_dates = pd.date_range(
    start=last_date + pd.offsets.MonthEnd(1),
    periods=forecast_horizon,
    freq="M"
)

In [None]:
#4. Build Forecast DataFrame :-

# Pair each future month-end date with its predicted sales value; both inputs
# have `forecast_horizon` entries.
forecast_df = pd.DataFrame({
    "Date": future_dates,
    "Forecasted_Sales": future_sales_forecast
})

forecast_df

In [None]:
#5. Visualize Historical and Forecasted Sales :-

plt.figure(figsize=(10, 4))

# Observed monthly totals followed by the dashed regression forecast.
plt.plot(monthly_sales.index, monthly_sales, label="Historical Sales")
plt.plot(forecast_df["Date"], forecast_df["Forecasted_Sales"],
         marker="o", linestyle="--", label="Forecasted Sales")

# The title previously contained a mis-decoded en dash that rendered as
# garbage characters on the chart; use the intended character.
plt.title("Sales Forecast – Next 6 Months")
plt.xlabel("Date")
plt.ylabel("Sales")
plt.legend()
plt.show()

### ***-: EXPORT FORECAST FOR BUSINESS TEAMS :-***

In [None]:
# Write the forecast table to a CSV file without the positional row index.
forecast_df.to_csv("sales_forecast_next_6_months.csv", index=False)

# Hand the exported file to the Colab download helper.
# NOTE(review): `files` is already imported by the upload cell; the import is
# repeated here so this cell can also be run on its own.
from google.colab import files
files.download("sales_forecast_next_6_months.csv")

**FUTURE SALES FORECAST SUMMARY :--**

1. *Using a regression-based time series model, sales were forecasted for the
next six months beyond the available historical data.*

2. *The forecast indicates a continuation of the existing growth trend,
providing valuable input for inventory planning, revenue forecasting,
and operational decision-making.*

3. *The exported forecast file can be directly used by business teams for
planning and budgeting purposes.*

### ***-: FINAL CONCLUSION :-***

*This project demonstrates how historical sales data can be effectively analyzed and transformed into forward-looking business insights using time series techniques.*

*Exploratory analysis revealed a clear long-term upward trend in sales along with strong seasonal patterns, indicating predictable fluctuations across time periods. Time series decomposition confirmed that sales behavior is driven by both structural growth and recurring seasonality, rather than random variation alone.*

*Multiple baseline forecasting approaches were evaluated, and regression-based forecasting performed best in capturing the overall sales trend. Using this model, future sales were projected for the next six months, providing actionable inputs for inventory planning, revenue forecasting, and operational decision-making.*

*Overall, this analysis highlights the importance of structured time series exploration and modeling in enabling data-driven planning. The generated forecasts can serve as a foundation for more advanced forecasting techniques such as ARIMA, SARIMA, or Prophet in future iterations.*



---

