In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.stats import zscore
import datetime as dt
from sklearn.model_selection import train_test_split,learning_curve,GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import Ridge, Lasso
import joblib
from statsmodels.tsa.arima.model import ARIMA
import itertools
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from prophet import Prophet
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [5]:

# Load the dataset
df = pd.read_csv("C:/Users/user/Desktop/Superstore sales.csv")

# Display the first few rows of the dataframe
print(df.head())

# Get information about the dataset
print(df.info())


print(df.shape)


# Summary statistics of the dataset
print(df.describe(include='all'))

   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
1       2  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
2       3  CA-2017-138688  12/06/2017  16/06/2017    Second Class    DV-13045   
3       4  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   
4       5  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   

     Customer Name    Segment        Country             City       State  \
0      Claire Gute   Consumer  United States        Henderson    Kentucky   
1      Claire Gute   Consumer  United States        Henderson    Kentucky   
2  Darrin Van Huff  Corporate  United States      Los Angeles  California   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   

   Postal Code Region       Product ID         Cat

In [None]:
df.columns

## Handling Missing Values

In [None]:
# Check for missing values in the dataset
print(df.isnull().sum())

# Visualizing missing values


plt.figure(figsize=(10, 6))
sns.heatmap(df.isnull(), cbar=False, cmap="viridis")
plt.title("Missing Values Heatmap")
plt.show()

# Drop or impute missing values if necessary
# Example: Dropping rows with missing values
df = df.dropna()

## Data Types and Conversions

In [None]:
# Convert 'Order Date' and 'Ship Date' to datetime
df['Order Date'] = pd.to_datetime(df['Order Date'], dayfirst = True)
df['Ship Date'] = pd.to_datetime(df['Ship Date'], dayfirst = True)

# Check data types again after conversion
print(df.dtypes)


 ## Exploratory Data Analysis (EDA)

### Conceptual Overview of Exploratory Data Analysis (EDA)

Exploratory Data Analysis (EDA) is a crucial step in understanding and interpreting the data we collect. Here’s a brief overview of its significance:

#### 1. Purpose of EDA
EDA involves examining the data to uncover patterns, spot anomalies, test hypotheses, and check assumptions with the help of summary statistics and graphical representations.

---

#### 2. Key Objectives
- **Data Quality Assessment**: Identify and address missing values, outliers, and inconsistencies.
- **Pattern Detection**: Discover relationships and trends within the data that can inform business strategies.
- **Hypothesis Generation**: Develop new hypotheses and insights based on initial findings from the data.

---
#### 3. Techniques Used
- **Descriptive Statistics**: Summarize data using measures such as mean, median, and standard deviation.
- **Visualization**: Utilize charts and graphs like histograms, box plots, and scatter plots to visually explore data.
- **Correlation Analysis**: Assess the relationships between different variables to identify potential drivers of observed patterns.
---


#### 4. Business Implications
EDA provides a solid foundation for data-driven decision-making by:
- Enhancing our understanding of underlying data structures.
- Informing the design of further analyses and modeling approaches.
- Highlighting key areas of focus for improving business performance.

---


Through EDA, we gain critical insights that help us make more informed and effective business decisions, ultimately driving better outcomes and achieving our strategic goals.


 ### Univariate Analysis
 #### Conceptual Overview of Univariate Analysis
 Univariate analysis is a fundamental statistical method that examines a single variable in isolation to understand its characteristics, distribution, and behavior. It is often the first step in data analysis, providing insights into the data structure and helping to identify trends, patterns, or anomalies.

In [None]:
# Plotting the distribution of Sales
plt.figure(figsize=(10, 6))
sns.histplot(df["Sales"], kde=True)
plt.title("Distribution of Sales")
plt.xlabel("Sales")
plt.ylabel("Frequency")
plt.show()

# Analyzing categorical variables (e.g., Segment, Category, Ship Mode)
categorical_cols = ["Segment", "Category", "Ship Mode", "Region", "Sub-Category"]

for col in categorical_cols:
    plt.figure(figsize=(10, 6))
    ax = sns.countplot(data=df, x=col, palette="viridis")
    plt.title(f'Distribution of {col}')
    plt.xlabel(col)
    plt.ylabel("Count")
    
    # Rotate x-axis labels for the 'Sub-Category' column
    if col == "Sub-Category":
        ax.set_xticklabels(ax.get_xticklabels(), rotation=60)
    
    plt.show()


### Bivariate Analysis
#### Conceptual Overview of Bivariate Analysis
Bivariate analysis is a statistical technique used to analyze the relationship between two variables. It provides insights into how one variable changes in response to changes in another and is essential for identifying correlations, trends, and patterns. This type of analysis helps to understand the association, dependency, or interaction between the two variables.

In [None]:
# Relationship between Sales and Order Date
plt.figure(figsize=(12, 6))
sns.lineplot(data=df, x="Order Date", y="Sales", ci=None)
plt.title("Sales Over Time")
plt.xlabel("Order Date")
plt.ylabel("Sales")
plt.show()

# Sales distribution across different categories
plt.figure(figsize=(12, 6))
sns.boxplot(data=df, x="Category", y="Sales", palette="viridis")
plt.title("Sales Distribution by Category")
plt.xlabel("Category")
plt.ylabel("Sales")
plt.show()

# Analyzing Sales by Segment and Region
plt.figure(figsize=(12, 6))
sns.barplot(data=df, x="Segment", y="Sales", hue="Region", palette="viridis")
plt.title("Sales by Segment and Region")
plt.xlabel("Segment")
plt.ylabel("Sales")
plt.show()


### Time Series Analysis

### Conceptual Overview of Time Series Analysis 

Time series analysis is a powerful statistical tool used to analyze and interpret data points collected or recorded at specific time intervals. Here's a brief overview tailored for our stakeholders:

---

#### What is Time Series Analysis?
- **Definition**: Time series analysis involves examining data points gathered over time to identify patterns, trends, and seasonal variations.
- **Objective**: The primary goal is to understand the underlying structure of the data, forecast future values, and make informed business decisions based on these insights.

---

#### Key Components of Time Series Analysis
1. **Trend**: Represents the long-term movement or direction in the data. It helps us understand if our sales are generally increasing, decreasing, or remaining stable over time.
2. **Seasonality**: Captures recurring patterns or cycles in the data at regular intervals (e.g., monthly, quarterly). Identifying seasonality helps in planning for periods of high and low demand.
3. **Residuals**: The irregular or random component that remains after removing the trend and seasonal effects. Analyzing residuals can highlight anomalies or events that affect sales unexpectedly.

---

#### Importance for Our Business
- **Forecasting**: By understanding trends and seasonality, we can accurately forecast future sales, enabling better inventory management and financial planning.
- **Strategic Planning**: Insights from time series analysis support strategic decision-making in marketing, resource allocation, and operational efficiency.
- **Anomaly Detection**: Identifying residuals helps us detect unusual patterns that may indicate issues or opportunities, prompting timely corrective actions or leveraging favorable conditions.

---

Time series analysis is a vital tool in our analytics arsenal, providing us with actionable insights to drive our business forward with data-informed decisions.


In [None]:
# Convert 'Order Date' to datetime format
df['Order Date'] = pd.to_datetime(df['Order Date'], format="%d/%m/%Y")

# Create a deep copy of the DataFrame
df_copy = df.copy(deep=True)

# Set 'Order Date' as the index for the copied DataFrame
df_copy.set_index('Order Date', inplace=True)

#Resample by Month and Sum the Sales

monthly_sales1 = df_copy.resample("M")["Sales"].sum()

# Perform Seasonal Decomposition:
result = seasonal_decompose(monthly_sales1, model = "additive")


## Decomposition Overview
Time series decomposition is a technique used to break down a time series into its fundamental components: observed, trend, seasonal, and residuals. This helps in understanding the underlying patterns and structures in the data.

### Observed Component

In [None]:
# Observed
plt.figure(figsize = (16,6))
plt.plot(result.observed, label='Observed', marker='o')
plt.legend(loc='best')
plt.title('Observed')
plt.xticks(result.observed.index, result.observed.index.strftime('%b'), rotation=45)
plt.grid()



### Description:
   The observed component represents the original time series data. It shows the actual sales values recorded over time.


### Insight: 
This plot gives an overview of how sales have varied over the given period. It includes all components (trend, seasonal, and residual) combined.

### Trend Component


In [None]:
# Trend
plt.figure(figsize = (16,6))
plt.plot(result.trend, label='Trend', color='orange', marker='o')
plt.legend(loc='best')
plt.title('Trend')
plt.xticks(result.trend.index, result.trend.index.strftime('%b'), rotation=45)
plt.grid()

### Description:
The trend component shows the long-term progression in the data. It highlights the general direction in which the time series is moving over a longer period.


### Insights 

The trend component of our time series analysis provides valuable insights into the long-term movement of our sales data. Here are the key takeaways for strategic decision-making:


---

#### 1. Long-Term Growth
- **Upward Trajectory**: The trend component reveals a smooth and consistent upward trajectory in our sales, confirming a positive growth trend over the analyzed period. This indicates sustained improvement in our business performance.


---

#### 2. Strategic Implications
- **Market Expansion**: The continuous growth in sales could be a result of successful market expansion efforts, suggesting that our strategies to enter new markets or expand our product lines are paying off.
- **Increased Demand**: The upward trend might also reflect an overall increase in demand for our products or services. This insight can inform our production planning and supply chain management to meet the growing demand.
- **Successful Marketing**: Effective marketing strategies and campaigns are likely contributing to this positive trend. Understanding this can help us allocate resources efficiently and continue investing in high-impact marketing initiatives.
---
By leveraging these insights, we can reinforce our growth strategies and ensure sustained success in the marketplace.


### Seasonal Component


In [None]:
# Seasonal
plt.figure(figsize = (16,6))
plt.plot(result.seasonal, label='Seasonal', color='green', marker='o')
plt.legend(loc='best')
plt.title('Seasonal')
plt.xticks(result.seasonal.index, result.seasonal.index.strftime('%b'), rotation=45)
plt.grid()

### Description:
The seasonal component of a decomposed time series captures the repeating patterns or cycles that occur at regular intervals within the data. These patterns are often influenced by seasonal factors such as weather, holidays, or business cycles.

### **Insights**

The seasonal component of our time series analysis reveals key patterns in our sales data that repeat annually. Here are some important insights to consider for strategic planning:

---

#### 1. Repetitive Patterns
- **Regular Yearly Cycles**: The seasonal component displays consistent patterns that recur every year, underscoring the strong seasonality in our sales data. Recognizing these cycles is crucial for anticipating sales trends.

---

#### 2. Seasonal Peaks and Troughs
- **High Sales Periods**: Certain times of the year, such as mid-year and the end of the year, exhibit noticeable peaks in sales. These peaks could be attributed to seasonal events or holidays, suggesting opportunities for targeted promotions and inventory boosts.
- **Low Sales Periods**: Conversely, there are periods with consistently lower sales. Identifying these troughs helps in optimizing resources and managing costs effectively.

---

#### 3. Strategic Implications
- **Inventory Management**: By understanding these seasonal patterns, we can better plan our inventory to meet expected demand, reducing the risk of overstocking or stockouts.
- **Marketing Campaigns**: Timing our marketing efforts to align with seasonal peaks can maximize their impact and drive higher sales.
- **Staffing Levels**: Anticipating busy periods allows for better staffing management, ensuring we have adequate resources to handle increased customer activity.

---

Leveraging these insights will enhance our ability to make data-driven decisions and optimize our operational efficiency throughout the year.


## **Residuals Component**

In [None]:
# Residuals
plt.figure(figsize = (16,6))
plt.plot(result.resid, label='Residuals', color='red', marker='o')
plt.legend(loc='best')
plt.title('Residuals')
plt.xticks(result.resid.index, result.resid.index.strftime('%b'), rotation=45)
plt.grid()

###  Description:
The residual component represents the remaining part of the time series after removing the trend and seasonal components. It    captures the irregular or random fluctuations in the data.


### **Insight**
The residual plot from our time series analysis highlights the variability in our data that is not explained by the trend and seasonal components. Here are some key insights that are crucial for our business strategy:


---
#### Identification of Irregular Patterns and Anomalies

- **Significant Positive Spikes**:
  - **July and November (First Year)**: These months show notable positive spikes in the residuals, indicating that our actual sales were significantly higher than expected. This suggests that there may have been successful marketing campaigns, seasonal promotions, or other favorable events during these months that drove sales beyond our baseline trend and seasonal expectations.

- **Significant Negative Spikes**:
  - **August, December, and March**: These months display prominent negative spikes, suggesting that actual sales were considerably lower than expected. Potential reasons could include market downturns, stock shortages, increased competition, or ineffective promotional strategies.

---

### Conclusion
By decomposing the time series and visualizing each component separately, we gain a deeper understanding of the underlying patterns in the data. This allows for more informed decision-making and better forecasting. The enhanced visualizations, including monthly labels and markers, improve readability and provide clearer insights into the data trends and patterns.

### Feature Engineering

In [None]:
# Create new features like 'Order Month', 'Order Year', 'Delivery Time'
df["Order Month"] = df["Order Date"].dt.month
df["Order Year"] = df["Order Date"].dt.year
df["Delivery Time"] = (df["Ship Date"] - df["Order Date"]).dt.days

# Explore the new features
print(df[["Order Month", "Order Year", "Delivery Time"]].head())

# Plotting Delivery Time distribution
plt.figure(figsize=(10, 6))
sns.histplot(df["Delivery Time"], kde=True, color="purple")
plt.title("Delivery Time Distribution")
plt.xlabel("Delivery Time (days)")
plt.ylabel("Frequency")
plt.show()


### **Outlier Detection and Treatment**

In [None]:
# Boxplot for detecting outliers in Sales
plt.figure(figsize=(10, 6))
sns.boxplot(df["Sales"])
plt.title("Boxplot of Sales")
plt.xlabel("Sales Value")
plt.ylabel("Count of Sales Observations")
plt.show()

# Treat outliers if necessary (e.g., capping, transformation)
# Example: Capping Sales at the 99th percentile
upper_limit = df["Sales"].quantile(0.99)
df["Sales"] = np.where(df["Sales"] > upper_limit, upper_limit, df["Sales"])

# Verify the capping effect
plt.figure(figsize=(10, 6))
sns.boxplot(df["Sales"])
plt.title("Boxplot of Sales After Capping")
plt.xlabel("Sales Value")
plt.ylabel("Count of Sales Observations")
plt.show()


## **Statistics Summary**

In [None]:
# Summarize key insights from the EDA
summary = df.describe(include="all")
print(summary)

# For example, sales trends, top-performing segments, impact of delivery time, etc.


## **Insights from the statistical summary** ##
#### 1. **Order and Shipping Details**
- **Order Date**: Sales records are evenly distributed over the period, with a median order date of **June 26, 2017**.
- **Shipping**:
- Shipping times range from **0 to 7 days**, with an average delivery time of **4 days**.
- The most frequently used shipping method is **Standard Class** (chosen for 59.7% 0f orders).
---


#### 2. **Customer Insights**
- **Customers**: 
  - The dataset includes **793 unique customers**. 
  - One customer, identified as `WB-21850`, placed the most orders (**35 orders**), indicating potential loyalty.
- **Regions and Cities**:
  - The sales span **4 regions**, with the **West region** accounting for **32.7%** of the orders.
  - Sales are concentrated in **New York City**, which appears **891 times** in the dataset.
---


#### 3. **Product Analysis**
- **Categories and Sub-categories**:
  - Most sales fall under the category **Office Supplies** (**60.3% of transactions**), with **Binders** being the top-selling sub-category.
  - **Top Product** The most frequently sold product is **Staple envelope** (sold **47 times**).
  - **Product Diversity**: The dataset contains **1,848 unique products**, showcasing a wide variety of offerings.
---

#### 4. **Sales Performance**
- **Sales Value**:
   - The average sales value is **209.26**, but the median is significantly lower at **$54.38**, indicating the presence of a few high-value transactions.
   - Sales range from**$0.44 to $2,456.61**, with high variability as shown by a standard deviation of **$392.90**.
- **High Value Transactions**:
 - A small percentage of sales contribute significantly to total revenue, which may warrant target analysis to indetify opportunities for growth.


   

### **Visualization of Key Insights**

In [None]:
# Example: Top 10 Products by Sales
top_products = df.groupby("Product Name")["Sales"].sum().sort_values(ascending=False).head(10)

plt.figure(figsize=(12, 6))
top_products.plot(kind="bar", color="green")
plt.title("Top 10 Products by Sales")
plt.xlabel("Product Name")
plt.ylabel("Total Sales")
plt.show()

# Example: Sales by Region
sales_by_region = df.groupby("Region")["Sales"].sum()

plt.figure(figsize=(12, 6))
sales_by_region.plot(kind="pie", autopct="%1.1f%%", startangle=140)
plt.title("Sales Distribution by Region")
plt.ylabel('')
plt.show()


## **Cohort Analysis**

### Conceptual Overview of Cohort Analysis

Cohort Analysis is a powerful analytical technique used to understand the behavior of groups of customers who share a common characteristic or experience within a specific time period. This method helps us to identify patterns and trends over time, enabling more informed decision-making. 

#### Key Points:
1. **Grouping by Common Characteristics**:
   - Customers are grouped into cohorts based on shared attributes, such as the month of their first purchase or the acquisition channel.

2. **Tracking Over Time**:
   - By analyzing the behavior of these cohorts over subsequent periods, we can observe how their engagement, retention, and purchasing patterns evolve.

3. **Insightful Metrics**:
   - Cohort analysis allows us to measure critical metrics such as retention rates, average order value, and customer lifetime value for different cohorts.

4. **Strategic Benefits**:
   - This analysis helps in identifying successful strategies for customer acquisition and retention, understanding the impact of marketing campaigns, and making data-driven decisions to enhance customer experience and business growth.

By leveraging cohort analysis, we gain a deeper understanding of our customer base, enabling us to tailor our strategies to better meet their needs and drive sustainable growth.


In [None]:
# Ensure order date is in datetime format
df["Order Date"] = pd.to_datetime(df["Order Date"])

#Create a new column for the cohort month based on the first purchase of each customer
df["CohortMonth"] = df.groupby("Customer ID")["Order Date"].transform("min").dt.to_period("M")

# Create a new column for the order month 
df["OrderMonth"] = df["Order Date"].dt.to_period("M")

# Group by cohort month and order month to calculate thr number of unique customers in each cohort over time
cohort_data = df.groupby(["CohortMonth","OrderMonth"]).agg({"Customer ID": "nunique"}).reset_index()

# Create a column for the cohort index
cohort_data["CohortIndex"] = (cohort_data["OrderMonth"] - cohort_data["CohortMonth"]).apply(lambda x: x.n)

# Pivot the data for the retention matrix 
cohort_pivot = cohort_data.pivot_table(index = "CohortMonth", values = "Customer ID")

# Calculate the retention rate 
cohort_size = cohort_pivot.iloc[:, 0]
retention_matrix = cohort_pivot.divide(cohort_size, axis = 0)

# Plot the retention matrix 
plt.figure(figsize = (12, 8))
sns.heatmap(retention_matrix, annot = True, fmt = ".0%", cmap = "YlGnBu")
plt.title("Cohort Analysis - Retention Rate")
plt.ylabel("Cohort Month")
plt.xlabel("Cohort Index")
plt.show()


## **Insights from the Cohort Analysis Retention Heatmap**
The retention heatmap provides a visual representation of customer retention over different cohorts and time periods. Here are the key insights derived from the heatmap:

#### 1. Consistent Retention Rate:

The heatmap indicates a consistent 100% retention rate across all cohorts and time periods.
This suggests that once customers make their first purchase, they continue to engage with our products or services consistently over time.
#### 2. Strong Customer Loyalty:

The uniform retention rate implies strong customer loyalty and satisfaction.
Customers are likely finding significant value in our offerings, leading to their continued patronage.
#### 3. Stable Customer Base:

The stability in retention rates across different cohorts suggests a stable customer base.
This stability is crucial for forecasting future revenue and planning business strategies.

### Conclusion
By leveraging these insights, we can continue to build a loyal customer base, enhance customer satisfaction, and drive sustainable business growth.

 ## **RFM Analysis**

### Conceptual Overview of RFM Analysis
RFM Analysis is a marketing technique used to quantitatively rank and segment customers based on their purchasing behavior. RFM stands for Recency, Frequency, and Monetary value:

#### 1. Recency (R): 
How recently a customer made a purchase.
#### 2. Frequency (F):
How often a customer makes a purchase.
#### 3. Monetary Value (M): 
How much money a customer spends on purchases.
This analysis helps businesses identify their best customers, understand customer behavior, and tailor marketing strategies accordingly.

In [None]:
df.columns

In [None]:
# Ensure order date is in datetime
df["Order Date"] = pd.to_datetime(df["Order Date"])

# Define the current date 
NOW = dt.datetime.now()

# Aggregate data by customer_id
rfm = df.groupby("Customer ID").agg({
    "Order Date": lambda x: (NOW - x.max()).days,
    "Order ID": "count",
    "Sales": "sum"
})

# Rename columns
rfm.columns = ["Recency", "Frequency", "Monetary"]

# Assign RFM scores 
rfm["R"] = pd.qcut(rfm["Recency"], 4, labels = ["1", "2", "3", "4"])
rfm["F"] = pd.qcut(rfm["Frequency"], 4, labels = ["4", "3", "2", "1"])
rfm["M"] = pd.qcut(rfm["Monetary"], 4, labels = ["4", "3", "2", "1"])

# Create RFM_score column
rfm["RFM_Score"] = rfm["R"].astype(str) + rfm["F"].astype(str) +  rfm["M"].astype(str)

# Define a function to segment customers
def rfm_segment(df):
    if df["RFM_Score"] == "111":
        return "Champions"
    elif df["RFM_Score"] in ["112", "121", "122", "211", "212", "221"]:
        return "Loyal Customers"
    elif df["RFM_Score"] in ["113", "131", "132", "213", "231"]:
        return "Potential Loyalists"
    elif df["RFM_Score"] in ["114", "141", "142", "214", "241"]:
        return "New Customers"
    elif df["RFM_Score"] in ["123", "132", "223", "232", "233"]:
        return "Promising Customers"
    elif df["RFM_Score"] in ["213", "231", "312", "313"]:
        return "Need Attention"
    elif df["RFM_Score"] in ["313", "323", "331", "332"]:
        return "At Risk"
    elif df["RFM_Score"] in ["411", "421", "431"]:
        return "Can’t Lose Them"
    elif df["RFM_Score"] in ["414", "424", "434"] :
        return "Hibernating"
    elif df["RFM_Score"] in ["443", "444"]:
        return "Lost Customers"
    else:
        return "Low-Value Customers"
    


# Apply the function
rfm["Segment"] = rfm.apply(rfm_segment, axis = 1)

# Print RFM table 
print(rfm.head())

In [None]:
# Plot histograms for R, F, and M with segment names
plt.figure(figsize = (18, 5))
plt.subplot(1, 3, 1)
sns.histplot(rfm["Recency"], bins = 20, kde = True)
plt.title("Recency Distribution")
plt.subplot(1, 3, 2)
sns.histplot(rfm["Frequency"], bins = 20, kde = True)
plt.title("Frequency Distribution")
plt.subplot(1, 3, 3)
sns.histplot(rfm["Monetary"], bins = 20, kde = True)
plt.title("Monetary Distribution")
plt.show()

## **Insights from RFM Analysis**

###  Recency, Frequency, and Monetary Distributions
#### 1. Recency Distribution:
- Most customers have made recent purchases, indicating a high level of engagement.
- The distribution is skewed to the right, suggesting that while many customers are recent, there are some who haven't engaged in a while.

---


#### 2. Frequency Distribution:
- The majority of customers have a lower purchase frequency, with a significant drop-off after a few purchases. This indicates a need for strategies to increase repeat purchases.
- The right skewness suggests that only a small segment of customers makes frequent purchases.

---

#### 3. Monetary Distribution:
- Most customers have relatively low total spend, with a few high-value customers driving a significant portion of sales.
- The distribution highlights the presence of top-tier customers who contribute a disproportionately large amount to revenue.

In [None]:
rfm_counts = rfm["Segment"].value_counts().reset_index()
rfm_counts.columns = ["Segment", "Count"]

plt.figure(figsize=(12, 6))

# No need for 'hue' in sns.barplot
sns.barplot(data=rfm_counts, x="Segment", y="Count", hue = "Segment", palette="viridis")

plt.title("RFM Segments")
plt.xlabel("Segment")
plt.ylabel("Count")
plt.xticks(rotation=45)

# Add legend manually
plt.legend(title="Segment", loc="upper right", labels=rfm_counts["Segment"])

plt.show()



## Insights from RFM Analysis

### RFM Segments Bar Plot 
####  Segment Distribution:
- The plot shows a clear dominance of "Low-value customers," who consistently engage with the brand and contribute significantly to revenue.
- "low-value customers" and "Loyal Customers" make up the largest segments, suggesting that a core group of customers drives a majority of sales.
-  Smaller segments like "At Risk Customers" and "Need Attention" highlight areas where the business might need to re-engage customers who are drifting away.
- The distribution also suggests potential opportunities in converting customers from smaller, less engaged segments into more loyal ones.

In [None]:
rfm1_heatmap = rfm.pivot_table(index = "R", columns= "F", values = "Monetary", aggfunc =  "mean").astype(int)
plt.figure(figsize = (12,8))


ax = sns.heatmap(
    rfm1_heatmap, annot = True, fmt = "d", cmap = "coolwarm", linewidths = 0.5, cbar_kws = {"label": "Mean Monetary Value ($)"}
)


plt.title("RFM Heatmap: Average Monetary Value by Recency and Frequency", fontsize = 16)
plt.xlabel("Frequency (1= High Frequency, 4 = Low Frequency)", fontsize = 12)
plt.ylabel("Recency (1= Recent purchase, 4 = Long Time ago)", fontsize = 12)

ax.set_xticklabels(["4 (Low)", "3", "2", "1 (High)"], fontsize=10)
ax.set_yticklabels(["4 (Old)", "3", "2", "1 (Recent)"], fontsize=10)

## **Insights from RFM Analysis**

### RFM Heatmap Insights:
- The heatmap visually represents the intersection of Recency, Frequency, and Monetary scores.
- Customers with high Recency and Frequency scores, who are also big spenders, form the most valuable segment.
- There is a clear concentration of high monetary values in segments with high Frequency and Recency scores, emphasizing the importance of frequent and recent engagement.
- The segmentation further identifies potential areas of focus, such as increasing the monetary value of customers with high Recency but low Frequency scores.

### Conclusion
These insights collectively guide the business in understanding customer behavior, identifying key segments, and tailoring strategies to enhance customer retention, loyalty, and overall profitability.


---



## **Conceptual Overview of Customer Lifetime Value (CLV) Prediction**
Customer Lifetime Value (CLV) is a crucial metric that estimates the total revenue a business can expect from a customer over the entire duration of their relationship. Predicting CLV helps businesses allocate resources efficiently, optimize marketing strategies, and focus on high-value customer segments.

### Key Concepts:
#### 1. Historical Data Analysis:

- CLV prediction begins with analyzing past customer behavior, including purchase frequency, average order value, and customer retention rates.
- Historical data provides a foundation for understanding trends and patterns in customer behavior.

  ---
#### 2. Segmentation:

- Customers are often segmented based on Recency, Frequency, and Monetary (RFM) metrics. Each segment may exhibit different spending patterns and lifetimes.
- Segmentation helps in creating more accurate and tailored CLV predictions for different customer groups.
Predictive Modeling:

- Advanced statistical methods or machine learning models are used to predict future behavior based on historical data.
- Models like Cohort Analysis, Logistic Regression, and Survival Analysis are commonly used in CLV prediction.

  ---
#### 3. Discount Rate:

- The future value of money is discounted to present value terms using a discount rate. This accounts for the time value of money in CLV predictions.

  ---
#### 4. Retention Rate:

- A critical factor in CLV prediction, as higher retention rates typically lead to higher CLV.
- Businesses can improve retention by understanding what drives customer loyalty and addressing pain points.

In [None]:
df.columns

In [None]:
# set a  reference date for the Recency calculation 
NOW = df["Order Date"].max() + pd.Timedelta(days = 1)

# Aggregate data by the customer ID to calculate Recency , Frequency, and Monetary
rfm = df.groupby("Customer ID").agg({
    "Order Date": lambda x: (NOW - x.max()).days, # Recency
    "Order ID": "nunique", # Frequency
    "Sales": "sum" # Monetary
}).reset_index()

# Rename columns
rfm.columns = ["Customer ID", "Recency", "Frequency", "Monetary"]

# Add Average Order Value and Customer LIfetime Duration
rfm["Avg_Order_Value"] = rfm["Monetary"]/ rfm["Frequency"]
rfm["Customer_Lifetime_Duration"] = df.groupby("Customer ID")["Order Date"].apply(lambda x: (x.max() - x.min()).days).reset_index(drop = True)

# Feature Scaling
scaler = StandardScaler()
rfm[["Recency", "Frequency", "Monetary", "Avg_Order_Value", "Customer_Lifetime_Duration"]] = scaler.fit_transform(
    rfm[["Recency", "Frequency", "Monetary", "Avg_Order_Value", "Customer_Lifetime_Duration"]]
    
)

x = rfm[["Recency", "Frequency", "Monetary", "Avg_Order_Value", "Customer_Lifetime_Duration"]]
y = rfm["Monetary"]  # Using Monetary as a proxy for CLV

# split the data into training testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 42)

# Train a Gradient Boosting Regressor 
model = GradientBoostingRegressor(random_state=42)
model.fit(x_train, y_train)

### **Conceptual Overview of Model Evaluation**
Model evaluation is a critical step in the machine learning process, used to assess how well a model performs and how effectively it generalizes to new, unseen data. It involves applying various metrics and techniques to understand the accuracy, robustness, and reliability of the model.

#### Key Concepts:
#### 1. Training vs. Testing Data:

- Training Data: The data used to train the model.
- Testing Data: The data used to evaluate the model's performance on unseen examples, providing an estimate of how well the model will perform in the real world.

---
#### 2. Evaluation Metrics:

- Accuracy: Measures the proportion of correct predictions out of all predictions. It's commonly used in classification tasks.
- Precision, Recall, F1-Score: Metrics used in classification, particularly in cases of imbalanced data. Precision measures the accuracy of positive predictions, recall measures the ability to find all positive instances, and F1-Score balances precision and recall.
- Mean Absolute Error (MAE): The average of absolute differences between predicted and actual values, commonly used in regression tasks.
- Root Mean Squared Error (RMSE): The square root of the average of squared differences between predicted and actual values, providing a measure of prediction error.
- R-squared (R²): Indicates how well the model explains the variance in the target variable, commonly used in regression analysis.

---
#### Cross-Validation:

A technique where the dataset is split into multiple subsets, and the model is trained and tested on different combinations of these subsets. This helps in ensuring that the model's evaluation is robust and not dependent on a single train-test split.

---
#### Overfitting and Underfitting:

- Overfitting: When the model performs well on the training data but poorly on new data because it has learned the noise or random fluctuations in the training data.
- Underfitting: When the model is too simple and fails to capture the underlying patterns in the data, leading to poor performance on both training and testing data.

---
### Conclusion:
Model evaluation is essential for understanding the effectiveness of a machine learning model. By using appropriate metrics and techniques, you can ensure that the model not only performs well on the training data but also generalizes effectively to new, unseen data, leading to reliable and accurate predictions in real-world applications.

In [None]:
# Predict and evaluate the model
y_pred = model.predict(x_test)

# Calculate MAE, RMSE, and R²
mae = round(mean_absolute_error(y_test, y_pred))
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))
r2 = round(r2_score(y_test, y_pred))

# Print the results 
print(f'Mean Absolute Error: {mae}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R² (R-squared): {r2}')

### Model Evaluation Summary
The evaluation metrics of our predictive model indicate an exceptionally high level of accuracy:

#### Mean Absolute Error (MAE): 0
The MAE of 0 suggests that the average difference between the predicted and actual values is zero, meaning our model's predictions are perfectly aligned with the actual outcomes.

---
#### Root Mean Squared Error (RMSE): 0
Similarly, the RMSE of 0 reinforces that there is no deviation between the predicted and actual values, highlighting the model's precision.

---
#### R² (R-squared): 1
An R² value of 1 indicates that the model explains 100% of the variance in the target variable. This is a perfect score, demonstrating that the model is fully capable of predicting the target variable based on the features used.

---
### Conclusion:
The model's evaluation results suggest that it is perfectly accurate for the given dataset. However, it's important to consider whether this is realistic or if there might be issues such as overfitting, where the model is too closely tailored to the training data and may not perform as well on unseen data. Further validation or a review of the data might be needed to ensure that these results are generalizable.

## **Overview of Actual vs. Predicted Values**

In predictive modeling, the comparison between actual values (the true values from your data) and predicted values (the values estimated by your model) is crucial for evaluating the model's performance.

---
### Actual Values
These are the ground truth values from the dataset. For example, in a sales prediction model, the actual values would be the real sales figures recorded.-----
### Predicted Values
These are the values that your model estimates based on the input features. In the sales prediction example, these would be the sales figures predicted by the model for the same data points.

---
### Importance of Comparing Actual vs. Predicted Values

- **Model Accuracy:** The closer the predicted values are to the actual values, the more accurate the model is. Various metrics, such as Mean Absolute Error (MAE), Root Mean Squared Error (RMSE), and R² (R-squared), are used to quantify this closeness.

- **Error Analysis:** By comparing actual and predicted values, you can identify where the model is making large errors and investigate possible reasons. This can help in refining the model.

- **Model Validation:** A consistent pattern of close actual vs. predicted values across a validation or test dataset indicates that the model generalizes well and is likely not overfitting.

- **Business Implications:** For stakeholders, understanding the difference between actual and predicted values helps in assessing the reliability of the model’s forecasts, which is critical for decision-making.

---
### Visualizing Actual vs. Predicted Values

- **Scatter Plots:** A scatter plot of actual vs. predicted values with a line representing perfect prediction (y = x) can show how closely the predictions align with reality.
  
- **Residual Plots:** Residuals (the differences between actual and predicted values) can be plotted to check for patterns, helping to detect issues like heteroscedasticity or non-linearity in the model.
---
### Summary

Comparing actual vs. predicted values is a key step in validating a model’s performance, ensuring that it meets the desired accuracy and reliability for practical use.


In [None]:
# Plot Actual vs Predicted CLV
plt.figure(figsize = (8, 6))
plt.scatter(y_test, y_pred)
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "k--", lw = 2)
plt.ylabel("Actual CLV")
plt.xlabel("Predicted CLV")
plt.title("Actual vs Predicted")
plt.show()

### **Insights from the Actual vs. Predicted CLV Plot**

#### High Accuracy:
- The plot shows that the predicted Customer Lifetime Value (CLV) closely aligns with the actual CLV values. Most of the data points lie directly on or very near the diagonal line, indicating that the model's predictions are highly accurate.

#### Model Performance:
- The close alignment between the actual and predicted values suggests that the model is performing exceptionally well. This level of accuracy is confirmed by the nearly perfect fit of the data points to the diagonal, where each predicted value matches its corresponding actual value.

#### Low Error Rates:
- The minimal deviations of the points from the diagonal line indicate low error rates, further validating the model’s reliability in predicting CLV.

#### Outliers:
- There are a few data points that are slightly off the diagonal line, indicating small discrepancies between the predicted and actual CLV for these cases. These outliers could be analyzed further to understand any specific factors contributing to these prediction differences.

#### Business Implications:
- For stakeholders, this plot provides strong evidence that the model can be trusted for making business decisions based on predicted CLV. The model's ability to predict CLV with high accuracy can assist in more precise customer segmentation, targeted marketing, and revenue forecasting.

Overall, this plot demonstrates that the model is robust, reliable, and ready for deployment in real-world scenarios where accurate CLV prediction is essential.

---

## **Learning Curve** 
### Conceptual Overview of Learning Curves
Learning curves are graphical representations used in machine learning to understand the model's performance as it learns from more data over time. They plot the training and validation (or testing) error rates against the number of training samples or iterations.

#### Key Concepts:
#### 1. Training Error:

- The error (or loss) that the model makes on the training data.
- As the model is exposed to more data, the training error typically decreases, showing that the model is learning.

---

#### 2. Validation (Testing) Error:

- The error that the model makes on a separate validation or testing dataset.
- This helps in understanding how well the model generalizes to new, unseen data.

---

#### 3. Typical Patterns:

- Underfitting: Both training and validation errors are high and close to each other, indicating that the model is too simple to capture the underlying patterns in the data.
- Overfitting: The training error is low, but the validation error remains high or increases, indicating that the model is too complex and is capturing noise in the training data rather than the true underlying pattern.
- Good Fit: The training error decreases and stabilizes, while the validation error also decreases and stabilizes close to the training error, indicating a well-fitted model.
Insights from Learning Curves:

- Data Sufficiency: If both training and validation errors are high, increasing the model's complexity or collecting more data might be necessary.
- Model Complexity: If the training error is much lower than the validation error, the model might be overfitting, and reducing its complexity (e.g., by pruning a decision tree or reducing the number of features) could help.

---

### Conclusion:
Learning curves are a valuable tool for diagnosing issues like underfitting or overfitting in a model. By analyzing the curves, you can determine whether the model needs more data, a different level of complexity, or further tuning to achieve better generalization to new data.

In [None]:
# Learning curve to check for overfitting
train_sizes, train_scores, test_scores = learning_curve(model, x, y, cv = 5)

plt.figure(figsize = (10, 6))
plt.plot(train_sizes, train_scores.mean(axis = 1), label = "Training score")
plt.plot(train_sizes, test_scores.mean(axis = 1), label = "Validation score")
plt.xlabel("Training Size")
plt.ylabel("Score")
plt.title("Learning Curve")
plt.legend()
plt.show()

## Insights from the Learning Curve for CLV Prediction
- The learning curve provided offers valuable insights into the performance of our model for predicting Customer Lifetime Value (CLV). Below are the key takeaways that can be presented to the stakeholders:

#### 1. High Training Score:

- The training score remains consistently high across all training sizes, indicating that the model is very accurate on the training data. This suggests that the model has effectively learned the patterns within the training dataset.
#### 2. Improvement in Validation Score:

- The validation score starts lower but increases sharply as the training size grows, eventually stabilizing close to the training score. This trend indicates that the model improves its generalization ability as it is exposed to more data, suggesting that the model benefits from additional training samples.
#### Potential Overfitting Concerns:

- The fact that the training score is consistently at or near 1.0, combined with the validation score stabilizing slightly lower, could indicate a slight overfitting issue. While the model performs well on unseen data, the gap between training and validation scores implies that the model may be slightly too complex, capturing noise in the training data.
#### Sufficiency of Training Data:

- The learning curve suggests that the model has reached a point of stability, where additional training data yields diminishing returns in improving validation performance. This indicates that the current dataset is likely sufficient for this model, and significant gains in validation accuracy may not be achieved by merely increasing the training size.
#### Model Reliability:

- Overall, the model demonstrates strong performance, with the validation score approaching 1.0 as well. This indicates that the model can be reliably used for predicting CLV, as it generalizes well to new data.
### Conclusion:
The learning curve analysis shows that our CLV prediction model is performing well, but with some signs of potential overfitting. This insight suggests that while the model is effective, further refinement or regularization techniques might improve its ability to generalize.

## Feature Importance
**Feature importance** is a key concept in machine learning that helps identify which input features (variables) have the most influence on the output of a predictive model. It is crucial for understanding how a model makes decisions, enabling better interpretability and insights into the data.

### Definition:
Feature importance quantifies the contribution of each feature to the model’s predictions. Higher importance indicates a stronger relationship between the feature and the target variable.

### Calculation Methods:
- **Model-Based:** Many algorithms, like decision trees, random forests, and gradient boosting, inherently provide feature importance scores based on how often and effectively a feature is used to split data.
- **Permutation Importance:** Measures the change in model performance when the values of a feature are randomly shuffled, highlighting its impact on the predictions.
- **SHAP Values:** Provides a consistent and interpretable measure of feature importance by considering the contribution of each feature across all possible feature combinations.

### Applications:
- **Model Interpretation:** Helps data scientists and stakeholders understand which features are driving predictions, supporting transparency and trust in the model.
- **Feature Selection:** Enables the reduction of dimensionality by focusing on the most important features, leading to simpler, faster, and potentially more accurate models.

Overall, **feature importance** is a critical tool in both developing effective models and ensuring their interpretability.


In [None]:
# Plot Feature importance
importances = model.feature_importances_
features = x.columns

plt.figure(figsize = (10, 6))
sns.barplot(x = importances, y = features)
plt.show

### **Feature Importance Insights**

The feature importance plot highlights which variables most significantly impact our target prediction. The key insights are as follows:

1. **Monetary**: This feature is by far the most important predictor in our model. It indicates that the total amount spent by a customer is the primary driver of the predicted outcome. This suggests that customers who spend more are more influential in determining the target variable, making this a crucial factor in our analysis.

2. **Avg_Order_Value**: This feature also contributes to the model, though to a much lesser extent than Monetary. This suggests that while the average value of orders has some impact, it is not as decisive as the overall spend.

3. **Recency, Frequency, and Customer_Lifetime_Duration**: These features appear to have negligible importance in the model. This indicates that the time since the last purchase, the frequency of purchases, and the duration of the customer relationship are not significant predictors in this context.
---
### **Recommendations:**
- **Focus on Monetary Metrics**: Given the strong influence of the Monetary feature, efforts to increase customer spend (e.g., through upselling or targeted promotions) may be the most effective strategy.
- **Reevaluate Other Metrics**: Since Recency, Frequency, and Customer_Lifetime_Duration contribute little to the model, it may be worth exploring whether these metrics can be optimized further or if other variables could better capture customer behavior.

This analysis allows us to prioritize strategies that align with the most influential factors, maximizing our impact on the target outcomes.


In [None]:
# save the model to a file
joblib.dump(model, "clv_prediction_model.pk1")

## **Conceptual  Overview Sales Forecasting**

Sales forecasting is the process of predicting future sales revenue based on historical data, market trends, and other relevant factors. It is crucial in business planning, enabling companies to make informed decisions regarding inventory management, budgeting, and resource allocation.

### Key Steps in Sales Forecasting

1. **Data Collection**: Gather historical sales data.
2. **Model Selection**: Choose a forecasting method, like time series analysis or regression models.
3. **Analysis**: Identify patterns and trends in the data.
4. **Prediction**: Estimate future sales.

Accurate forecasts help optimize operations, reduce costs, and enhance profitability.


In [None]:
df.columns

In [None]:
# Convert the 'Order Date' to a datetime object
df['Order Date'] = pd.to_datetime(df['Order Date'])

# Aggregate the data by month
df = df.set_index('Order Date')
monthly_sales = df['Sales'].resample('M').sum().to_frame()

In [None]:
# Plot the original time series
monthly_sales.plot(figsize = (12, 6), marker='o')
plt.title("Monthly Sales")
# Customizing the xticks to show abbreviated month names with rotation
plt.xticks(monthly_sales.index, monthly_sales.index.strftime("%b"), rotation = 60)
plt.legend(loc='best')
plt.grid()
plt.show()

In [None]:
# 2. Make the Data Stationary with Diferencing
monthly_sales["Sales_diff"] = monthly_sales["Sales"].diff()
monthly_sales = monthly_sales.dropna()

# Plot the differenced data
plt.figure(figsize=(15, 5))
plt.plot(monthly_sales.index, monthly_sales["Sales_diff"],color='orange', marker='o')
plt.title("Monthly Customer Sales Difference")
# Customizing the xticks to show abbreviated month names with rotation
plt.xticks(monthly_sales.index, monthly_sales.index.strftime("%b"), rotation = 60)
plt.xlabel("Date")
plt.ylabel("Sales Difference")
plt.legend(["Sales Difference"],loc='best')
plt.grid()
plt.show()

In [None]:
# 3. Prepare Data for Supervised Learning
# Dropping "Order Date" and "Sales" columns
supervised_data = monthly_sales.drop(["Sales"], axis = 1)

# Create lagged features 
for i in range(1, 13):
    col_name = "month_" + str(i)
    supervised_data[col_name] = supervised_data["Sales_diff"].shift(1)
supervised_data = supervised_data.dropna().reset_index(drop = True)

In [None]:
# 4. Split the Data into Training and Testing Sets
train_data = supervised_data[:-12]
test_data = supervised_data[-12:]

x_train, y_train = train_data.iloc[:, 1:], train_data.iloc[:,0]
x_test, y_test = test_data.iloc[:,1:], test_data.iloc[:,0]

In [None]:
# 4.1 Apply Scalingb to the Features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# 5. Identify Optimal SARIMA Parameters (No scaling needed for SARIMA)
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]


best_aic = float("inf")
best_param = None
best_seasonal_param = None


for param in pdq:
    for seasonal_param in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMA(monthly_sales["Sales"],
                                          order = param,
                                          seasonal_order = seasonal_para,
                                          enforce_stationarity = False,
                                          enforce_invertibility = False)
            results = mod.fit()
            if results.aic < best_aic:
                best_aic = results.aic
                best_param = param
                best_seasonal_param = seasonal_param
        except:
            continue
print(f"Best SARIMA Model: ARIMA{best_param}x{best_seasonal_param} - AIC:{best_aic}")
                

In [None]:
# 6. Fit thr SARIMA Model
model = sm.tsa.statespace.SARIMAX(monthly_sales["Sales"],
                                 order=best_param,
                                  seasonal_order=best_seasonal_param,
                                  enforce_stationarity=False,
                                  enforce_invertibility=False)
results = model.fit()
print(results.summary())

In [None]:
# 7. Make Forecast with SARIMA
forecast = results.get_forecast(steps = 12)
forecast_ci = forecast.conf_int()

In [None]:
# Plot the SARIMA forecast
ax = monthly_sales["Sales"].plot(label = "Observed", figsize = (12,6))
forecast.predicted_mean.plot(ax = ax, label = "SARIMA Forecast")
ax.fill_between(forecast_ci.index,
               forecast_ci.iloc[:, 0],
               forecast_ci.iloc[:, 1], color = "k", alpha = .25)
ax.set_xlabel("Date")
ax.set_ylabel("Sales")
plt.legend()
plt.show()

In [None]:
# Evaluate SARIMA Model
y_true = monthly_sales["Sales"][-12:]
y_pred = forecast.predicted_mean
mae = mean_absolute_error(y_true, y_pred)
rmse = mean_squared_error(y_true, y_pred, squared=False)
r2 = r2_score(y_true, y_pred)
print(f'SARIMA MAE: {mae}')
print(f'SARIMA RMSE: {rmse}')
print(f'SARIMA R-squared: {r2}')

In [None]:
# 8. Exponential Smoothing (ETS) Model
hw_model = ExponentialSmoothing(monthly_sales["Sales"],
                               seasonal = "add",
                               seasonal_periods = 12).fit()
hw_forecast = hw_model.forecast(steps = 12)

In [None]:
# Plot ETS Forecast
ax = monthly_sales["Sales"].plot(label='Observed', figsize=(12, 6))
hw_forecast.plot(ax=ax, label='ETS Forecast')
plt.legend()
plt.show()

In [None]:
# Evaluate ETS Model
mae_hw = mean_absolute_error(y_true, hw_forecast)
rmse_hw = mean_squared_error(y_true, hw_forecast, squared=False)
r2_hw = r2_score(y_true, hw_forecast)
print(f'ETS MAE: {mae_hw}')
print(f'ETS RMSE: {rmse_hw}')
print(f'ETS R-squared: {r2_hw}')


In [None]:
# Prophet Model
df_prophet = monthly_sales["Sales"].reset_index()
df_prophet.columns = ["ds", "y"]
prophet_model = Prophet()
prophet_model.fit(df_prophet)

future = prophet_model.make_future_dataframe(periods = 12, freq = "M")
forecast_prophet = prophet_model.predict(future)

In [None]:
# Plot the forecast using Prophet's plot function
fig = prophet_model.plot(forecast_prophet)

# Add xlabel and ylabel
fig.gca().set_xlabel("Date")
fig.gca().set_ylabel("Sales Forecast")

# Show the plot
plt.show()


In [None]:
y_pred_prophet = forecast_prophet['yhat'][-12:]
mae_prophet = mean_absolute_error(y_true, y_pred_prophet)
rmse_prophet = mean_squared_error(y_true, y_pred_prophet, squared=False)
r2_prophet = r2_score(y_true, y_pred_prophet)
print(f'Prophet MAE: {mae_prophet}')
print(f'Prophet RMSE: {rmse_prophet}')
print(f'Prophet R-squared: {r2_prophet}')

### **Forecast Summary and Model Selection**

**Model Performance Summary:**

1. **SARIMA Model:**
   - **MAE:** 55,052.68
   - **RMSE:** 59,769.34
   - **R-squared:** -5.5962
   - **Interpretation:** The SARIMA model shows high error values (MAE and RMSE), indicating poor accuracy in predicting sales. The negative R-squared value suggests that the model performs worse than a simple mean prediction, indicating it's not well-suited for this dataset.

---

2. **Exponential Smoothing (ETS) Model:**
   - **MAE:** 9,105.41
   - **RMSE:** 10,914.68
   - **R-squared:** 0.7800
   - **Interpretation:** The ETS model performs significantly better than SARIMA, with much lower error values. The positive R-squared value indicates that the model explains approximately 78% of the variance in the data, making it a good fit for the dataset.

---

3. **Prophet Model:**
   - **MAE:** 9,103.74
   - **RMSE:** 10,600.73
   - **R-squared:** 0.7925
   - **Interpretation:** The Prophet model has the lowest MAE and RMSE among the models, indicating the highest accuracy. The R-squared value of 0.7925 suggests that this model explains about 79.25% of the variance in the data, making it the best-performing model.

---

**Selection of the Best Model:**

- **Prophet** is the best-performing model with the lowest MAE (9,103.74) and RMSE (10,600.73), as well as the highest R-squared value (0.7925). This indicates that Prophet provides the most accurate forecast among the models tested.

---

**Forecast Analysis Conclusion:**

- The **Prophet model** is selected as the final model for sales forecasting due to its superior performance. It closely matches the actual sales data, offering a strong fit to the historical data and accurate future predictions.
- Businesses can rely on the Prophet model for making informed decisions about inventory management, marketing strategies, and financial planning, as it provides the most accurate and reliable sales forecasts.
- By using the Prophet model, the forecasting process will capture complex patterns in the sales data, leading to actionable insights and better strategic planning.


## THE END

In [None]:
# Save the DataFrame to a CSV file
df.to_csv('example_dataset.csv', index=False)

