<a href="https://colab.research.google.com/github/abdo180/Sales-Data-Analysis-and-Forecasting/blob/main/Sales_Data_Analysis_and_Forecasting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install necessary libraries (you may already have some of these)
!pip install pandas numpy matplotlib seaborn mlxtend statsmodels pmdarima

# Import the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mlxtend.frequent_patterns import apriori, association_rules
from statsmodels.tsa.seasonal import seasonal_decompose
from pmdarima import auto_arima
from sklearn.metrics import mean_absolute_error, mean_squared_error


In [None]:
# Location of the raw transactions CSV (Colab default upload directory).
csv_path = '/content/data.csv'  # Adjust path if needed

# Read the file, decoding it as ISO-8859-1.
df = pd.read_csv(csv_path, encoding='ISO-8859-1')

# Preview the first rows to confirm the columns were parsed as expected.
df.head()


In [None]:
# Report missing values per column. This is printed explicitly because a bare
# expression is only rendered when it is the last statement of a cell.
print(df.isnull().sum())

# Build the cleaned frame in a single chain so no step assigns into a
# filtered slice (which can raise SettingWithCopyWarning):
#   1. drop rows without a CustomerID (needed for the customer-level analysis)
#   2. parse InvoiceDate into datetime64
#   3. keep only positive quantities (non-positive rows are returns)
#   4. drop exact duplicate rows
df = (
    df.dropna(subset=['CustomerID'])
      .assign(InvoiceDate=lambda d: pd.to_datetime(d['InvoiceDate']))
      .loc[lambda d: d['Quantity'] > 0]
      .drop_duplicates()
)


In [None]:
# Revenue of each line item: quantity times unit price.
df['TotalSales'] = df['Quantity'].mul(df['UnitPrice'])

# Tag every row with its calendar month, then total the revenue per month.
df['YearMonth'] = df['InvoiceDate'].dt.to_period('M')
sales_by_month = df.groupby('YearMonth')['TotalSales']
monthly_sales = sales_by_month.sum()

# Line chart of the monthly totals, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 6))
monthly_sales.plot(ax=ax)
ax.set_title('Total Monthly Sales Over Time')
ax.set_xlabel('Year-Month')
ax.set_ylabel('Total Sales')
ax.grid(True)
plt.show()


In [None]:
# Total units sold per product description, largest first.
units_per_product = df.groupby('Description')['Quantity'].sum()
product_sales = units_per_product.sort_values(ascending=False)

# The ten best sellers are the first ten entries of the sorted series.
top_products = product_sales.iloc[:10]

# Bar chart of the top ten, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(10, 5))
top_products.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Best-Selling Products')
ax.set_xlabel('Product')
ax.set_ylabel('Quantity Sold')
ax.tick_params(axis='x', labelrotation=90)
plt.show()


In [None]:
import datetime as dt

# Reference date for recency: one day after the latest invoice in the data,
# so the most recent customer has a recency of at least 1 day.
reference_date = df['InvoiceDate'].max() + dt.timedelta(days=1)

# RFM metrics per customer, using named aggregation so each output column is
# labelled where it is computed rather than renamed positionally afterwards:
#   Recency   - days between the reference date and the customer's last invoice
#   Frequency - number of distinct invoices
#   Monetary  - total revenue
rfm = df.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (reference_date - dates.max()).days),
    Frequency=('InvoiceNo', 'nunique'),
    Monetary=('TotalSales', 'sum'),
)

# Build the "R-F-M" label column-wise. A row-wise apply(axis=1) coerces each
# row to one common dtype, which turns the integer Recency/Frequency values
# into floats (e.g. "326.0-1.0-...") whenever Monetary is a float column.
rfm['RFM_Segment'] = (
    rfm['Recency'].astype(str)
    + '-' + rfm['Frequency'].astype(str)
    + '-' + rfm['Monetary'].astype(str)
)
rfm.head()


In [None]:
# Prepare data for market basket analysis: one row per invoice, one column per
# product, True when the invoice contains that product.
# The vectorised `> 0` comparison replaces DataFrame.applymap, which is
# deprecated in recent pandas and calls a Python function once per cell; it
# also yields the boolean frame that mlxtend's apriori expects.
basket = (
    df.groupby(['InvoiceNo', 'Description'])['Quantity']
      .sum()
      .unstack(fill_value=0)
      .gt(0)
)

# Apply the Apriori algorithm: keep itemsets present in at least 2% of invoices.
frequent_itemsets = apriori(basket, min_support=0.02, use_colnames=True)

# Generate association rules with confidence of at least 0.5 and show the ten
# rules with the highest lift.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)
rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']].sort_values(by='lift', ascending=False).head(10)


In [None]:
# Quick sanity check of the monthly series before modelling:
# a preview, its length, and how many months have no value.
preview = monthly_sales.head()
series_shape = monthly_sales.shape
missing_count = monthly_sales.isnull().sum()

print("Monthly sales data:")
print(preview)
print("Data shape:", series_shape)
print("Missing values:", missing_count)



In [None]:
# Keep only the months that have a sales value (no-op when nothing is missing).
monthly_sales = monthly_sales.dropna()


In [None]:
from pmdarima import auto_arima

# Seasonal period for monthly data (12 months per year).
SEASONAL_PERIOD = 12

# Fit an Auto ARIMA model on the cleaned monthly_sales data.
# pmdarima's seasonal period argument is `m`; `season_length` is not an
# auto_arima parameter, so the original call left `m` at its default.
# Seasonal terms are only requested when at least two full cycles are
# available; shorter histories fall back to a non-seasonal model.
use_seasonal = len(monthly_sales) >= 2 * SEASONAL_PERIOD
model = auto_arima(
    monthly_sales,
    seasonal=use_seasonal,
    m=SEASONAL_PERIOD if use_seasonal else 1,
    stepwise=True,
    suppress_warnings=True,
)

# Forecast for the next 12 months
forecast_period = 12
forecast = model.predict(n_periods=forecast_period)

# Historical x-axis as timestamps: matplotlib cannot place pandas Period
# values on an axis directly, so a PeriodIndex is converted to period starts.
history_index = monthly_sales.index
if isinstance(history_index, pd.PeriodIndex):
    history_index = history_index.to_timestamp()

# Forecast index: the 12 month-starts following the last observed month.
# (Adding MonthBegin to a monthly Period is an incompatible-frequency
# operation, so the offset is applied to the timestamp instead.)
forecast_index = pd.date_range(
    start=history_index[-1] + pd.offsets.MonthBegin(1),
    periods=forecast_period,
    freq='MS',
)

# np.asarray drops any index attached to the forecast; otherwise pandas would
# align on the mismatching index/column and fill the frame with NaN.
forecast_df = pd.DataFrame({'Forecast': np.asarray(forecast)}, index=forecast_index)

# Plot the historical and forecasted sales
plt.figure(figsize=(12, 6))
plt.plot(history_index, monthly_sales.to_numpy(), label='Historical Sales')
plt.plot(forecast_df.index, forecast_df['Forecast'], label='Forecasted Sales', linestyle='--')
plt.title('Sales Forecast for the Next 12 Months')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
# Score the fitted ARIMA model against the data it was trained on.
# These are in-sample errors, not an out-of-sample evaluation.
predicted = model.predict_in_sample()

mae = mean_absolute_error(monthly_sales, predicted)
mse = mean_squared_error(monthly_sales, predicted)
rmse = np.sqrt(mse)  # RMSE is the square root of the mean squared error

for report_line in (
    f"Mean Absolute Error: {mae}",
    f"Root Mean Squared Error: {rmse}",
):
    print(report_line)


In [None]:
# Every result to persist, as (object, destination path, write the index?).
# The index is kept only where it carries data: the month for monthly_sales
# and the CustomerID for rfm.
export_jobs = [
    # Cleaned transaction data
    (df, '/content/cleaned_data.csv', False),
    # Monthly sales totals
    (monthly_sales, '/content/monthly_sales.csv', True),
    # RFM segments per customer
    (rfm, '/content/rfm_segments.csv', True),
    # Market basket rules (selected columns only)
    (
        rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']],
        '/content/market_basket_rules.csv',
        False,
    ),
]

for table, destination, keep_index in export_jobs:
    table.to_csv(destination, index=keep_index)


In [None]:
!pip install plotly
import plotly.express as px

# Example: Plot interactive monthly sales using Plotly
fig = px.line(monthly_sales, x=monthly_sales.index, y='TotalSales', title='Interactive Monthly Sales')
fig.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Standardise the three RFM metrics so each contributes on a comparable scale
# to the Euclidean distances used by K-Means.
rfm_features = rfm[['Recency', 'Frequency', 'Monetary']]
scaler = StandardScaler()
rfm_normalized = scaler.fit_transform(rfm_features)

# Run K-Means clustering with 4 clusters.
# n_init is set explicitly because its default differs between scikit-learn
# releases; together with random_state this keeps the result reproducible.
kmeans = KMeans(n_clusters=4, n_init=10, random_state=0)
rfm['Cluster'] = kmeans.fit_predict(rfm_normalized)

# Visualize clusters. The label is cast to string for plotting only, so Plotly
# draws four discrete groups instead of a continuous colour scale; the stored
# rfm['Cluster'] column stays an integer.
cluster_order = [str(label) for label in range(kmeans.n_clusters)]
fig = px.scatter(
    rfm.assign(Cluster=rfm['Cluster'].astype(str)),
    x='Recency',
    y='Monetary',
    color='Cluster',
    category_orders={'Cluster': cluster_order},
    title='Customer Segmentation Clusters',
)
fig.show()


In [None]:
!pip install prophet
from prophet import Prophet

# Prepare data for Prophet
prophet_df = monthly_sales.reset_index()
prophet_df.columns = ['ds', 'y']

# Initialize and fit the model
prophet_model = Prophet(yearly_seasonality=True, weekly_seasonality=False)
prophet_model.fit(prophet_df)

# Make future predictions
future = prophet_model.make_future_dataframe(periods=12, freq='M')
forecast = prophet_model.predict(future)

# Plot forecast
prophet_model.plot(forecast)


In [None]:
# Forecast for the next 12 months
forecast_period = 12
forecast = model.predict(n_periods=forecast_period)

# Last observed month as a timestamp. A monthly Period cannot be combined with
# a MonthBegin offset, so it is converted to the start of its month first.
last_observed = monthly_sales.index[-1]
if isinstance(last_observed, pd.Period):
    last_observed = last_observed.to_timestamp()

# Forecast index: the 12 month-starts following the last observed month.
forecast_index = pd.date_range(
    start=last_observed + pd.offsets.MonthBegin(1),
    periods=forecast_period,
    freq='MS',
)

# np.asarray drops any index attached to the forecast; otherwise pandas would
# align on the mismatching index/column and fill the frame with NaN.
forecast_df = pd.DataFrame({'Forecast': np.asarray(forecast)}, index=forecast_index)

# Display forecasted values
print(forecast_df)


In [None]:
import matplotlib.pyplot as plt


def _as_timestamps(index):
    """Return `index` as timestamps; a PeriodIndex becomes its period starts."""
    if isinstance(index, pd.PeriodIndex):
        return index.to_timestamp()
    return index


# matplotlib cannot place pandas Period values on an axis directly, so both
# series are plotted against timestamp x-values.
history_x = _as_timestamps(monthly_sales.index)
forecast_x = _as_timestamps(forecast_df.index)

# Plot the historical and forecasted sales
plt.figure(figsize=(12, 6))
plt.plot(history_x, monthly_sales.to_numpy(), label='Historical Sales')
plt.plot(forecast_x, forecast_df['Forecast'].to_numpy(), label='Forecasted Sales', linestyle='--')
plt.title('Sales Forecast for the Next 12 Months')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Fitted values of the ARIMA model over the historical (training) period.
predicted = model.predict_in_sample()

# Error metrics of the fitted values against the observed monthly sales.
# MAE is the average absolute miss; RMSE is the square root of the mean
# squared miss and penalises large errors more heavily.
squared_error_mean = mean_squared_error(monthly_sales, predicted)
rmse = np.sqrt(squared_error_mean)
mae = mean_absolute_error(monthly_sales, predicted)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Root Mean Squared Error (RMSE): {rmse}")


In [None]:

from pathlib import Path

from google.colab import files

# NOTE(review): nothing in this notebook writes this file, and the doubled
# ".ipynb.ipynb" extension looks unintentional - confirm the intended path.
notebook_path = Path('Untitled29.ipynb.ipynb')

# Only trigger the browser download when the file exists, so a full
# "Run all" does not end with FileNotFoundError.
if notebook_path.exists():
    files.download(str(notebook_path))
else:
    print(f"Skipping download: '{notebook_path}' was not found in {Path.cwd()}")
