# Descriptive Analytics and Data Preprocessing on Sales & Discounts Dataset.

## Introduction
- Objective: perform descriptive analytics, visualize data distributions, and preprocess the dataset for further analysis.


In [None]:
# Importing Libraries.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings

warnings.filterwarnings('ignore')
%matplotlib inline

## Descriptive Analytics for Numerical Columns

In [None]:
# Read the sales CSV into a DataFrame.
# The Date column is parsed to datetime; dayfirst=True because the file
# stores dates as DD-MM-YYYY.

df = pd.read_csv(
    "sales_data_with_discounts.csv",
    parse_dates=['Date'],
    dayfirst=True,
)
df.head()

In [None]:
df.shape # (rows, columns)

In [None]:
# Print a summary of the DataFrame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Remove exact duplicate rows (modifies df in place).
df.drop_duplicates(inplace=True)

# Strip surrounding whitespace from the column names.
df.columns = df.columns.str.strip()


def _strip_if_str(value):
    """Return ``value`` without surrounding whitespace if it is a string.

    Non-string values (numbers, NaN, ...) are returned unchanged.
    """
    return value.strip() if isinstance(value, str) else value


# Strip surrounding whitespace from the values of object-dtype columns.
# An element-wise map is used instead of ``.str.strip()`` because the ``.str``
# accessor replaces non-string entries with NaN, which would silently discard
# data in columns that hold mixed types.
obj_cols = df.select_dtypes(include='object').columns
df[obj_cols] = df[obj_cols].apply(lambda s: s.map(_strip_if_str))

In [None]:
# Identify the column types in the dataset.
# Numeric columns: every column with a numeric dtype.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Categorical columns: the remaining non-numeric columns. Datetime columns
# (such as the 'Date' column parsed when the CSV is loaded) are excluded as
# well, because a timestamp is not a category and would otherwise get a
# frequency bar chart with one bar per distinct date.
categorical_cols = df.select_dtypes(
    exclude=[np.number, 'datetime', 'datetimetz']
).columns.tolist()

In [None]:
# Statistical summary from describe(), transposed so that each described
# column becomes a row and each statistic a column.
df.describe().T

In [None]:
# Descriptive statistics.
def _first_mode(values):
    """Return the first mode of ``values``, or NaN when ``mode()`` is empty."""
    modes = values.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]


# One row per numeric column, one column per statistic.
desc_stats = pd.DataFrame({
    'Mean': df[numeric_cols].mean(),
    'Median': df[numeric_cols].median(),
    'Mode': df[numeric_cols].apply(_first_mode),
    'StdDev': df[numeric_cols].std(),
    'Skew': df[numeric_cols].skew(),
})
print('*** Descriptive Statistics (Numerical Columns) ***')
print(desc_stats)

## Data Visualization

### Histograms :

In [None]:
# Histograms:
'''Ploting histograms for each numerical column & Analyze the distribution '''


def _show_histogram(column_name):
    """Draw and display a 20-bin histogram of one column of ``df``.

    Missing values are dropped before plotting.
    """
    plt.figure(figsize=(8, 4))
    sns.histplot(df[column_name].dropna(), bins=20, kde=False, color='steelblue')
    plt.title(f'Histogram of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()


# Use the seaborn 'whitegrid' style for this and all later figures.
sns.set(style='whitegrid')
for col in numeric_cols:
    _show_histogram(col)

### Box Plots :

In [None]:
# Box Plots:
''' Creating boxplots for numerical variables to identify outliers and the IQR.'''


def _show_boxplots(columns):
    """Draw and display box plots of the given ``df`` columns on one figure."""
    # The figure gets 4 units of width per plotted column.
    plt.figure(figsize=(len(columns)*4, 6))
    df[columns].boxplot(rot=45, grid=True)
    plt.title('Boxplots of Numerical Columns')
    plt.tight_layout()
    plt.show()


_show_boxplots(numeric_cols)

### Bar Chart Analysis for Categorical Column:

In [None]:
# Bar Charts:
# One horizontal frequency chart per categorical column.
def _show_category_counts(column_name):
    """Draw and display a horizontal count plot of one column of ``df``.

    Bars follow the order returned by ``value_counts()`` for that column.
    """
    plt.figure(figsize=(10, 5))
    order = df[column_name].value_counts().index
    sns.countplot(y=column_name, data=df, order=order, palette='viridis')
    plt.title(f'Frequency of {column_name}')
    plt.xlabel('Count')
    plt.ylabel(column_name)
    plt.tight_layout()
    plt.show()


for col in categorical_cols:
    _show_category_counts(col)

## Conclusion

In [None]:
# Simple outlier handling (IQR capping) 
def iqr_cap(series, factor=1.5):
    """Cap the values of ``series`` at its IQR-based fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``series`` and
    ``IQR = Q3 - Q1``. Values outside the fences are replaced by the nearest
    fence; values inside are left as they are.

    Args:
        series: pandas Series of numeric values. It is not modified.
        factor: Multiplier applied to the IQR to position the fences.
            Defaults to 1.5.

    Returns:
        A new Series with the values clipped to the fences.
    """
    q1, q3 = series.quantile([0.25, 0.75])
    iqr = q3 - q1
    lower = q1 - factor * iqr
    upper = q3 + factor * iqr
    return series.clip(lower=lower, upper=upper)

# Apply iqr_cap to every numeric column and write the capped values back
# into df, overwriting the original numeric values.
df[numeric_cols] = df[numeric_cols].apply(iqr_cap)

##### The descriptive analysis provided a clear understanding of the dataset through key statistical measures such as mean, median, and standard deviation. Histograms showed that some numerical variables are right-skewed, indicating occasional high-value sales, while others are more evenly distributed. Box plots helped identify outliers and highlighted the spread of the data. Bar charts revealed differences in category frequencies, showing which categories contribute most to sales. Overall, the dataset is well-structured and suitable for further advanced analysis and modeling.
