# In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the dataset
# Read the sales CSV into a DataFrame; every later step works from `df`.
df = pd.read_csv('/content/sales_data_with_discounts.csv')

# Step 2: Identify Numerical and Categorical Columns
# Split the column labels by dtype: numeric dtypes on one side, object dtype
# (typically the text columns of a CSV) on the other.
numerical_cols = df.select_dtypes(include='number').columns
categorical_cols = df.select_dtypes(include='object').columns

# Step 3: Descriptive Analytics for Numerical Columns
# For each numeric column, report central tendency (mean, median, mode) and
# spread (standard deviation as computed by Series.std()).
print("Descriptive Analytics for Numerical Columns:")
for col in numerical_cols:
    series = df[col]
    # mode() ignores NaN and returns an empty Series for an all-NaN column, so
    # indexing [0] unconditionally would raise; fall back to NaN in that case.
    # With ties, the first entry is kept, exactly as mode()[0] did.
    modes = series.mode()
    mode_value = modes.iloc[0] if not modes.empty else np.nan
    print(f"\nStatistics for {col}:")
    print(f"Mean: {series.mean()}")
    print(f"Median: {series.median()}")
    print(f"Mode: {mode_value}")
    print(f"Standard Deviation: {series.std()}")
    print("\n")

# Step 4: Data Visualization
# Histograms for numerical columns
# A single figure holding one histogram panel per numeric column.
print("Histograms for Numerical Columns:")
df.hist(column=list(numerical_cols), bins=15, figsize=(15, 10))
plt.suptitle("Histograms for Numerical Columns")
plt.show()

# Boxplots for numerical columns
# Each numeric column gets its own figure with a single horizontal boxplot.
print("Boxplots for Numerical Columns:")
for col in numerical_cols:
    fig, ax = plt.subplots()
    sns.boxplot(x=df[col], ax=ax)
    ax.set_title(f'Boxplot for {col}')
    plt.show()

# Bar Charts for Categorical Columns
# Plot how often each category occurs, one chart per categorical column.
print("Bar Charts for Categorical Columns:")
for col in categorical_cols:
    # Start a fresh figure per column (as the boxplot loop does). Without it,
    # Series.plot draws on the current axes, so charts pile up on one figure
    # whenever plt.show() does not close it (e.g. non-interactive backends).
    plt.figure()
    df[col].value_counts().plot(kind='bar')
    plt.title(f'Distribution of {col}')
    plt.show()

# Step 5: Standardization of Numerical Variables
# Z-score each numeric column: (value - mean) / standard deviation.
# Work on a copy so `df` keeps the original values for the later steps.
standardized_df = df.copy()
for col in numerical_cols:
    col_std = df[col].std()
    # A constant column has a standard deviation of 0; dividing by it would
    # turn every value into NaN (0/0). Use a scale of 1 instead so such a
    # column standardizes to all zeros. A NaN std (e.g. a single-row frame)
    # is left untouched and still propagates NaN.
    if col_std == 0:
        col_std = 1.0
    standardized_df[col] = (df[col] - df[col].mean()) / col_std

# Compare distributions before and after standardization
# Draw the same histogram grid for the raw and the standardized frame, in
# that order, each on its own figure with its own title.
print("Distribution Before and After Standardization:")
for stage_title, stage_df in (
    ("Before Standardization", df),
    ("After Standardization", standardized_df),
):
    stage_df[numerical_cols].hist(bins=15, figsize=(15, 10))
    plt.suptitle(stage_title)
    plt.show()

# Step 6: Conversion of Categorical Data into Dummy Variables
# One-hot encode every categorical column; the remaining columns are kept
# unchanged in the result.
print("Conversion of Categorical Data into Dummy Variables:")
# pandas >= 2.0 emits boolean dummy columns by default; request integers so
# the indicators are 0/1 numbers regardless of the installed pandas version.
df_encoded = pd.get_dummies(df, columns=categorical_cols, dtype=int)

# Display part of the transformed dataset
print("Transformed Dataset with Dummy Variables:")
print(df_encoded.head())

# Step 7: Conclusion
# Closing summary of the analysis, emitted one statement per output line
# (the heading is preceded by a blank line).
conclusion_lines = (
    "\nConclusion:",
    "1. Key statistical measures (mean, median, mode, standard deviation) provided insight into the central tendencies and variability of the dataset's numerical columns.",
    "2. Histograms and boxplots helped in identifying skewness and outliers in the data, which are critical for understanding the data's distribution.",
    "3. Bar charts revealed the distribution of categorical variables, helping to understand the frequency of each category.",
    "4. Standardization ensured that numerical variables were on the same scale, improving the dataset's suitability for analytical models.",
    "5. One-hot encoding transformed categorical variables into a numerical format that is compatible with machine learning algorithms.",
    "These preprocessing steps are essential for ensuring that the dataset is ready for further analysis and machine learning tasks.",
)
print("\n".join(conclusion_lines))
