In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Location of the dataset CSV inside the Kaggle input directory
file_path = '/kaggle/input/ecommerce-dataset-for-data-analysis/project1_df.csv'

# Read the CSV into the DataFrame the rest of the notebook works on
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows (rendered as this cell's output)
data.head(n=5)

In [None]:
# Data-quality overview: missing values, column dtypes and summary statistics.

# Parse 'Purchase Date' (day/month/year hour:minute:second) BEFORE taking the
# snapshots below, so the reported dtypes and the summary statistics describe
# the same state of the frame. Previously the dtypes were captured before the
# conversion and the statistics after it.
# No `errors=` argument is passed, so a value that does not match the format
# raises here instead of being silently turned into NaT.
data['Purchase Date'] = pd.to_datetime(data['Purchase Date'], format='%d/%m/%Y %H:%M:%S')

missing_values = data.isnull().sum()  # null count per column
data_types = data.dtypes              # dtype per column, after the date conversion
summary_stats = data.describe()

# Display missing values, data types, and summary statistics
print("Missing Values:\n", missing_values)
print("\nData Types:\n", data_types)
print("\nSummary Statistics:\n", summary_stats)

In [None]:
# Step 1: Visualizing distributions of numerical features
# The cell used to create an empty figure and draw nothing. It now draws one
# histogram per numeric column. DataFrame.hist builds its own figure and grid of
# axes, so no separate plt.figure() call is needed.
# NOTE(review): every numeric column is included (identifier-like columns too,
# if the dataset has any) - narrow the selection if that is not wanted.
numeric_features = data.select_dtypes(include='number')
numeric_features.hist(figsize=(10, 6), bins=30)
plt.suptitle('Distributions of Numerical Features')
plt.tight_layout()
plt.show()

In [None]:
def plot_category_counts(frame, x, title, palette, hue=None, rotate_labels=True):
    """Draw one seaborn count plot on a fresh 10x6 figure and show it.

    Args:
        frame: DataFrame holding the columns to count.
        x: Column whose values are counted along the x axis.
        title: Figure title.
        palette: Name of the seaborn palette used for the bars.
        hue: Optional column used to split each bar into groups.
        rotate_labels: When True, rotate the x tick labels by 45 degrees.
    """
    plt.figure(figsize=(10, 6))
    sns.countplot(data=frame, x=x, hue=hue, palette=palette)
    plt.title(title)
    if rotate_labels:
        plt.xticks(rotation=45)
    plt.show()


# One entry per figure, in display order:
# (x column, hue column or None, palette, title, rotate x tick labels?)
count_plot_specs = [
    # Single-column counts
    ('Product Category', None, 'Set3', 'Count of Purchases by Product Category', True),
    ('Age Group', None, 'Set2', 'Count of Purchases by Age Group', True),
    ('Gender', None, 'Set1', 'Count of Purchases by Gender', False),
    ('Purchase Method', None, 'Set2', 'Count of Purchases by Purchase Method', True),
    # Product Category split by a second column (one figure per split)
    ('Product Category', 'Age Group', 'Set3', 'Count of Purchases by Product Category and Age Group', True),
    ('Product Category', 'Gender', 'Set2', 'Count of Purchases by Product Category and Gender', True),
    ('Product Category', 'Purchase Method', 'Set1', 'Count of Purchases by Product Category and Purchase Method', True),
]

for x_col, hue_col, palette_name, plot_title, rotate in count_plot_specs:
    plot_category_counts(data, x_col, plot_title, palette_name, hue=hue_col, rotate_labels=rotate)

In [None]:
# Numeric columns whose pairwise correlations are compared
numerical_cols = ['Discount Amount (INR)', 'Gross Amount', 'Net Amount']

# Pairwise correlation (DataFrame.corr with its default method) over those columns only
correlation_matrix = data.loc[:, numerical_cols].corr()

# Annotated heatmap with the colour scale pinned to the full [-1, 1] range
corr_fig, corr_ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Work on a copy so the label encoding below never touches `data`
encoded_data = data.copy()

# Step 1: Encode categorical columns as integer codes.
# A single LabelEncoder is re-fitted for every column, so after the loop it only
# holds the classes of the last column encoded.
# NOTE(review): the integer codes impose an arbitrary ordering on what look like
# nominal categories - read correlations involving these columns with care.
label_encoder = LabelEncoder()

# List of categorical columns to encode
categorical_cols = ['Gender', 'Age Group', 'Product Category', 'Purchase Method', 'Discount Availed']

# Apply label encoding to each categorical column
for col in categorical_cols:
    encoded_data[col] = label_encoder.fit_transform(encoded_data[col])

# Step 2: Drop the columns that are not encoded, then keep numeric columns only.
# The dtype filter matters when this cell is re-run after a later cell has added
# text columns to `data` (for example 'Day of Week'): DataFrame.corr() raises on
# non-numeric columns in pandas >= 2.0. On a fresh top-to-bottom run the filter
# changes nothing as long as the three dropped columns are the only non-numeric ones.
encoded_data = (
    encoded_data
    .drop(columns=['Purchase Date', 'Location', 'Discount Name'])
    .select_dtypes(include='number')
)

# Step 3: Recalculate the correlation matrix
correlation_matrix_expanded = encoded_data.corr()

# Step 4: Plot the correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix_expanded, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Expanded Correlation Matrix (with Encoded Categorical Fields)')
plt.show()

In [None]:
# Step 1: Ensure 'Purchase Date' is converted to datetime.
# errors='coerce' turns any value that does not match the format into NaT
# instead of raising.
data['Purchase Date'] = pd.to_datetime(data['Purchase Date'], format='%d/%m/%Y %H:%M:%S', errors='coerce')

# Step 2: Extract time components from the 'Purchase Date'
data['Purchase Year'] = data['Purchase Date'].dt.year
data['Purchase Month'] = data['Purchase Date'].dt.month
data['Day of Week'] = data['Purchase Date'].dt.day_name()

# Calendar order for the weekday axis. Without an explicit `order`, countplot
# infers the category order from the data, which is not guaranteed to run
# Monday to Sunday.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Step 3: Analyze trends by plotting counts of Product Category across time components.
# The original 'Product Category' column from `data` is used as the hue so that
# category names (not encoded integers) appear in the legend.
# One entry per figure: (x column, palette, title, explicit x order or None)
time_plot_specs = [
    ('Purchase Year', 'Set3', 'Product Category Purchases by Year', None),
    ('Purchase Month', 'Set2', 'Product Category Purchases by Month', None),
    ('Day of Week', 'Set1', 'Product Category Purchases by Day of the Week', weekday_order),
]

for time_col, palette_name, plot_title, category_order in time_plot_specs:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=data, x=time_col, hue='Product Category', palette=palette_name, order=category_order)
    plt.title(plot_title)
    plt.xticks(rotation=45)
    plt.show()

In [None]:


# Step 1: Flag each transaction: 1 when 'Discount Availed' equals 'Yes', otherwise 0.
# This is a vectorised comparison; it yields the same values as a per-row lambda
# without calling Python once per row.
data['Discount Availed Binary'] = data['Discount Availed'].eq('Yes').astype(int)

# Step 2: The mean of the 0/1 flag per Product Category is the share of
# transactions in that category that used a discount.
discount_by_category = data.groupby('Product Category')['Discount Availed Binary'].mean().reset_index()

# Convert the share (0-1) to a percentage (0-100)
discount_by_category['Discount Percentage'] = discount_by_category['Discount Availed Binary'] * 100

# Step 3: Plot the percentage of discounts availed by product category
plt.figure(figsize=(10, 6))
sns.barplot(x='Product Category', y='Discount Percentage', data=discount_by_category, palette='Set2')
plt.title('Percentage of Transactions with Discounts by Product Category')
plt.ylabel('Percentage of Discounts (%)')
plt.xticks(rotation=45)
plt.show()