In [None]:
# Import required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Load the dataset

In [None]:
# Load the BigMart sales CSV into a DataFrame (the path is relative to the working directory)
df = pd.read_csv("BigMart_sales.csv")

In [None]:
# Preview the first rows to see what data is available in the dataset
df.head()

## Exploratory Data Analysis

### Check shape of the dataset:

* It is a good idea to first check the shape of the dataset.

In [None]:
# Print the shape of the DataFrame as a (rows, columns) tuple
print('The shape of the dataset : ', df.shape)

Now, we can see that the dataset contains 8523 rows and 12 columns.

### Summary of dataset

In [None]:
# Concise summary of the dataset: non-null count and data type of each column
df.info()

### Check the data types of columns
* The above df.info() command gives us the number of non-null values along with the data types of the columns.

* If we simply want to check the data types of all the columns, we can use the following command.

In [None]:
# Data type of every column in the DataFrame
df.dtypes

### Statistical properties of dataset

In [None]:
# Summary statistics of the numerical columns (the default behaviour of describe())
df.describe()

**Important points to note**
1. The above command df.describe() helps us to view the statistical properties of numerical variables. It excludes character variables.

2. If we want to view the statistical properties of character variables, we should run the following command -

`df.describe(include=['object'])`

3. If we want to view the statistical properties of all the variables, we should run the following command -

`df.describe(include='all')`

In [None]:
# Summary statistics of all columns, numerical and character alike
df.describe(include='all')

### View column names

In [None]:
# List the column names of the DataFrame
df.columns

Here, the target variable is Item_Outlet_Sales.

In [None]:
# Count the rows that are exact duplicates of an earlier row
df.duplicated().sum()

### Missing values

In [None]:
# Number of missing (null) values in each column
df.isnull().sum()

In [None]:
# Per-column share of missing values, as a percentage of the total row count
missing_percentage = df.isnull().sum().div(len(df)).mul(100)

# Show the percentage for every column
print(missing_percentage)

In [None]:
# Histogram of Item_Weight (30 bins) with a KDE curve overlaid, drawn on an explicit Axes
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, x='Item_Weight', kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Item Weight')
ax.set_xlabel('Item Weight')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Distribution of Outlet_Size: one bar per category, bar height = number of rows.
# The column is passed explicitly as `x`. From seaborn 0.12 onward the only
# positional parameter of countplot is `data`, so a bare positional Series is
# no longer interpreted as the x variable.
plt.figure(figsize=(6, 4))
sns.countplot(x=df['Outlet_Size'])
plt.title('Distribution of Item Outlet Size')
plt.xlabel('Outlet_Size')
plt.ylabel('Frequency')
plt.show()

Here, the data is uniformly distributed. Item_Weight (numerical) has 1463 missing values and Outlet_Size (categorical) has 2410. Directly imputing this many values with the mean or median for the numerical column, or with the mode for the categorical column, would introduce bias into the dataset.

One potential solution is to use a more advanced imputation technique such as K-nearest neighbors (KNN) imputation. With KNN imputation, missing values are filled based on the values of the nearest neighbors in the feature space.

You will learn these topics in future classes. For now, we will move on to the next steps of the EDA.

## Univariate Analysis

### Distribution of Target Variable

In [None]:
# Histogram of the target variable Item_Outlet_Sales (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(data=df, x='Item_Outlet_Sales', kde=True, bins=30, ax=ax)
ax.set_title('Distribution of Item Outlet Sales')
ax.set_xlabel('Item_Outlet_Sales')
ax.set_ylabel('Frequency')
plt.show()

You can observe that the above distribution is right-skewed.

Now, try to find the number of distinct values in all the columns.

In [None]:
# Number of distinct values in each column
df.nunique()

In [None]:
# Print the frequency table of every column, one column after another
for column_name, column_values in df.items():
    print(column_values.value_counts())

Something looks odd in Item_Fat_Content. Explore the Item_Fat_Content column separately.

In [None]:
# Frequency of each label in the Item_Fat_Content column
df['Item_Fat_Content'].value_counts()

Correct the inconsistent labels of the Low Fat and Regular categories.

In [None]:
# Normalise the inconsistent labels: 'LF' and 'low fat' become 'Low Fat', 'reg' becomes 'Regular'
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(
    {
        'LF': 'Low Fat',
        'low fat': 'Low Fat',
        'reg': 'Regular',
    }
)

In [None]:
# Re-check the Item_Fat_Content label frequencies after the replacement
df['Item_Fat_Content'].value_counts()

### Categorical Variable Analysis

In [None]:
# Categorical columns to analyse; this list is reused by the later plotting cells
categorical_columns = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

# One count plot per categorical column, each on its own figure
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f'Count Plot of {col}')
    # Rotate the category labels so long names do not overlap
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()


## Bivariate Analysis

### Correlation Heatmap

In [None]:
# Keep only the numeric columns for the correlation computation.
# 'number' selects every numeric dtype rather than only float64/int64, so
# numeric columns stored with another width (e.g. int32, float32) are not
# silently left out of the correlation matrix.
numeric_df = df.select_dtypes(include='number')

# Pairwise correlation between the numeric columns
correlation_matrix = numeric_df.corr()

# Heatmap of the correlation matrix with the coefficient written in each cell
plt.figure(figsize=(6, 6))
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Heatmap')
plt.show()


### Item_Weight vs Item_Outlet_Sales

In [None]:
# Scatter plot of Item_Weight (x) against Item_Outlet_Sales (y)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='Item_Weight', y='Item_Outlet_Sales', ax=ax)
ax.set_title('Item Weight vs Item Outlet Sales')
ax.set_xlabel('Item_Weight')
ax.set_ylabel('Item_Outlet_Sales')
plt.show()

### Item_MRP vs Item_Outlet_Sales

In [None]:
# Scatter plot of Item_MRP (x) against Item_Outlet_Sales (y)
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(data=df, x='Item_MRP', y='Item_Outlet_Sales', ax=ax)
ax.set_title('Item MRP vs Item Outlet Sales')
ax.set_xlabel('Item_MRP')
ax.set_ylabel('Item_Outlet_Sales')
plt.show()

### Categorical Variables vs Item_Outlet_Sales

In [None]:
# One box plot per categorical column, showing the spread of Item_Outlet_Sales within each category
for col in categorical_columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x=col, y='Item_Outlet_Sales', ax=ax)
    ax.set_title(f'{col} vs Item Outlet Sales')
    # Rotate the category labels so long names do not overlap
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()

## Multivariate Analysis

### Pairplot

In [None]:
# Pairwise relationships between the selected numerical features and the target
selected_features = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Item_Outlet_Sales',
]
sns.pairplot(df.loc[:, selected_features])
plt.show()


### Multivariate Analysis with Categorical Variables

In [None]:
# For each categorical column, compute the mean Item_Outlet_Sales per category and plot it as bars
for col in categorical_columns:
    grouped_data = (
        df.groupby(col)['Item_Outlet_Sales']
        .agg('mean')
        .reset_index()
    )
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.barplot(data=grouped_data, x=col, y='Item_Outlet_Sales', ax=ax)
    ax.set_title(f'Mean Item Outlet Sales by {col}')
    # Rotate the category labels so long names do not overlap
    ax.tick_params(axis='x', labelrotation=90)
    plt.show()
