<a href="https://colab.research.google.com/github/bhupeshks21/projects/blob/main/blinkit_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# @markdown Blinkit Analysis
# @markdown Importing libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# @title raw data
# Read the raw Blinkit CSV into the notebook-wide DataFrame `df`.
DATA_PATH = "/content/blinkit_data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# @title sample data
# Preview the first ten rows of the loaded data.
df.head(n=10)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# @title data field
# List the column labels available in the dataset.
df.columns

In [None]:
# Show the dtype of every column (read_csv was called without explicit
# dtypes, so these are the types pandas inferred).
df.dtypes

In [None]:
# @title cleaning
# List the distinct fat-content labels to spot inconsistent spellings.
fat_labels = df['Item Fat Content'].unique()
print(fat_labels)


In [None]:
# Map the abbreviated / lower-case spellings of the fat-content labels onto
# the canonical 'Low Fat' and 'Regular' labels used by the later cells.
df['Item Fat Content'] = df['Item Fat Content'].replace(
    {'LF': 'Low Fat', 'low fat': 'Low Fat', 'reg': 'Regular'}
)

# Report how many values are missing in each column before imputing anything.
print(df.isnull().sum())

# Impute missing 'Item Weight' with the mean weight of the same 'Item Type'.
# fillna() only touches the missing entries, so every recorded weight is kept
# untouched -- including rows whose 'Item Type' is itself missing, which the
# groupby leaves out of every group.
type_mean_weight = df.groupby('Item Type')['Item Weight'].transform('mean')
df['Item Weight'] = df['Item Weight'].fillna(type_mean_weight)
# Fallback: an item type with no recorded weight at all has a NaN group mean,
# so whatever is still missing gets the overall mean weight instead.
df['Item Weight'] = df['Item Weight'].fillna(df['Item Weight'].mean())

# New feature: outlet age in years (more meaningful than the establishment
# year). It is measured from the year in which the notebook is executed, so
# the values shift by one with every calendar year.
current_year = pd.Timestamp.now().year
df['Outlet_Age'] = current_year - df['Outlet Establishment Year']

# New feature: percentile rank of each item's visibility among the items of
# the same outlet (rank(pct=True)). This is a rank, not a ratio to the outlet
# average: a value close to 1 means the item is among the most visible
# products in its outlet.
df['Visibility_Rank'] = df.groupby('Outlet Identifier')['Item Visibility'].rank(pct=True)

In [None]:
# @title Business Requirements
sales = df['Sales']

# Headline KPIs for the whole dataset.
total_sales = sales.sum()          # total sales
avg_sales = sales.mean()           # average sales per record
no_of_items_sold = sales.count()   # number of non-null sales records
avg_ratings = df['Rating'].mean()  # average rating

for kpi_line in (
    f"Total Sales : ${total_sales:,.0f}",
    f"avg_sales  : {avg_sales:,.0f}",
    f"no_of_items_sold : {no_of_items_sold:,.0f}",
    f"avg_rating : {avg_ratings:,.1f}",
):
    print(kpi_line)

In [None]:
# @title Chart requirement

# Share of total sales contributed by each fat-content category.
sales_by_fat = df.groupby('Item Fat Content')['Sales'].sum()

fig, ax = plt.subplots()
ax.pie(sales_by_fat, labels=sales_by_fat.index, autopct='%.1f%%', startangle=90)
ax.set_title('sales by fat content')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# Total sales per item type, largest first.
sales_by_type = (
    df.groupby('Item Type')['Sales']
    .sum()
    .sort_values(ascending=False)
)

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(sales_by_type.index, sales_by_type.values)

ax.set_title('Total Sales by Item Type')
ax.set_xlabel('Item Type')
ax.set_ylabel('Total Sales')
ax.tick_params(axis='x', labelrotation=-90)  # vertical labels so the names do not overlap

# Write the rounded total just above the top of each bar.
for rect in bars:
    height = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, height,
            f'{height:.0f}', ha='center', va='bottom', fontsize=8)

fig.tight_layout()
plt.show()

In [None]:
# Total sales for every (outlet location tier, fat content) combination, with
# the fat-content levels spread out as columns.
grouped = (
    df.groupby(['Outlet Location Type', 'Item Fat Content'])['Sales']
    .sum()
    .unstack()
    .loc[:, ['Regular', 'Low Fat']]  # fixed column order: Regular first, then Low Fat
)

fig, ax = plt.subplots(figsize=(8, 5))
grouped.plot(kind='bar', ax=ax, title='Outlet Tier by Item Fat Content')
ax.set_xlabel('Outlet Location Tier')
ax.set_ylabel('Total Sales')
ax.legend(title='Item Fat Content')
fig.tight_layout()
plt.show()

In [None]:
# Total sales of the outlets grouped by their establishment year, in
# chronological order.
sales_by_year = (
    df.groupby('Outlet Establishment Year')['Sales']
    .sum()
    .sort_index()
)

fig, ax = plt.subplots(figsize=(9, 5))
ax.plot(sales_by_year.index, sales_by_year.values, marker='o', linestyle='-')
ax.set_xlabel('Outlet Establishment Year')
ax.set_ylabel('Total Sales')
ax.set_title('Outlet Establishment')

# Label every point with its total, centred above the marker.
for year, total in sales_by_year.items():
    ax.text(year, total, f'{total:,.0f}', ha='center', va='bottom', fontsize=8)

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the numerical features. 'Outlet_Age' is the
# engineered column added in the cleaning cell, so that cell must run first.
numerical_cols = ['Item Visibility', 'Item Weight', 'Outlet_Age', 'Sales', 'Rating']
corr_matrix = df.loc[:, numerical_cols].corr()

# Annotated heatmap with a diverging palette centred on zero correlation.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
from scipy import stats

# Hypothesis: do Low Fat and Regular products have significantly different sales?
# The labels compared here are the canonical ones produced by the cleaning cell.
# Missing sales values are dropped first: ttest_ind propagates NaN by default,
# so a single NaN would turn both the statistic and the p-value into NaN.
fat_content = df['Item Fat Content']
low_fat_sales = df.loc[fat_content == 'Low Fat', 'Sales'].dropna()
regular_sales = df.loc[fat_content == 'Regular', 'Sales'].dropna()

# NOTE(review): this is the pooled-variance (Student) test, scipy's default.
# If the two groups may have different variances, consider equal_var=False
# (Welch's t-test) -- confirm which one the analysis intends.
t_stat, p_value = stats.ttest_ind(low_fat_sales, regular_sales)
print(f"T-statistic: {t_stat:.4f}, P-value: {p_value:.4f}")

if np.isnan(p_value):
    # An empty or degenerate group yields NaN; `NaN < 0.05` is False, which
    # would otherwise be reported as "no difference".
    print("⚠️ The t-test is undefined for this data (p-value is NaN); no conclusion can be drawn.")
elif p_value < 0.05:
    print("✅ There is a statistically significant difference in sales between Low Fat and Regular products.")
else:
    print("❌ There is NO statistically significant difference in sales.")

In [None]:
# Spread of sales within each outlet type, shown as one box per type.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Outlet Type', y='Sales', ax=ax)
ax.set_title('Sales Distribution Across Different Outlet Types')
ax.tick_params(axis='x', labelrotation=45)  # slant the category labels
plt.show()

In [None]:
# Scatter of sales against outlet age with a fitted regression line (red).
# 'Outlet_Age' is the engineered column added in the cleaning cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=df,
    x='Outlet_Age',
    y='Sales',
    scatter_kws={'alpha': 0.4},   # semi-transparent points to reveal overlap
    line_kws={"color": "red"},
    ax=ax,
)
ax.set_title('Impact of Outlet Age on Sales')
ax.set_xlabel('Outlet Age (Years)')
ax.set_ylabel('Sales')
plt.show()

In [None]:
# Mean sales for every (outlet location tier, outlet size) combination:
# tiers as rows, sizes as columns.
pivot_table = df.pivot_table(
    values='Sales',
    index='Outlet Location Type',
    columns='Outlet Size',
    aggfunc='mean',
)

# Annotated heatmap of the averages, one decimal place per cell.
fig, ax = plt.subplots(figsize=(8, 5))
sns.heatmap(
    pivot_table,
    annot=True,
    fmt='.1f',
    cmap='YlGnBu',
    cbar_kws={'label': 'Average Sales'},
    ax=ax,
)
ax.set_title('Average Sales: Tier vs. Outlet Size')
fig.tight_layout()
plt.show()