### Importing Required Libraries:

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

### Importing the dataset:

In [None]:
# Read the raw sales export; on_bad_lines='skip' makes pandas drop rows it
# cannot parse instead of raising.
raw_csv_path = 'da.csv'
sales_data = pd.read_csv(raw_csv_path, on_bad_lines='skip')
sales_data

In [None]:
# Dimensions of the raw frame as a (rows, columns) tuple
sales_data.shape

In [None]:
# Column labels of the raw frame
sales_data.columns

In [None]:
# Per-column dtype and non-null count summary of the raw frame
sales_data.info()

In [None]:
# Missing values per column of the raw frame (isna is the same check as isnull)
sales_data.isna().sum()

In [None]:
# Work on an independent deep copy so the raw frame stays untouched
sales_data01 = sales_data.copy(deep=True)

In [None]:
# Column labels of the working copy
sales_data01.columns

In [None]:
# Preview the 'Item Class' column
sales_data01['Item Class']

In [None]:
# Number of rows per distinct 'Item Class' value (most frequent first)
sales_data01['Item Class'].value_counts()

In [None]:
# Drop, in place, every row that is missing a discount amount, a sales price
# or an item number
required_fields = ['Discount Amount', 'Sales Price', 'Item Number']
sales_data01.dropna(subset=required_fields, inplace=True)

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the
# numeric columns after the null rows were dropped
sales_data01.describe()

In [None]:
# Parse 'Invoice Date' as datetime. errors='coerce' turns values that cannot be
# parsed into NaT instead of raising, so report how many rows ended up without
# a date rather than hiding them (they also get NaN year/month/quarter/day).
sales_data01['Invoice Date'] = pd.to_datetime(sales_data01['Invoice Date'], errors='coerce')
rows_without_date = int(sales_data01['Invoice Date'].isna().sum())
if rows_without_date:
    print(f"Warning: {rows_without_date} rows have a missing or unparseable 'Invoice Date'")

# Derive the calendar parts used as grouping keys later in the notebook
sales_data01['Invoice_Year'] = sales_data01['Invoice Date'].dt.year
sales_data01['Invoice_Month'] = sales_data01['Invoice Date'].dt.month
sales_data01['Invoice_Quarter'] = sales_data01['Invoice Date'].dt.quarter
sales_data01['Invoice_Day'] = sales_data01['Invoice Date'].dt.day

# Verify the results by printing the first few rows
print(sales_data01[['Invoice Date', 'Invoice_Year', 'Invoice_Month', 'Invoice_Quarter', 'Invoice_Day']].head())


In [None]:
# Re-check dtypes and non-null counts now that the date-derived columns exist
sales_data01.info()

In [None]:
# Keep only the columns needed for the time-based analyses.
# .copy() makes sales_data02 an independent frame: later cells assign columns
# on it, which on a plain column selection triggers SettingWithCopyWarning.
analysis_columns = [
    'Custkey', 'Item', 'Invoice Date', 'Invoice_Year', 'Invoice_Quarter', 'Invoice_Month',
    'Invoice_Day', 'Sales Quantity', 'Sales Amount Based on List Price', 'Discount Amount',
    'Sales Amount', 'Sales Margin Amount', 'Sales Cost Amount', 'Sales Rep', 'U/M',
    'List Price', 'Sales Price',
]
sales_data02 = sales_data01[analysis_columns].copy()

# Display the first few rows to verify the result
print(sales_data02.head())


In [None]:
# Missing values per column of the reduced frame
sales_data02.isna().sum()

In [None]:
# Use seaborn's 'darkgrid' style for the plots that follow
sns.set(style = 'darkgrid')

In [None]:
# Separate working copy for the per-day analyses
DaySalesInsights = sales_data01.copy()

# 'Invoice_Date' holds the plain calendar date (datetime.date) of each invoice
invoice_timestamps = pd.to_datetime(sales_data01['Invoice Date'])
DaySalesInsights['Invoice_Date'] = invoice_timestamps.dt.date

In [None]:
# Sum every numeric column per 'Invoice Date' (numeric_only=True keeps the
# non-numeric columns out of the sum), order the days by revenue, highest
# first, then turn the date index back into a column and keep the top ten.
top10sales = (
    DaySalesInsights
    .groupby('Invoice Date')
    .sum(numeric_only=True)
    .sort_values('Sales Amount', ascending=False)
    .reset_index()
    .head(10)
)

# Check out the top 10 sales!
print(top10sales)


In [None]:
# Bar chart of the ten highest-revenue days, drawn in a single colour.
# (seaborn and matplotlib are already imported in the first cell.)
sns.catplot(y='Sales Amount', x='Invoice Date', data=top10sales, aspect=2, color='blue', kind='bar')

# Set the title
plt.title('Top 10 Days When Sales Were Highest')

# The x values are timestamps, so their tick labels are long; rotate them so
# they do not overlap
plt.xticks(rotation=45)

# Display the plot
plt.show()


In [None]:
# Ten days with the largest total 'Sales Margin Amount', keyed on the
# 'Invoice Date' timestamp column.
# NOTE(review): the following cell rebuilds top10profits keyed on
# 'Invoice_Date', so this result is overwritten before it is plotted.
top10profits = (
    DaySalesInsights
    .groupby('Invoice Date')
    .sum(numeric_only=True)
    .sort_values('Sales Margin Amount', ascending=False)
    .reset_index()   # make 'Invoice Date' a regular column again
    .head(10)
)

# Display the result
print(top10profits)


In [None]:
# Ten calendar days with the largest total 'Sales Margin Amount'
top10profits = (
    DaySalesInsights
    .groupby('Invoice_Date')
    .sum(numeric_only=True)
    .sort_values('Sales Margin Amount', ascending=False)
    .reset_index()   # make 'Invoice_Date' a regular column again
    .head(10)
)

# Display the result
print(top10profits)


In [None]:
# Bar chart of the ten most profitable days
sns.catplot(
    x='Invoice_Date',
    y='Sales Margin Amount',
    data=top10profits,
    kind='bar',
    color='blue',
    aspect=2,
)
plt.title('Top 10 Days When Profits Were Highest')

# Show the plotted margin values as the cell output
top10profits[['Sales Margin Amount']]

In [None]:
# Ten calendar days with the largest total 'Sales Quantity'
highqty = (
    DaySalesInsights
    .groupby('Invoice_Date')
    .sum(numeric_only=True)
    .sort_values('Sales Quantity', ascending=False)
    .reset_index()   # make 'Invoice_Date' a regular column again
    .head(10)
)

# Display the result
print(highqty)


In [None]:
# Bar chart of the ten days with the highest number of items sold
sns.catplot(
    x='Invoice_Date',
    y='Sales Quantity',
    data=highqty,
    kind='bar',
    color='blue',
    aspect=2,
)
plt.title('Top 10 Days When Highest Quantity Of Items Were Sold')

# Show the plotted quantities as the cell output
highqty[['Sales Quantity']]

In [None]:
# Count the distinct customers ('Custkey') that ordered on each calendar day.
# groupby(...).size() counts rows, so a customer with several invoice rows on
# the same day would be counted several times; nunique() counts each customer
# once per day, which is what 'Customer_Count' is meant to show.
MostCust = (
    DaySalesInsights
    .groupby('Invoice_Date')['Custkey']
    .nunique()
    .reset_index(name='Customer_Count')               # Series -> DataFrame
    .sort_values('Customer_Count', ascending=False)   # busiest days first
    .head(10)                                         # keep the top 10 days
)

# Display the result
print(MostCust)


In [None]:
# Bar chart of the ten days with the highest 'Customer_Count'.
# (seaborn and matplotlib are already imported in the first cell.)
sns.catplot(y='Customer_Count', x='Invoice_Date', data=MostCust, aspect=2, color='blue', kind='bar')

plt.title('Top 10 Days When Most Customers Ordered Items')
plt.ylabel('No. of Customers')
plt.xlabel('Invoice Date')  # Add x-axis label for clarity
plt.xticks(rotation=45)     # Rotate x-axis labels for better readability
plt.show()


In [None]:
# Columns needed for the yearly analyses.
# .copy() makes Yearly_Sales an independent frame, so assigning to its columns
# later does not raise SettingWithCopyWarning.
yearly_columns = [
    'Custkey', 'Item', 'Invoice Date', 'Invoice_Year', 'Invoice_Month',
    'Sales Quantity', 'Sales Amount Based on List Price', 'Discount Amount',
    'Sales Amount', 'Sales Margin Amount', 'Sales Cost Amount', 'Sales Rep', 'U/M', 'List Price',
    'Sales Price',
]
Yearly_Sales = sales_data02[yearly_columns].copy()


In [None]:
# Cast 'Invoice_Year' to int. Rows without a year (NaN) are dropped first
# because astype(int) raises on NaN; groupby would ignore those rows anyway.
# Rebinding Yearly_Sales instead of assigning into it also avoids
# SettingWithCopyWarning when it is a column selection of another frame.
Yearly_Sales = (
    Yearly_Sales
    .dropna(subset=['Invoice_Year'])
    .astype({'Invoice_Year': int})
)

# Keep only the columns needed for the aggregation
numeric_columns = ['Sales Amount']  # Add other numeric columns if needed
Yearly_Sales_numeric = Yearly_Sales[numeric_columns + ['Invoice_Year']]

# Total 'Sales Amount' per year
Yearly_Sales01 = Yearly_Sales_numeric.groupby('Invoice_Year').sum().reset_index()

# Create the bar plot
sns.catplot(
    y='Sales Amount',
    x='Invoice_Year',
    data=Yearly_Sales01,
    color='blue',
    kind='bar',
    aspect=2  # Aspect ratio to adjust the size
)

# Add labels and title
plt.xlabel('Year')
plt.ylabel('Sales Amount')
plt.title('Yearly Sales')

# Show the plot
plt.show()

# Display the DataFrame
Yearly_Sales01[['Invoice_Year', 'Sales Amount']]


In [None]:
# Daily totals, keeping year and month as columns for plotting.
# numeric_only=True matches the other aggregations in this notebook and keeps
# the non-numeric columns of sales_data02 out of the sum.
grouped_data = (
    sales_data02
    .groupby(['Invoice Date', 'Invoice_Year', 'Invoice_Month'])
    .sum(numeric_only=True)
    .reset_index()
)

# Set the figure size
plt.figure(figsize=(15, 5))

# One line per year; for each month seaborn aggregates the daily totals
sns.lineplot(y='Sales Amount', x='Invoice_Month', data=grouped_data,
             hue='Invoice_Year', palette='bright', legend=False)  # Remove legend for clarity

# Improve plot clarity:
plt.xlabel('Invoice Month', fontsize=12)
plt.ylabel('Sales Amount', fontsize=12)
plt.title('Sales Trends Over Different Months by Year', fontsize=14)

# Remove the grid for a cleaner look
plt.grid(False)

# Display the plot
plt.show()


### Observations:
*  From this plot, it is clear that Sales decreased continuously & then it started to increase from 2018 to 2019.

In [None]:
# Pie chart of each year's share of the total sales amount
plt.figure(figsize=(17, 6))
plt.pie(
    'Sales Amount',
    labels='Invoice_Year',
    data=Yearly_Sales01,
    autopct='%1.2f%%',
    shadow=True,
    startangle=90,
)
plt.axis('equal')
plt.title('Sales Contribution')

# Legend entries are the yearly sales amounts rounded to two decimals
rounded_amounts = Yearly_Sales01['Sales Amount'].round(2)
plt.legend(rounded_amounts, loc=7, fontsize='xx-large')
plt.show()

## Monthly Records: 

In [None]:
# Ensure the numeric columns are correctly typed (values that cannot be
# converted become NaN). assign() returns a new frame, so this does not write
# into a column selection of another frame (no SettingWithCopyWarning).
sales_data02 = sales_data02.assign(**{
    'Sales Quantity': pd.to_numeric(sales_data02['Sales Quantity'], errors='coerce'),
    'Sales Amount': pd.to_numeric(sales_data02['Sales Amount'], errors='coerce'),
})

# Group by Year, Month, and Day, then sum only numeric columns.
# NOTE: despite its name, Monthly_sales holds one row per (year, month, day).
Monthly_sales = sales_data02.groupby(['Invoice_Year', 'Invoice_Month', 'Invoice_Day']).sum(numeric_only=True).reset_index()

# Get a statistical summary of the numeric columns
summary_stats = Monthly_sales.describe()

# Display the summary statistics
print(summary_stats)


In [None]:
# Column labels of the raw frame, shown again for reference
sales_data.columns

In [None]:
# Total every numeric column per invoice year.
# NOTE: this rebinds Yearly_Sales01, which an earlier cell built with only the
# 'Sales Amount' total; from here on it carries all numeric totals.
# (seaborn and matplotlib are already imported in the first cell.)
Yearly_Sales01 = sales_data02.groupby('Invoice_Year').sum(numeric_only=True).reset_index()

# Create the bar plot
sns.catplot(
    y='Sales Margin Amount',
    x='Invoice_Year',
    data=Yearly_Sales01,
    kind="bar",
    palette='coolwarm'  # You can adjust the palette as needed
)

plt.xlabel('Year')
plt.ylabel('Sales Margin Amount')
plt.title('Yearly Profits')

# Show the plot
plt.show()

# Display the relevant columns of the DataFrame
print(Yearly_Sales01[['Invoice_Year', 'Sales Margin Amount']])


In [None]:
# Pie chart of each year's share of the total sales margin
profit_by_year = Yearly_Sales01[['Invoice_Year', 'Sales Margin Amount']]

plt.figure(figsize=(10, 6))
plt.pie(
    'Sales Margin Amount',
    labels='Invoice_Year',
    data=profit_by_year,
    autopct='%1.2f%%',
    shadow=True,
    startangle=90,
)
plt.axis('equal')
plt.title('Profit Share')
plt.show()

## Yearly Month wise Records:

In [None]:
# Yearly_Monthwise_Sales is not built by any earlier cell, so the plot below
# failed with a NameError on a fresh run. Build it here as the numeric totals
# per (year, month).
Yearly_Monthwise_Sales = (
    sales_data02
    .groupby(['Invoice_Year', 'Invoice_Month'])
    .sum(numeric_only=True)
    .reset_index()
)

# One line chart per year, months on the x axis
monthly_margin_grid = sns.relplot(x='Invoice_Month', y='Sales Margin Amount', data=Yearly_Monthwise_Sales,
                                  height=5, kind='line', aspect=1, col='Invoice_Year')

# Label the facets through the grid; plt.xlabel/plt.ylabel only act on the
# current axes, i.e. a single facet
monthly_margin_grid.set_axis_labels('Month', 'Sales Margin Amount')
print('*'*40+'Yearly-Month-wise Total Sales Margin Trend'+'*'*40)

In [None]:
# Ten customers with the largest total 'Sales Margin Amount' in 2017:
# filter the year, total the numeric columns per customer, rank, keep ten.
Top10byCustKey17 = (
    Yearly_Sales.loc[Yearly_Sales['Invoice_Year'] == 2017]
    .groupby(['Invoice_Year', 'Custkey'])
    .sum(numeric_only=True)
    .sort_values('Sales Margin Amount', ascending=False)
    .reset_index()
    .head(10)
)

# Display the top 10 customers
print(Top10byCustKey17)


In [None]:
# The 2018 ranking is not computed by any earlier cell, so this plot failed
# with a NameError on a fresh run. Build it here the same way the 2017 and
# 2019 rankings are built.
Top10byCustKey18 = (
    Yearly_Sales[Yearly_Sales['Invoice_Year'] == 2018]
    .groupby(['Invoice_Year', 'Custkey'])
    .sum(numeric_only=True)
    .sort_values('Sales Margin Amount', ascending=False)
    .reset_index()
    .head(10)
)
# Keep the spelling this cell originally used available as an alias
Top10byCustkey18 = Top10byCustKey18

# Bars ordered by rank (largest margin first)
plt.figure(figsize=(10,5))
sns.barplot(x='Custkey', y='Sales Margin Amount',data = Top10byCustkey18, palette = 'turbo',
            order = Top10byCustkey18.Custkey)

# Show the plotted values as the cell output
Top10byCustkey18[['Custkey', 'Sales Margin Amount']]

###  Observations:
* Among the 10 CustKey that generated the most Sales Margin Amount, `CustKey-10025039` contributed around 30.79% of the Sales Margin Amount.

In [None]:
# Show the available columns before aggregating
print(Yearly_Sales.columns)

# Ten customers with the largest total 'Sales Margin Amount' in 2019
sales_2019 = Yearly_Sales[Yearly_Sales['Invoice_Year'] == 2019]
totals_by_customer = sales_2019.groupby(['Invoice_Year', 'Custkey']).sum(numeric_only=True)
Top10byCustKey19 = (
    totals_by_customer
    .sort_values('Sales Margin Amount', ascending=False)
    .reset_index()
    .head(10)
)

# Bar chart, bars ordered by rank (largest margin first)
plt.figure(figsize=(10, 5))
sns.barplot(
    x='Custkey',
    y='Sales Margin Amount',
    data=Top10byCustKey19,
    palette='turbo',
    order=Top10byCustKey19['Custkey'],
)
plt.xlabel('Custkey')
plt.ylabel('Sales Margin Amount')
plt.title('Top 10 Customers by Sales Margin Amount for 2019')
plt.xticks(rotation=45)
plt.show()

# Display the data used for plotting
print(Top10byCustKey19[['Custkey', 'Sales Margin Amount']])


###  Observations:
* Among the 10 CustKey that generated the most Sales Margin Amount, `CustKey-10009676` contributed around 30.41% of the Sales Margin Amount.

In [None]:
# Numeric columns are the only ones that are summed
numeric_columns = sales_data01.select_dtypes(include=['number']).columns

# Totals per item, ordered by 'Sales Margin Amount', largest first
item_totals = sales_data01.groupby('Item')[numeric_columns].sum()
High_Profit = item_totals.sort_values('Sales Margin Amount', ascending=False).reset_index()

# 1-based rank following the sort order, placed as the first column
High_Profit['Rank'] = range(1, len(High_Profit) + 1)
rank_first = ['Rank'] + High_Profit.columns.drop('Rank').tolist()
High_Profit = High_Profit[rank_first]

# Display the DataFrame
print(High_Profit)


In [None]:
# Bar chart of the ten items with the largest total margin, by rank
top_profit_items = High_Profit.head(10)

plt.figure(figsize=(8, 7))
sns.barplot(x='Rank', y='Sales Margin Amount', data=top_profit_items, palette='turbo')
# Legend entries are the item names of the top ten
plt.legend(High_Profit['Item'].head(10))

# Show the plotted values as the cell output
High_Profit[['Rank', 'Item', 'Sales Margin Amount']].head(10)

###  Observations:
* Item `Better Large Canned Shrimp` contributed around 19.21% of Sales Margin amount among the 10 Items that generated the most Sales Margin Amount.

In [None]:
# Numeric columns are the only ones that are summed
numeric_columns = sales_data01.select_dtypes(include=['number']).columns

# Totals per item, ordered by 'Sales Amount', largest first
item_totals = sales_data01.groupby('Item')[numeric_columns].sum()
High_Sales = item_totals.sort_values('Sales Amount', ascending=False).reset_index()

# 1-based rank following the sort order, placed as the first column
High_Sales['Rank'] = range(1, len(High_Sales) + 1)
rank_first = ['Rank'] + High_Sales.columns.drop('Rank').tolist()
High_Sales = High_Sales[rank_first]

# Display the DataFrame
print(High_Sales)


In [None]:
# Bar chart of the ten items with the largest total sales amount, by rank
top_sales_items = High_Sales.head(10)

plt.figure(figsize=(10, 7))
sns.barplot(x='Rank', y='Sales Amount', data=top_sales_items, palette='turbo')
# Legend entries are the item names of the top ten
plt.legend(High_Sales['Item'].head(10))

# Show the plotted values as the cell output
High_Sales[['Rank', 'Item', 'Sales Amount']].head(10)

###  Observations:
* Item `Better Large Canned Shrimp` contributed around 22.11% of revenue among the 10 Items that generated the most revenue.

In [None]:
# Numeric columns are the only ones that are summed
numeric_columns = sales_data01.select_dtypes(include=['number']).columns

# Totals per item, ordered by the summed 'Sales Price', largest first
item_totals = sales_data01.groupby('Item')[numeric_columns].sum()
High_SalesPrice = item_totals.sort_values('Sales Price', ascending=False).reset_index()

# 1-based rank following the sort order, placed as the first column
High_SalesPrice['Rank'] = range(1, len(High_SalesPrice) + 1)
rank_first = ['Rank'] + High_SalesPrice.columns.drop('Rank').tolist()
High_SalesPrice = High_SalesPrice[rank_first]

# Display the DataFrame
print(High_SalesPrice)


###  Observations:
* Item `Better Fancy Canned Sardines` contributed around 18.43% of Sales Price among the 10 Items.

In [None]:
# Numeric columns are the only ones that are summed
numeric_columns = sales_data01.select_dtypes(include=['number']).columns

# Totals per item, ordered by 'Sales Cost Amount', largest first
item_totals = sales_data01.groupby('Item')[numeric_columns].sum()
High_CostAmount = item_totals.sort_values('Sales Cost Amount', ascending=False).reset_index()

# 1-based rank following the sort order, placed as the first column
High_CostAmount['Rank'] = range(1, len(High_CostAmount) + 1)
rank_first = ['Rank'] + High_CostAmount.columns.drop('Rank').tolist()
High_CostAmount = High_CostAmount[rank_first]

# Display the DataFrame
print(High_CostAmount)


###  Observations:
* Item `Better Large Canned Shrimp` contributed around 23.82% of Sales Cost Amount among the 10 Items.