In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
# Load the dataset
# The CSV location can be overridden with the SALES_DATA_CSV environment variable so the
# notebook is runnable on machines other than the author's; when the variable is unset the
# original absolute path is used unchanged.
sales_csv_path = os.environ.get(
    "SALES_DATA_CSV",
    r"C:\Users\bhara\Documents\Main flow internship\Task 2\sales_data.csv",
)

# The file is read as ISO-8859-1 (Latin-1) rather than pandas' default UTF-8.
# NOTE(review): presumably the file contains bytes that are not valid UTF-8 — confirm.
df_sales = pd.read_csv(sales_csv_path, encoding="ISO-8859-1")

In [None]:
# DataFrame.info() prints its summary and returns None, so combining it with head() in a
# tuple displays "(None, <frame>)" as plain text. Call info() as its own statement and
# leave head() as the cell's last expression so the frame gets the notebook's rich display.
df_sales.info()
df_sales.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113036 entries, 0 to 113035
Data columns (total 18 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Date              113036 non-null  object
 1   Day               113036 non-null  int64 
 2   Month             113036 non-null  object
 3   Year              113036 non-null  int64 
 4   Customer_Age      113036 non-null  int64 
 5   Age_Group         113036 non-null  object
 6   Customer_Gender   113036 non-null  object
 7   Country           113036 non-null  object
 8   State             113036 non-null  object
 9   Product_Category  113036 non-null  object
 10  Sub_Category      113036 non-null  object
 11  Product           113036 non-null  object
 12  Order_Quantity    113036 non-null  int64 
 13  Unit_Cost         113036 non-null  int64 
 14  Unit_Price        113036 non-null  int64 
 15  Profit            113036 non-null  int64 
 16  Cost              113036 non-null  int

In [None]:
# Step 1: Data Cleaning

# Measure the row count before anything is dropped instead of hard-coding it, so the
# before/after comparison stays correct when the input file changes.
# NOTE: df_sales is reassigned further down, so re-running only this cell would report the
# already-deduplicated size as "original"; run the notebook top to bottom.
original_rows = df_sales.shape[0]

# Convert 'Date' to datetime format. errors='coerce' turns unparseable values into NaT
# instead of raising, so they surface in the missing-value counts computed below.
df_sales['Date'] = pd.to_datetime(df_sales['Date'], errors='coerce')

# Remove rows that are exact duplicates of an earlier row
df_sales = df_sales.drop_duplicates()

# Per-column count of missing values after cleaning
cleaned_missing_values = df_sales.isnull().sum()

# Row count after cleaning
cleaned_rows = df_sales.shape[0]

# Display the number of rows before and after cleaning, plus the missing-value counts
original_rows, cleaned_rows, cleaned_missing_values


In [None]:
# Cleaning summary: rows before, rows after, and missing values per column
(original_rows, cleaned_rows, cleaned_missing_values)

(113036, 112036, Date                0
Day                 0
Month               0
Year                0
Customer_Age        0
Age_Group           0
Customer_Gender     0
Country             0
State               0
Product_Category    0
Sub_Category        0
Product             0
Order_Quantity      0
Unit_Cost           0
Unit_Price          0
Profit              0
Cost                0
Revenue             0
dtype: int64)


In [None]:
# Step 2: Time-Series Analysis - Sales Trends Over Time

# Total revenue for each distinct date
sales_trend = df_sales.groupby('Date')['Revenue'].sum()

# Draw the per-date totals on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(sales_trend, marker='o', linestyle='-', alpha=0.7)
ax.set_title("Sales Trend Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Total Revenue")
ax.grid(True)
plt.show()


In [None]:
# Step 3: Profit vs. Revenue Analysis

# One point per row: revenue on the x-axis, profit on the y-axis.
# Half-transparent markers keep overlapping points distinguishable.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(df_sales['Revenue'], df_sales['Profit'], alpha=0.5)
ax.set_title("Profit vs. Revenue")
ax.set_xlabel("Revenue")
ax.set_ylabel("Profit")
ax.grid(True)
plt.show()


In [None]:
# Step 4: Sales Performance by Region and Product Category


def _plot_revenue_bars(revenue, title, xlabel, color, figsize):
    """Draw one bar chart of total revenue per group and show it.

    Parameters
    ----------
    revenue : pandas.Series
        Revenue totals indexed by group label; one bar is drawn per entry,
        in the order the Series is given.
    title : str
        Figure title.
    xlabel : str
        Label for the x-axis (the grouping dimension).
    color : str
        Matplotlib colour used to fill the bars.
    figsize : tuple
        Figure size in inches, forwarded to ``plt.subplots``.
    """
    fig, ax = plt.subplots(figsize=figsize)
    # rot=45 tilts the category labels so that longer names do not overlap.
    revenue.plot(kind='bar', color=color, edgecolor='black', rot=45, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Total Revenue")
    # Horizontal guide lines only; vertical ones add nothing to a bar chart.
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


# Aggregate revenue by country, largest first
sales_by_country = df_sales.groupby("Country")["Revenue"].sum().sort_values(ascending=False)

# Aggregate revenue by product category, largest first
sales_by_category = df_sales.groupby("Product_Category")["Revenue"].sum().sort_values(ascending=False)

# Plot sales by country (at most the ten highest-revenue countries)
_plot_revenue_bars(
    sales_by_country.head(10),
    title="Top 10 Countries by Sales Revenue",
    xlabel="Country",
    color='skyblue',
    figsize=(10, 5),
)

# Plot sales by product category
_plot_revenue_bars(
    sales_by_category,
    title="Sales by Product Category",
    xlabel="Product Category",
    color='lightcoral',
    figsize=(8, 5),
)


In [None]:
# Step 5: Predictive Modeling - Linear Regression for Sales Prediction
# (train_test_split, LinearRegression, r2_score and mean_squared_error are imported in the
# notebook's import cell at the top.)

# Tunable settings for the split, named so they are not buried in the call below.
TEST_SIZE = 0.2      # fraction of rows held out for evaluation (80% train, 20% test)
RANDOM_STATE = 42    # fixed seed so the split is the same on every run

# Select relevant features and target variable
# NOTE(review): if Revenue in this dataset is derived as Profit + Cost, the model is fitting
# an identity and R^2 will be ~1 regardless of real predictive value — confirm the column
# definitions and consider features that are known before the sale is made.
X = df_sales[['Profit', 'Cost']]
y = df_sales['Revenue']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Train Linear Regression model on the training split only
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions for the held-out rows
y_pred = model.predict(X_test)

# Evaluate model performance on the held-out rows:
# r2  - coefficient of determination (1.0 means a perfect fit)
# mse - mean squared error, in squared units of Revenue
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

r2, mse
