# 📊 Data Analysis and Data Science Task - 2
By: Bhawana Rautela

In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
# Silence every warning for the rest of the session. NOTE(review): this also
# hides deprecation / future warnings raised by pandas, seaborn and sklearn.
warnings.filterwarnings("ignore")


## 🔍 Project 1: General EDA

In [None]:
# Read the Project 1 data from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="general_eda_data.csv")
# Preview the first five rows
df.head(n=5)


### 🧹 Data Cleaning

In [None]:
# Check for missing values
print("Missing values:\n", df.isnull().sum())

# Fill missing values: mean for Sales, median for Profit.
# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form is
# deprecated and no longer updates `df` once pandas copy-on-write is active.
df["Sales"] = df["Sales"].fillna(df["Sales"].mean())
df["Profit"] = df["Profit"].fillna(df["Profit"].median())

# Remove duplicates
df = df.drop_duplicates()


def _drop_iqr_outliers(frame, column, k=1.5):
    """Return the rows of *frame* whose *column* value lies within the IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of ``frame[column]``.
    Rows with a missing value in *column* fail both comparisons and are
    therefore dropped as well.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    lower = q1 - k * iqr
    upper = q3 + k * iqr
    return frame[(frame[column] >= lower) & (frame[column] <= upper)]


# Handle outliers using IQR. Order matters: the Profit fences are computed on
# the rows that survive the Sales filter.
df = _drop_iqr_outliers(df, "Sales")
df = _drop_iqr_outliers(df, "Profit")


### 📈 Statistical Summary

In [None]:
# Summary statistics
print(df.describe())

# Correlation heatmap
# Correlate numeric columns only: the frame also has a "Category" column
# (assumed to hold text labels), and since pandas 2.0 df.corr() raises on
# non-numeric data instead of silently skipping it.
plt.figure(figsize=(6, 4))
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()


### 📊 Visualizations

In [None]:
# Histogram of Sales: 20 bins, bars outlined in black
df["Sales"].hist(edgecolor="black", bins=20)
plt.ylabel("Frequency")
plt.xlabel("Sales")
plt.title("Sales Distribution")
plt.show()

# Box plot of Profit, one box per Category
sns.boxplot(data=df, y="Profit", x="Category")
plt.title("Profit Distribution by Category")
plt.show()


## 📈 Project 2: Sales Performance Analysis

In [None]:
# Read the Project 2 sales data from the CSV file in the working directory
sales_df = pd.read_csv(filepath_or_buffer="sales_data.csv")
# Convert the Date column to datetime values so it can be used on a time axis
sales_df["Date"] = pd.to_datetime(arg=sales_df["Date"])
# Preview the first five rows
sales_df.head(n=5)


### 📊 Visualizations

In [None]:
# Line chart of Sales against Date, one circular marker per row
sale_dates = sales_df["Date"]
sale_amounts = sales_df["Sales"]
plt.plot(sale_dates, sale_amounts, marker="o")
plt.ylabel("Sales")
plt.xlabel("Date")
plt.title("Sales Over Time")
plt.grid()
plt.show()

# Scatter plot of Profit against Discount
sns.scatterplot(data=sales_df, y="Profit", x="Discount")
plt.title("Profit vs Discount")
plt.show()

# Bar chart of total Sales per Region
sales_by_region = sales_df.groupby("Region")["Sales"].sum()
sales_by_region.plot(color="skyblue", kind="bar")
plt.ylabel("Total Sales")
plt.title("Sales by Region")
plt.show()


### 🤖 Predictive Modeling

In [None]:
# Train Linear Regression Model
# Features are Profit and Discount; the target is Sales.
X = sales_df[["Profit", "Discount"]]
y = sales_df["Sales"]

# Hold out 20% of the rows so the metrics below are measured on data the model
# was not fitted on; scoring the training rows themselves gives optimistic
# R² / MSE values. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

# Evaluation on the held-out rows
print("R² Score:", r2_score(y_test, predictions))
print("Mean Squared Error:", mean_squared_error(y_test, predictions))


## ✅ Final Insights and Recommendations

- Sales trend is increasing over time.
- Discount has a slight negative correlation with profit.
- North and West regions show higher overall sales.
- Categories like Technology and Furniture perform well.
- The linear regression model, which predicts Sales from Profit and Discount, shows a strong relationship between these variables.
