In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the four datasets analysed in the EDA sections below. Paths are relative
# to the notebook's working directory.
bike_df = pd.read_csv('BIKE DETAILS.csv')
car_df = pd.read_csv('Car Sale.csv')
amazon_df = pd.read_csv('amazon.csv')
spotify_df = pd.read_csv('spotify.csv')

# The amazon cells call .mean(), .corr() and sort_values() on the columns below,
# which only gives meaningful results on numeric dtypes.
# NOTE(review): presumably the raw CSV stores them as text (currency symbols,
# thousands separators, '%') — confirm against the file. Columns that are
# already numeric, or absent, are left untouched.
for _amazon_col in ('discounted_price', 'actual_price', 'discount_percentage',
                    'rating', 'rating_count'):
    if _amazon_col in amazon_df.columns and not pd.api.types.is_numeric_dtype(amazon_df[_amazon_col]):
        amazon_df[_amazon_col] = pd.to_numeric(
            # Keep only digits, the decimal point and a minus sign.
            amazon_df[_amazon_col].astype(str).str.replace(r'[^0-9.\-]', '', regex=True),
            errors='coerce',  # unparsable entries become NaN instead of raising
        )


# EDA - 1: Bike details dataset (bike_df)


In [None]:
# Range of selling prices, shown as a (lowest, highest) pair.
selling_prices = bike_df['selling_price']
(selling_prices.min(), selling_prices.max())

In [None]:
# Median selling price over every bike in the dataset.
median_selling_price = bike_df['selling_price'].median()
median_selling_price

In [None]:
# Most frequent seller type. mode() returns every tied value; take the first.
seller_type_modes = bike_df['seller_type'].mode()
seller_type_modes[0]

In [None]:
# Count of bikes whose odometer reading exceeds 50,000 km.
driven_over_50k = bike_df['km_driven'] > 50000
len(bike_df[driven_over_50k])

In [None]:
# Mean km_driven within each ownership category.
km_by_owner = bike_df.groupby('owner')['km_driven']
km_by_owner.mean()

In [None]:
# Fraction of listings with a manufacturing year of 2015 or earlier.
is_2015_or_older = bike_df['year'] <= 2015
len(bike_df[is_2015_or_older]) / len(bike_df)

In [None]:
# Number of missing entries in each column.
missing_mask = bike_df.isna()
missing_mask.sum()

In [None]:
# Name and ex-showroom price of the bike with the highest ex_showroom_price.
priciest_row = bike_df['ex_showroom_price'].idxmax()
bike_df.loc[priciest_row, ['name', 'ex_showroom_price']]

In [None]:
# Number of listings for each seller type, most frequent first.
seller_types = bike_df['seller_type']
seller_types.value_counts()

In [None]:
# Selling price against km driven, restricted to rows whose owner is '1st owner'.
first_owner_bikes = bike_df[bike_df['owner'] == '1st owner']
sns.scatterplot(x='km_driven', y='selling_price', data=first_owner_bikes)
plt.show()

In [None]:
# Drop km_driven outliers using the 1.5 * IQR fences; the filtered frame is
# stored in bike_df_no_outliers and bike_df itself is not modified.
km_driven = bike_df['km_driven']
Q1, Q3 = km_driven.quantile(0.25), km_driven.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
# Negating the outlier mask (rather than testing "inside the fences") keeps rows
# whose km_driven is missing, because both comparisons are False for NaN.
is_km_outlier = (km_driven < lower_fence) | (km_driven > upper_fence)
bike_df_no_outliers = bike_df[~is_km_outlier]

In [None]:
# Bivariate view: selling price plotted against manufacturing year.
fig, ax = plt.subplots()
sns.scatterplot(x='year', y='selling_price', data=bike_df, ax=ax)
plt.show()

In [None]:
# Average yearly depreciation for each manufacturing year, where depreciation is
# ex_showroom_price minus selling_price and age is current_year minus year.
# Adds the columns 'depreciation', 'age' and 'depreciation_per_year' to bike_df.
current_year = 2025  # reference year for computing age
bike_df['depreciation'] = bike_df['ex_showroom_price'] - bike_df['selling_price']
bike_df['age'] = current_year - bike_df['year']
# An age of 0 would divide by zero (inf) and a negative age would flip the sign;
# mask those to NaN so the per-year mean skips them.
positive_age = bike_df['age'].where(bike_df['age'] > 0)
bike_df['depreciation_per_year'] = bike_df['depreciation'] / positive_age
bike_df.groupby('year')['depreciation_per_year'].mean()

In [None]:
# Bike names priced significantly above the average for their manufacturing year.
# "Significantly" is taken here as more than one standard deviation above the
# year's mean selling price; a plain "> mean" test would flag every above-average
# listing.
price_by_year = bike_df.groupby('year')['selling_price']
avg_price_by_year = price_by_year.transform('mean')
# Years with a single listing have a NaN std, so their threshold is NaN and the
# comparison below is False for them.
significant_threshold = avg_price_by_year + price_by_year.transform('std')
bike_df[bike_df['selling_price'] > significant_threshold]['name'].unique()

In [None]:
# Pairwise correlation of every numeric column, drawn as an annotated heatmap.
numeric_columns = bike_df.select_dtypes(include='number')
corr = numeric_columns.corr()
sns.heatmap(corr, annot=True)
plt.show()

# EDA - 2: Car sales dataset (car_df)

In [None]:
# Mean car price for each dealer, for comparison across dealers.
price_by_dealer = car_df.groupby('Dealer_Name')['Price ($)']
price_by_dealer.mean()

In [None]:
# Company whose prices have the largest standard deviation.
price_std_by_company = car_df.groupby('Company')['Price ($)'].std()
price_std_by_company.idxmax()

In [None]:
# Box plot of price per transmission type; the box heights show the
# interquartile ranges side by side.
car_df.boxplot(
    by='Transmission',
    column='Price ($)',
)
plt.show()

In [None]:
# Box plot of price for each dealer region.
car_df.boxplot(
    by='Dealer_Region',
    column='Price ($)',
)
plt.show()

In [None]:
# Number of cars per body style, most common first.
body_styles = car_df['Body Style']
body_styles.value_counts()

In [None]:
# Mean car price by customer gender and annual-income band (three equal-width
# bands over the observed income range).
income_band = pd.cut(car_df['Annual Income'], bins=3)
# pd.cut yields a categorical grouper; pass `observed` explicitly because its
# default is deprecated in recent pandas. False keeps empty bands in the result.
car_df.groupby(['Gender', income_band], observed=False)['Price ($)'].mean()

In [None]:
# Per dealer region: mean price and number of cars sold.
price_by_region = car_df.groupby('Dealer_Region')['Price ($)']
price_by_region.agg(['mean', 'count'])

In [None]:
# Mean car price for each distinct value of the Engine column.
price_by_engine = car_df.groupby('Engine')['Price ($)']
price_by_engine.mean()

In [None]:
# Mean price and number of sales per annual-income bracket (four equal-width
# brackets over the observed income range).
income_bracket = pd.cut(car_df['Annual Income'], bins=4)
# Explicit `observed` avoids relying on the deprecated default for categorical
# groupers; False keeps brackets that contain no customers.
car_df.groupby(income_bracket, observed=False)['Price ($)'].agg(['mean', 'count'])

In [None]:
# Price distribution of the five models with the most sales records.
sales_per_model = car_df['Model'].value_counts()
top5_models = sales_per_model.nlargest(5).index
top5_model_sales = car_df[car_df['Model'].isin(top5_models)]
sns.boxplot(x='Model', y='Price ($)', data=top5_model_sales)
plt.show()

In [None]:
# Five colors with the largest standard deviation of price.
# NOTE(review): engine size is not used in this computation; only the
# per-color price variation is reported.
price_std_by_color = car_df.groupby('Color')['Price ($)'].std()
price_std_by_color.nlargest(5)

In [None]:
# Seasonality check: number of sales per calendar month.
# Converts car_df['Date'] to datetime and adds a 'Month' column to car_df.
sale_dates = pd.to_datetime(car_df['Date'])
car_df['Date'] = sale_dates
car_df['Month'] = sale_dates.dt.month
sales_per_month = car_df.groupby('Month')['Price ($)'].count()
sales_per_month

In [None]:
# Price distribution for each body style, split by transmission type via hue.
fig, ax = plt.subplots()
sns.boxplot(x='Body Style', y='Price ($)', hue='Transmission', data=car_df, ax=ax)
plt.show()

In [None]:
# Correlation between car price, customer annual income and engine, shown as an
# annotated heatmap.
# The engine column is included so the matrix covers all three requested
# features. NOTE(review): if Engine holds labels rather than numbers it is
# expanded into one 0/1 indicator column per label — confirm this matches the
# data.
if pd.api.types.is_numeric_dtype(car_df['Engine']):
    engine_features = car_df[['Engine']]
else:
    engine_features = pd.get_dummies(car_df['Engine'], prefix='Engine', dtype=float)
corr = pd.concat([car_df[['Price ($)', 'Annual Income']], engine_features], axis=1).corr()
sns.heatmap(corr, annot=True)
plt.show()

In [None]:
# Mean car price for every (Model, Engine) combination.
price_by_model_engine = car_df.groupby(['Model', 'Engine'])['Price ($)']
price_by_model_engine.mean()

# EDA - 3: Amazon products dataset (amazon_df)

In [None]:
# Mean rating within each product category.
rating_by_category = amazon_df.groupby('category')['rating']
rating_by_category.mean()

In [None]:
# For each category, the row with the highest rating_count: sort descending,
# then keep the first row of every category group.
by_rating_count = amazon_df.sort_values('rating_count', ascending=False)
by_rating_count.groupby('category').head(1)

In [None]:
# Overlaid histograms (with KDE curves) of discounted and actual prices.
fig, ax = plt.subplots()
for price_column, legend_label in (('discounted_price', 'Discounted'), ('actual_price', 'Actual')):
    sns.histplot(amazon_df[price_column], label=legend_label, kde=True, ax=ax)
ax.legend()
plt.show()

In [None]:
# Mean discount percentage within each category.
discount_by_category = amazon_df.groupby('category')['discount_percentage']
discount_by_category.mean()

In [None]:
# Five product names that occur most often in the dataset.
product_name_counts = amazon_df['product_name'].value_counts()
product_name_counts.head(5)

In [None]:
# Most frequent keywords in product names.
# Simple text processing: lowercase the names, split on whitespace, strip
# surrounding punctuation, and count what remains.
import string
from collections import Counter

# Common function words that would otherwise dominate the counts.
keyword_stop_words = {'a', 'an', 'and', 'by', 'for', 'from', 'in', 'of', 'on', 'or', 'the', 'to', 'with'}
# dropna/astype(str) keeps the join from raising on missing or non-string names.
product_name_text = " ".join(amazon_df['product_name'].dropna().astype(str)).lower()
name_tokens = (raw_token.strip(string.punctuation) for raw_token in product_name_text.split())
# Tokens that were pure punctuation (e.g. '|', '&') are empty after stripping.
keywords = Counter(token for token in name_tokens if token and token not in keyword_stop_words)
keywords.most_common(5)

In [None]:
# Five review titles that occur most often.
review_title_counts = amazon_df['review_title'].value_counts()
review_title_counts.head(5)

In [None]:
# 2x2 correlation matrix of discounted_price and rating.
price_and_rating = amazon_df[['discounted_price', 'rating']]
price_and_rating.corr()

In [None]:
# Five categories with the highest mean rating.
mean_rating_by_category = amazon_df.groupby('category')['rating'].mean()
mean_rating_by_category.nlargest(5)

In [None]:
# Starting point for spotting areas to improve: summary statistics for every
# column, numeric and non-numeric alike.
amazon_summary = amazon_df.describe(include='all')
amazon_summary

# EDA - 4: Spotify tracks dataset (spotify_df)

In [None]:
# Check the loaded frame for missing values and duplicate rows, report what was
# found, then drop both.
print('Missing values per column:')
print(spotify_df.isnull().sum())
print('Duplicate rows:', spotify_df.duplicated().sum())

# Rebind the name instead of mutating in place so the cleaning step reads as a
# single expression; the resulting frame is the same as with inplace=True.
spotify_df = spotify_df.dropna().drop_duplicates()
print('Shape after cleaning:', spotify_df.shape)
# Preview only the first rows rather than displaying the whole frame.
spotify_df.head()

In [None]:
# Histogram of track popularity.
fig, ax = plt.subplots()
sns.histplot(spotify_df['Popularity'], ax=ax)
plt.show()

In [None]:
# Scatter plot of popularity against track duration in milliseconds.
fig, ax = plt.subplots()
sns.scatterplot(x='Duration (ms)', y='Popularity', data=spotify_df, ax=ax)
plt.show()

In [None]:
# Number of tracks per artist as a countplot. Bars are ordered from most to
# fewest tracks, so the artist with the highest count is the first bar, and
# drawn horizontally so artist names do not overlap.
artist_order = spotify_df['Artist'].value_counts().index
# Grow the figure with the number of artists so the labels stay legible.
plt.figure(figsize=(8, max(4, 0.25 * len(artist_order))))
sns.countplot(data=spotify_df, y='Artist', order=artist_order)
plt.show()

In [None]:
# Artist and track name of the five rows with the lowest Popularity.
least_popular = spotify_df.nsmallest(5, 'Popularity')
least_popular[['Artist', 'Track Name']]

In [None]:
# Average popularity of the five most popular artists, highest first.
# Artists are ranked by their mean Popularity rather than by how many tracks
# they have, so "most popular" reflects the Popularity column.
artist_mean_popularity = spotify_df.groupby('Artist')['Popularity'].mean()
# Index of artist names; later cells filter with .isin(top5_artists).
top5_artists = artist_mean_popularity.nlargest(5).index
artist_mean_popularity.loc[top5_artists]

In [None]:
# Most popular track of each artist in top5_artists (defined in an earlier cell).
# idxmax gives, per artist, the row label of that artist's highest Popularity.
best_track_labels = spotify_df.groupby('Artist')['Popularity'].idxmax()
popular_tracks = spotify_df.loc[best_track_labels]
is_top5_artist = popular_tracks['Artist'].isin(top5_artists)
popular_tracks[is_top5_artist][['Artist', 'Track Name']]

In [None]:
# Pair plot over the Popularity and Duration (ms) columns.
pairplot_columns = ['Popularity', 'Duration (ms)']
sns.pairplot(spotify_df[pairplot_columns])
plt.show()

In [None]:
# Box plot of track duration for each artist, to compare spread across artists.
fig, ax = plt.subplots()
sns.boxplot(x='Artist', y='Duration (ms)', data=spotify_df, ax=ax)
plt.show()

In [None]:
# Violin plot of track popularity for each artist.
fig, ax = plt.subplots()
sns.violinplot(x='Artist', y='Popularity', data=spotify_df, ax=ax)
plt.show()