In [None]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the laptops dataset from the Kaggle input directory and preview the first five rows
df = pd.read_csv(filepath_or_buffer='/kaggle/input/brand-laptops-dataset/laptops.csv')
df.head(5)

In [None]:
df.columns

In [None]:
# Make names of the columns consistent
df = df.rename(columns={'Model': 'model',
                        'Price': 'price',
                        'Rating': 'rating'})
df.columns

In [None]:
# Get rid of the 'index' column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising a KeyError.
df = df.drop(columns=['index'], errors='ignore')
df.head()

In [None]:
# Check for null values
df.isnull().sum()

In [None]:
# Check df info
df.info()

In [None]:
# Check for duplicated values
df.duplicated().sum()

In [None]:
# Let's explore the 'brand' feature
df['brand'].value_counts()

In [None]:
brand_counts = df['brand'].value_counts().reset_index()
brand_counts = brand_counts.sort_values(by='count', ascending=False)

In [None]:
# Bar chart of laptop counts per brand, bars ordered as listed in brand_counts
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(10, 7))

sns.countplot(data=df, x='brand', order=brand_counts['brand'], ax=ax)

ax.set_title('Distribution of Brands', fontsize=16)
ax.set_xlabel('Brand', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
ax.tick_params(axis='x', labelrotation=45)  # tilt the brand names on the x axis

fig.tight_layout()
plt.show()

In [None]:
# Mean and median price for every brand (indexed by brand)
brand_stats = (
    df
    .groupby('brand')['price']
    .agg(['mean', 'median'])
)

# Show the brands from highest to lowest mean price, ties broken by median
brand_stats.sort_values(by=['mean', 'median'], ascending=False).reset_index()

In [None]:
# Histogram of the per-brand mean price (one observation per brand).
# set_palette changes the default colour cycle for this and all later plots.
sns.set_palette('Set2')
plt.figure(figsize=(10, 7))

sns.histplot(brand_stats['mean'], bins=15, kde=True)
# Title and axis labels so the figure can be read on its own,
# consistent with the other plots in this notebook
plt.title('Distribution of Mean Price across Brands', fontsize=16)
plt.xlabel('Mean Price', fontsize=14)
plt.ylabel('Number of Brands', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Histogram of the per-brand median price (one observation per brand)
plt.figure(figsize=(10, 7))

sns.histplot(brand_stats['median'], bins=15, kde=True)
# Title and axis labels so the figure can be read on its own,
# consistent with the other plots in this notebook
plt.title('Distribution of Median Price across Brands', fontsize=16)
plt.xlabel('Median Price', fontsize=14)
plt.ylabel('Number of Brands', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# let's explore the 'model' feature
len(df['model'].unique())

In [None]:
df['model']

In [None]:
# Let's explore the 'price' column
df['price'].describe()

In [None]:
plt.figure(figsize=(10, 7))

sns.histplot(df['price'], bins=round(np.sqrt(len(df))), kde=True)
plt.title('Histogram of Price', fontsize=16)
plt.xlabel('Price', fontsize=14)
plt.ylabel('Frequency', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Horizontal box plots: one box of prices per brand
fig, ax = plt.subplots(figsize=(16, 12))

sns.boxplot(data=df, x='price', y='brand', ax=ax)
ax.set(xlabel='Price', ylabel='Brand', title='Box Plot of Price by Brand')

plt.show()

In [None]:
# Let's check out the outliers in 'price' column using the 1.5 * IQR rule
Q1 = df['price'].quantile(q=0.25)
Q3 = df['price'].quantile(q=0.75)
IQR = Q3 - Q1

# Fences: anything outside [lower_bound, upper_bound] counts as an outlier
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Displayed as (upper, lower)
upper_bound, lower_bound

Since the lower bound is negative, I am not going to use it to find outliers: prices cannot be negative, so no laptop can fall below it.

In [None]:
price_outliers = df.query('price > @upper_bound')
price_outliers

In [None]:
price_outliers['brand'].value_counts().reset_index()

In [None]:
price_outliers.query('brand == "dell"')['price']

In [None]:
# Show every outlier row, most expensive first.
# option_context changes the row limit for this one display only; a bare
# pd.set_option would change it for every later cell in the notebook.
# display() is the IPython builtin, needed because the expression sits inside
# a `with` block and would otherwise not be rendered as the cell output.
# reset_index() keeps the original row label as an 'index' column.
with pd.option_context('display.max_rows', len(price_outliers)):
    display(
        price_outliers[['brand', 'price']]
        .sort_values(by='price', ascending=False)
        .reset_index()
    )

In [None]:
# Let's explore the 'rating' column
df['rating'].max(), df['rating'].min(), df['rating'].mean(), df['rating'].median()

In [None]:
plt.figure(figsize=(12, 7))

sns.histplot(data=df, x='rating', bins=round(np.sqrt(len(df))), kde=True)
plt.title('Histogram of Ratings', fontsize=16)
plt.xlabel('Rating', fontsize=14)
plt.ylabel('Count', fontsize=14)

plt.show()

In [None]:
# Mean and median rating per brand, best-rated brands first (ties broken by median)
rating_stats = (
    df
    .groupby('brand')['rating']
    .agg(['mean', 'median'])
    .sort_values(by=['mean', 'median'], ascending=False)
    .reset_index()
)

rating_stats

In [None]:
# Horizontal box plots: one box of ratings per brand
fig, ax = plt.subplots(figsize=(12, 7))
sns.boxplot(data=df, x='rating', y='brand', ax=ax)
ax.set_title('Boxplot of Rating wrt Brand', fontsize=16)
ax.set_xlabel('Rating', fontsize=14)
ax.set_ylabel('Brand', fontsize=14)
fig.tight_layout()
plt.show()