In [None]:
import os
from datetime import datetime
from math import trunc
from select import KQ_NOTE_WRITE

import altair as alt
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import polars as pl
import polars.selectors as cs
import seaborn as sns

In [None]:
# Location of the dataset CSV. Set the COMPUTER_PRICES_CSV environment variable
# to run the notebook on another machine; the original local path remains the
# default so existing behaviour is unchanged when the variable is not set.
df_path = os.environ.get(
    'COMPUTER_PRICES_CSV',
    r'/Users/zygimantas/Downloads/computer_prices_all.csv',
)

In [None]:
# Read the CSV at df_path into a polars DataFrame; all later cells work on `df`.
df = pl.read_csv(df_path)

In [None]:
# Summary statistics for the loaded frame.
df.describe()

In [None]:
# Column names and dtypes as read from the CSV.
df.collect_schema()

In [None]:
# Row count per brand, most frequent brand first.
(
    df
    .group_by('brand')
    .len()
    .sort('len', descending=True)
)

In [None]:
# Peek at the first five rows.
df.head(5)

In [None]:
# Split the 'resolution' string on 'x' and store the two parts as Int16
# columns: part 0 -> resolution_width, part 1 -> resolution_height.
df = df.with_columns(
    [
        pl.col('resolution').str.split('x').list.get(part_index).cast(pl.Int16).alias(column_name)
        for part_index, column_name in enumerate(('resolution_width', 'resolution_height'))
    ]
)

In [None]:
# Age in years relative to the year in which the notebook is executed
# (datetime.now()), so the value changes when re-run in a later year.
df = df.with_columns(
    device_age=datetime.now().year - pl.col('release_year')
)

In [None]:
# Replace a 0 with null where it sits in the column paired with that device type:
# battery_wh on 'Desktop' rows and psu_watts on 'Laptop' rows. Other values are kept.
df = df.with_columns(
    [
        pl.when((pl.col('device_type') == device) & (pl.col(column) == 0))
        .then(pl.lit(None))
        .otherwise(pl.col(column))
        .alias(column)
        for device, column in (('Desktop', 'battery_wh'), ('Laptop', 'psu_watts'))
    ]
)

In [None]:
# Derive 'cpu_series' from 'cpu_model' using the first pattern that matches:
#   1. letters, one whitespace character, digits   (first regex)
#   2. a single word character followed by a digit (second regex, fallback)
df = df.with_columns(
    cpu_series=pl.coalesce(
        pl.col('cpu_model').str.extract(r'([A-Za-z]+\s\d+)'),
        pl.col('cpu_model').str.extract(r'([\w]\d)'),
    )
)

In [None]:
# Display the frame after the feature-engineering steps above.
df

Compute the price per GB of storage. The rounding is applied to the result of the division, not to the `storage_gb` column before dividing: `round()` is called on the entire `price / storage_gb` expression.



In [None]:
# Price divided by storage capacity, rounded to two decimals after the division.
df = df.with_columns(
    price_per_gb_storage=(pl.col('price') / pl.col('storage_gb')).round(2)
)


In [None]:
# Display the frame including the new price_per_gb_storage column.
df

In [None]:
# Re-check column names and dtypes after adding the derived columns.
df.collect_schema()

In [None]:
# Total of 'price' for each device type.
(
    df
    .group_by('device_type')
    .agg(pl.col('price').sum())
)

In [None]:
# Histogram of device prices with a KDE overlay. Only the 'price' column is
# converted to pandas, since it is the only column the plot needs.
plt.figure(figsize=(10, 6))
sns.histplot(data=df.select('price').to_pandas(), x='price', bins=30, kde=True)
plt.title('Distribution of Device Prices')
plt.xlabel('Price ($)')
plt.ylabel('Frequency')
plt.show()

price_mean = df.select(pl.col('price').mean()).item()
price_median = df.select(pl.col('price').median()).item()

print(f"Mean Price: ${price_mean:.2f}")
print(f"Median Price: ${price_median:.2f}")

# Derive the skew statement from the computed values instead of asserting it
# unconditionally. Mean-vs-median is only a rule of thumb for skew direction.
if price_mean > price_median:
    print(
        f"\nThe distribution is right-skewed because the mean (${price_mean:.2f}) is greater than the median (${price_median:.2f}).")
    print(
        "This indicates that there are some expensive devices pulling the average up, with the tail extending to the right.")
elif price_mean < price_median:
    print(
        f"\nThe distribution is left-skewed because the mean (${price_mean:.2f}) is less than the median (${price_median:.2f}).")
    print(
        "This indicates that there are some cheap devices pulling the average down, with the tail extending to the left.")
else:
    print(
        f"\nThe mean (${price_mean:.2f}) equals the median (${price_median:.2f}), so this comparison shows no skew.")


In [None]:
# Larger (12x8) version of the price histogram with a KDE overlay. Only the
# 'price' column is converted to pandas, since it is the only column plotted.
plt.figure(figsize=(12, 8))
sns.histplot(df.select('price').to_pandas(), x='price', bins=30, kde=True)
plt.title('Distribution of Device Prices')
plt.xlabel('Price ($)')
plt.ylabel('Frequency')
plt.show()

price_mean = df.select(pl.col('price').mean()).item()
price_median = df.select(pl.col('price').median()).item()

print(f"Mean Price: ${price_mean:.2f}")
print(f"Median Price: ${price_median:.2f}")

# Derive the skew statement from the computed values instead of asserting it
# unconditionally. Mean-vs-median is only a rule of thumb for skew direction.
if price_mean > price_median:
    print(
        f"\nThe distribution is right-skewed because the mean (${price_mean:.2f}) is greater than the median (${price_median:.2f}).")
    print(
        "This indicates that there are some expensive devices pulling the average up, with the tail extending to the right.")
elif price_mean < price_median:
    print(
        f"\nThe distribution is left-skewed because the mean (${price_mean:.2f}) is less than the median (${price_median:.2f}).")
    print(
        "This indicates that there are some cheap devices pulling the average down, with the tail extending to the left.")
else:
    print(
        f"\nThe mean (${price_mean:.2f}) equals the median (${price_median:.2f}), so this comparison shows no skew.")

In [None]:
# The five brands with the most rows: columns 'brand' and 'len'.
top_5_brand = (
    df
    .group_by('brand')
    .len()
    .sort('len', descending=True)
    .head(5)
)

top_5_brand

In [None]:
# Bar chart of the row counts in top_5_brand; title and size are applied via the layout.
fig = px.bar(top_5_brand, x='brand', y='len').update_layout(
    title='Top 5 Brands by Number of Devices',
    width=800,
    height=600,
)

fig

In [None]:
# Row count per operating system, most frequent first: columns 'os' and 'len'.
os_counting = (
    df
    .group_by('os')
    .len()
    .sort('len', descending=True)
)
os_counting

In [None]:
# Bar chart of the per-OS row counts held in os_counting.
fig = px.bar(data_frame=os_counting, x='os', y='len').update_layout(
    width=800,
    height=600,
)

fig.show()

In [None]:
# Two-column subset used for the RAM/price analysis, followed by its correlation matrix.
ram_price = df.select('ram_gb', 'price')

ram_price.corr()

In [None]:
# Scatter of RAM against price; marker colour also encodes the RAM value.
plt.figure(figsize=(12, 8))
plt.scatter(
    x=ram_price['ram_gb'],
    y=ram_price['price'],
    c=ram_price['ram_gb'],
    cmap='viridis',
    alpha=0.8,
    edgecolors='none',
)
plt.xlabel('RAM (GB)')
plt.ylabel('Price ($)')
plt.title('RAM (GB) vs Price ($)')
plt.grid(False)

plt.show()


In [None]:
# Mean, median and total price for rows whose device_type is 'Desktop' or 'Laptop'.
(
    df
    .filter(pl.col('device_type').is_in(['Desktop', 'Laptop']))
    .group_by('device_type')
    .agg(
        price_mean=pl.col('price').mean(),
        price_median=pl.col('price').median(),
        price_sum=pl.col('price').sum(),
    )
)

In [None]:
# Average price for each CPU tier: columns 'cpu_tier' and 'price_mean'.
cpu_tier_price = (
    df
    .group_by('cpu_tier')
    .agg(price_mean=pl.col('price').mean())
)

cpu_tier_price

In [None]:
# Bar chart of mean price per CPU tier, taken from cpu_tier_price.
fig = px.bar(cpu_tier_price, x='cpu_tier', y='price_mean').update_layout(
    width=800,
    height=600,
)

fig.show()

In [None]:
# Three-column subset (core count, boost clock, price) used by the scatter plots that follow.
corr_between_cou_ghz_price = df.select('cpu_cores', 'cpu_boost_ghz', 'price')

In [None]:
# Scatter of CPU core count against boost clock.
# The previous title ("Correlation between CPU Cores") named only one of the two
# plotted variables; the title and axis labels now describe both axes.
fig = px.scatter(
    data_frame=corr_between_cou_ghz_price,
    x='cpu_cores',
    y='cpu_boost_ghz',
    labels={
        'cpu_cores': 'CPU Cores',
        'cpu_boost_ghz': 'CPU Boost (GHz)'
    },
)

fig.update_layout(
    width=800,
    height=600,
    title='CPU Cores vs CPU Boost (GHz)',
)

fig.show()

In [None]:
# Bubble chart: core count vs boost clock, with marker size and colour both driven by price.
fig = px.scatter(
    data_frame=df.to_pandas(),
    x='cpu_cores',
    y='cpu_boost_ghz',
    size='price',
    color='price',
    color_continuous_scale='Viridis',
    title='Relationship between CPU Cores, CPU Boost GHz, and Price',
    labels={'cpu_cores': 'CPU Cores', 'cpu_boost_ghz': 'CPU Boost (GHz)', 'price': 'Price ($)'},
    hover_data=['price'],
).update_layout(width=1000, height=700)

fig.show()


In [None]:
# Bubble chart built from the three-column subset: core count vs boost clock,
# with marker size and colour both driven by price.
fig = px.scatter(
    data_frame=corr_between_cou_ghz_price.to_pandas(),
    x='cpu_cores',
    y='cpu_boost_ghz',
    size='price',
    color='price',
    color_continuous_scale='Viridis',
    title='Relationship between CPU Cores, CPU Boost GHz, and Price',
    # `labels` must be a dict mapping column name -> display label. It was
    # previously wrapped in a list, which is not a valid value for this argument.
    labels={
        'cpu_cores': 'CPU Cores',
        'cpu_boost_ghz': 'CPU Boost (GHz)',
        'price': 'Price ($)'
    }
)
fig.update_layout(
    width=1000,
    height=700
)


fig.show()

In [None]:
# Mean, median and sum of price_per_gb_storage for each storage type.
(
    df
    .group_by('storage_type')
    .agg(
        avg_price_per_gb=pl.col('price_per_gb_storage').mean(),
        median_price_per_gb=pl.col('price_per_gb_storage').median(),
        total_price_per_gb=pl.col('price_per_gb_storage').sum(),
    )
)

In [None]:
# Average price per GB for each storage type, cheapest first.
(
    df
    .group_by('storage_type')
    .agg(avg_price_per_gb=pl.col('price_per_gb_storage').mean())
    .sort('avg_price_per_gb')
)


In [None]:
# Average specs and price of devices with gpu_tier >= 4 priced between 2000 and 3000.
(
    df
    .filter(
        pl.col('gpu_tier') >= 4,
        pl.col('price').is_between(2000, 3000),
    )
    .select(
        ram_gb_mean=pl.col('ram_gb').mean(),
        vram_gb_mean=pl.col('vram_gb').mean(),
        cpu_cores_mean=pl.col('cpu_cores').mean(),
        price_mean=pl.col('price').mean(),
    )
)

In [None]:
# Same filter as the previous cell (gpu_tier >= 4, price between 2000 and 3000),
# reporting only the average RAM, VRAM and core count.
(
    df
    .filter(
        pl.col('gpu_tier') >= 4,
        pl.col('price').is_between(2000, 3000),
    )
    .select(
        pl.mean('ram_gb').alias('avg_ram_gb'),
        pl.mean('vram_gb').alias('avg_vram_gb'),
        pl.mean('cpu_cores').alias('avg_cpu_cores'),
    )
)

In [None]:
# Look up the available columns before the warranty analysis below.
df.collect_schema()

In [None]:
# Mean and median price plus the count of warranty_months values for each
# warranty length, ordered by warranty length.
(
    df
    .group_by('warranty_months')
    .agg(
        pl.mean('price').alias('avg_prices'),
        pl.median('price').alias('median_prices'),
        pl.col('warranty_months').count().alias('count'),
    )
    .sort('warranty_months')
)

In [None]:
# Correlation coefficient between warranty length and price, extracted as a scalar.
correlation = df.select(
    correlation=pl.corr('warranty_months', 'price')
).item()

print(f"Correlation between warranty_months and price: {correlation:.4f}")

# Scatter of warranty length against price with an OLS trendline overlaid.
fig = px.scatter(
    data_frame=df.to_pandas(),
    x='warranty_months',
    y='price',
    trendline='ols',
    title='Warranty Months vs Price',
    labels={'warranty_months': 'Warranty (Months)', 'price': 'Price ($)'},
    opacity=0.6,
).update_layout(width=1000, height=600)

fig.show()

# Map the coefficient onto a verdict using the fixed cut-offs 0.3, 0.1 and -0.1.
if correlation > 0.3:
    print(
        f"\n✓ Hypothesis SUPPORTED: Strong positive correlation ({correlation:.4f})",
        "Devices with longer warranties tend to be more expensive.",
        sep="\n",
    )
elif correlation > 0.1:
    print(
        f"\n~ Hypothesis PARTIALLY SUPPORTED: Weak positive correlation ({correlation:.4f})",
        "There is a slight tendency for longer warranties to be associated with higher prices.",
        sep="\n",
    )
elif correlation > -0.1:
    print(
        f"\n✗ Hypothesis DISPROVED: No significant correlation ({correlation:.4f})",
        "Warranty length has little to no relationship with price.",
        sep="\n",
    )
else:
    print(
        f"\n✗ Hypothesis DISPROVED: Negative correlation ({correlation:.4f})",
        "Devices with longer warranties tend to be less expensive.",
        sep="\n",
    )



## Summary of Task
Identify price outliers within each device_type group using the IQR method and create visualizations to show these outliers. First create a box plot grouped by device_type, then programmatically identify specific models that exceed the upper outlier threshold (Q3 + 1.5*IQR).



In [None]:
# Box plot of price per device type; points='outliers' draws only the points
# that fall outside the whiskers.
fig = px.box(
    data_frame=df.to_pandas(),
    x='device_type',
    y='price',
    title='Price Distribution by Device Type (with Outliers)',
    labels={
        'device_type': 'Device Type',
        'price': 'Price ($)'
    },
    points='outliers'
)

fig.update_layout(
    width=1000,
    height=600
)

fig.show()

# Per-device-type IQR fence: upper_limit = Q3 + 1.5 * (Q3 - Q1).
# The name `outliers` is kept because later cells reference it, although the
# frame holds the thresholds rather than the outlying rows.
outliers = df.group_by('device_type').agg([
    pl.col('price').quantile(0.25).alias('Q1'),
    pl.col('price').quantile(0.75).alias('Q3')
]).with_columns(
    (pl.col('Q3') - pl.col('Q1')).alias('IQR')
).with_columns(
    (pl.col('Q3') + 1.5 * pl.col('IQR')).alias('upper_limit')
)

# Attach each row's group threshold and keep the rows priced above it,
# ordered by device type and then by price (highest first) within each type.
outlier_devices = df.join(
    outliers,
    on='device_type',
    how='left'
).filter(
    pl.col('price') > pl.col('upper_limit')
).select(
    pl.col('device_type', 'brand', 'model', 'price', 'upper_limit')
).sort(
    by=['device_type', 'price'],
    descending=[False, True]
)

print(f"\nTotal number of outliers found: {outlier_devices.height}")
print("\nTop 10 most expensive outliers:")
# outlier_devices is ordered by device_type first, so taking head(10) directly
# would list only the first device type's rows. Re-sort by price alone so the
# output matches the "most expensive" heading across all device types.
outlier_devices.sort('price', descending=True).head(10)

In [None]:
# Per-device-type price quartiles, IQR and the upper fence Q3 + 1.5 * IQR.
outliers = (
    df
    .group_by('device_type')
    .agg(
        Q1=pl.col('price').quantile(0.25),
        Q3=pl.col('price').quantile(0.75),
    )
    .with_columns(IQR=pl.col('Q3') - pl.col('Q1'))
    .with_columns(upper_limit=pl.col('Q3') + 1.5 * pl.col('IQR'))
)

outliers

In [None]:
# Rows priced above their device type's upper_limit, ordered by device type
# and then by price (highest first) within each type.
(
    df
    .join(outliers, on='device_type', how='left')
    .filter(pl.col('price') > pl.col('upper_limit'))
    .select('device_type', 'brand', 'model', 'price', 'upper_limit')
    .sort(['device_type', 'price'], descending=[False, True])
)

In [76]:
# List the current columns and dtypes, including the derived ones.
df.collect_schema()

Schema([('device_type', String),
        ('brand', String),
        ('model', String),
        ('release_year', Int64),
        ('os', String),
        ('form_factor', String),
        ('cpu_brand', String),
        ('cpu_model', String),
        ('cpu_tier', Int64),
        ('cpu_cores', Int64),
        ('cpu_threads', Int64),
        ('cpu_base_ghz', Float64),
        ('cpu_boost_ghz', Float64),
        ('gpu_brand', String),
        ('gpu_model', String),
        ('gpu_tier', Int64),
        ('vram_gb', Int64),
        ('ram_gb', Int64),
        ('storage_type', String),
        ('storage_gb', Int64),
        ('storage_drive_count', Int64),
        ('display_type', String),
        ('display_size_in', Float64),
        ('resolution', String),
        ('refresh_hz', Int64),
        ('battery_wh', Int64),
        ('charger_watts', Int64),
        ('psu_watts', Int64),
        ('wifi', String),
        ('bluetooth', Float64),
        ('weight_kg', Float64),
        ('warranty_months', 

In [80]:
# Laptops under 1.5 kg with cpu_tier of at least 4 (all three conditions must hold).
df.filter(
    pl.col('weight_kg') < 1.5,
    pl.col('cpu_tier') >= 4,
    pl.col('device_type') == 'Laptop',
)

device_type,brand,model,release_year,os,form_factor,cpu_brand,cpu_model,cpu_tier,cpu_cores,cpu_threads,cpu_base_ghz,cpu_boost_ghz,gpu_brand,gpu_model,gpu_tier,vram_gb,ram_gb,storage_type,storage_gb,storage_drive_count,display_type,display_size_in,resolution,refresh_hz,battery_wh,charger_watts,psu_watts,wifi,bluetooth,weight_kg,warranty_months,price,resolution_width,resolution_height,device_age,cpu_series,price_per_gb_storage
str,str,str,i64,str,str,str,str,i64,i64,i64,f64,f64,str,str,i64,i64,i64,str,i64,i64,str,f64,str,i64,i64,i64,i64,str,f64,f64,i64,f64,i16,i16,i64,str,f64
"""Laptop""","""Dell""","""Dell Creator GIQ""",2024,"""Windows""","""Mainstream""","""Intel""","""Intel i9-14473""",6,26,52,3.0,4.1,"""NVIDIA""","""RTX 40 90""",6,16,128,"""SSD""",1024,1,"""LED""",14.0,"""2560x1600""",60,80,240,,"""Wi-Fi 5""",5.0,1.17,48,2953.99,2560,1600,1,"""i9""",2.88
"""Laptop""","""Dell""","""Dell Nitro 614""",2024,"""Windows""","""Ultrabook""","""AMD""","""AMD Ryzen 7 5302""",4,14,28,2.6,3.6,"""NVIDIA""","""RTX 30 70""",3,8,32,"""SSD""",1024,1,"""LED""",15.6,"""2560x1600""",60,70,180,,"""Wi-Fi 6""",5.1,1.1,24,2109.99,2560,1600,1,"""Ryzen 7""",2.06
"""Laptop""","""Apple""","""Apple Blade 0BN""",2023,"""Windows""","""Mainstream""","""Apple""","""Apple M3 Pro""",5,18,18,2.8,3.6,"""Apple""","""Apple Integrated""",6,6,96,"""NVMe""",1024,1,"""QLED""",14.0,"""1920x1080""",60,60,45,,"""Wi-Fi 6E""",5.1,1.17,24,3421.99,1920,1080,2,"""M3""",3.34
"""Laptop""","""Dell""","""Dell Slim NHG""",2024,"""Windows""","""Mainstream""","""Intel""","""Intel i7-12360""",5,18,36,2.8,3.7,"""AMD""","""RX 7000 80 XT""",5,12,96,"""Hybrid""",1024,1,"""OLED""",17.3,"""1920x1080""",90,70,120,,"""Wi-Fi 6""",5.0,1.44,24,2560.99,1920,1080,1,"""i7""",2.5
"""Laptop""","""Apple""","""Apple Think YDD""",2020,"""Windows""","""Gaming""","""Apple""","""Apple M2""",5,16,16,2.8,3.7,"""Apple""","""Apple Integrated""",5,2,96,"""NVMe""",1024,1,"""OLED""",14.0,"""1920x1080""",120,70,90,,"""Wi-Fi 6E""",5.0,1.17,24,3163.99,1920,1080,5,"""M2""",3.09
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Laptop""","""MSI""","""MSI Air 2F0""",2020,"""Windows""","""Mainstream""","""Intel""","""Intel i7-10761""",4,14,28,2.6,3.7,"""NVIDIA""","""RTX 30 60""",2,6,40,"""NVMe""",1024,1,"""IPS""",16.0,"""1920x1080""",120,70,120,,"""Wi-Fi 6E""",5.0,1.33,12,1736.99,1920,1080,5,"""i7""",1.7
"""Laptop""","""Dell""","""Dell Creator S3L""",2024,"""Windows""","""Ultrabook""","""AMD""","""AMD Ryzen 7 4871""",4,14,28,2.6,3.6,"""AMD""","""RX 6000 80""",4,10,64,"""SSD""",256,1,"""IPS""",15.6,"""3840x2160""",60,50,120,,"""Wi-Fi 6""",5.2,1.1,12,2513.99,3840,2160,1,"""Ryzen 7""",9.82
"""Laptop""","""Apple""","""Apple Slim C5Y""",2023,"""Windows""","""Gaming""","""Apple""","""Apple M1""",4,12,12,2.6,3.5,"""Apple""","""Apple Integrated""",5,6,64,"""NVMe""",2048,1,"""LED""",14.0,"""1920x1080""",60,60,65,,"""Wi-Fi 6E""",5.3,0.97,24,2630.99,1920,1080,2,"""M1""",1.28
"""Laptop""","""HP""","""HP Legion ZCB""",2021,"""Windows""","""Mainstream""","""Intel""","""Intel i7-12249""",5,18,36,2.8,3.9,"""NVIDIA""","""RTX 40 80""",4,10,72,"""NVMe""",512,1,"""LED""",16.0,"""1920x1080""",144,60,65,,"""Wi-Fi 5""",5.0,1.33,36,2397.99,1920,1080,4,"""i7""",4.68


In [81]:
# Weighted score: 0.5 * cpu_tier + 0.3 * gpu_tier + 0.2 * (ram_gb / 32).
# The result is only displayed; it is not assigned back to `df`.
df.with_columns(
    performance_score=(
        pl.col('cpu_tier') * 0.5
        + pl.col('gpu_tier') * 0.3
        + (pl.col('ram_gb') / 32) * 0.2
    )
)

device_type,brand,model,release_year,os,form_factor,cpu_brand,cpu_model,cpu_tier,cpu_cores,cpu_threads,cpu_base_ghz,cpu_boost_ghz,gpu_brand,gpu_model,gpu_tier,vram_gb,ram_gb,storage_type,storage_gb,storage_drive_count,display_type,display_size_in,resolution,refresh_hz,battery_wh,charger_watts,psu_watts,wifi,bluetooth,weight_kg,warranty_months,price,resolution_width,resolution_height,device_age,cpu_series,price_per_gb_storage,performance_score
str,str,str,i64,str,str,str,str,i64,i64,i64,f64,f64,str,str,i64,i64,i64,str,i64,i64,str,f64,str,i64,i64,i64,i64,str,f64,f64,i64,f64,i16,i16,i64,str,f64,f64
"""Desktop""","""Samsung""","""Samsung Forge XDI""",2022,"""Windows""","""ATX""","""Intel""","""Intel i5-11129""",3,12,24,2.8,3.8,"""NVIDIA""","""RTX 40 60""",2,6,16,"""NVMe""",1024,1,"""LED""",27.0,"""2560x1440""",90,,0,750,"""Wi-Fi 6""",5.1,11.0,36,1383.99,2560,1440,3,"""i5""",1.35,2.2
"""Laptop""","""Samsung""","""Samsung Pro KM8""",2022,"""Windows""","""Mainstream""","""Intel""","""Intel i7-11114""",4,12,24,2.6,3.6,"""NVIDIA""","""RTX 40 80""",4,10,64,"""NVMe""",512,1,"""OLED""",16.0,"""1920x1080""",90,56,120,,"""Wi-Fi 6""",5.3,2.03,12,2274.99,1920,1080,3,"""i7""",4.44,3.6
"""Desktop""","""Lenovo""","""Lenovo Strix BIE""",2024,"""macOS""","""SFF""","""AMD""","""AMD Ryzen 5 5168""",2,8,16,2.6,3.6,"""NVIDIA""","""RTX 40 50""",1,4,8,"""NVMe""",512,2,"""LED""",32.0,"""3440x1440""",120,,0,850,"""Wi-Fi 6""",5.0,7.0,24,1879.99,3440,1440,1,"""Ryzen 5""",3.67,1.35
"""Desktop""","""Dell""","""Dell Cube AXR""",2024,"""Windows""","""ATX""","""AMD""","""AMD Ryzen 5 7550""",2,6,12,2.6,3.6,"""AMD""","""RX 7000 60""",2,6,16,"""HDD""",512,2,"""IPS""",27.0,"""3440x1440""",120,,0,650,"""Wi-Fi 6""",5.2,6.0,36,1331.99,3440,1440,1,"""Ryzen 5""",2.6,1.7
"""Laptop""","""Gigabyte""","""Gigabyte Pro IX1""",2024,"""Linux""","""Gaming""","""AMD""","""AMD Ryzen 7 6230""",5,16,32,2.8,3.9,"""NVIDIA""","""RTX 30 80 Ti""",5,12,96,"""NVMe""",256,1,"""Mini-LED""",15.6,"""2560x1600""",90,80,90,,"""Wi-Fi 6""",5.2,1.5,12,2681.99,2560,1600,1,"""Ryzen 7""",10.48,4.6
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""Laptop""","""ASUS""","""ASUS Pro ZWL""",2023,"""Windows""","""Mainstream""","""Intel""","""Intel i7-13721""",4,12,24,2.6,3.7,"""AMD""","""RX 7000 70""",3,8,32,"""HDD""",2048,1,"""LED""",14.0,"""1920x1080""",144,90,180,,"""Wi-Fi 6""",5.1,1.87,24,1712.99,1920,1080,2,"""i7""",0.84,3.1
"""Laptop""","""Lenovo""","""Lenovo Stealth 014""",2018,"""Windows""","""Ultrabook""","""AMD""","""AMD Ryzen 5 5117""",2,8,16,2.2,3.2,"""AMD""","""RX 6000 50""",1,4,8,"""HDD""",1024,1,"""LED""",14.0,"""2560x1600""",90,50,65,,"""Wi-Fi 6""",5.1,1.37,12,1258.99,2560,1600,7,"""Ryzen 5""",1.23,1.35
"""Laptop""","""ASUS""","""ASUS Zen LKD""",2020,"""Windows""","""Mainstream""","""Intel""","""Intel i5-12677""",2,6,10,2.2,3.2,"""NVIDIA""","""RTX 20 60""",2,6,16,"""NVMe""",1024,1,"""OLED""",14.0,"""2560x1600""",120,99,180,,"""Wi-Fi 6""",4.2,1.17,12,1686.99,2560,1600,5,"""i5""",1.65,1.7
"""Laptop""","""ASUS""","""ASUS Blade DH6""",2020,"""Windows""","""Mainstream""","""AMD""","""AMD Ryzen 7 4590""",4,12,24,2.6,3.5,"""NVIDIA""","""RTX 30 70""",3,8,32,"""NVMe""",256,1,"""OLED""",15.6,"""2560x1600""",120,60,90,,"""Wi-Fi 6""",5.3,1.7,24,2164.99,2560,1600,5,"""Ryzen 7""",8.46,3.1
