In [5]:
import pandas as pd
from pandas import DataFrame
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import requests
from typing import Optional

In [18]:
def download_and_save_data(url:str,save_dir:str, filename:str, timeout:float=30.0)-> Optional[str]:
    '''
    Download data from a given URL and save it locally
    Args:
        url: str: URL from where to download the data
        save_dir: str: Directory where to save the data (created if missing)
        filename: str: Name of the file to save the data
        timeout: float: Seconds to wait for the server before giving up

    Returns:
        Optional[str]: Full path where the file was saved, or None when the
        HTTP request fails (connection error, timeout, or error status)
    '''
    os.makedirs(save_dir, exist_ok=True)
    try:
        # Without a timeout, requests.get can block indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Report the cause instead of a generic message.
        print(f'Error downloading data: {e}')
        return None

    file_path = os.path.join(save_dir, filename)
    with open(file_path, 'wb') as f:
        f.write(response.content)
    return file_path

In [20]:
# Download the laptops CSV into ../data; the cell displays the returned path
# (download_and_save_data returns None if the request fails).
url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv'
download_and_save_data(url, '../data', 'laptops.csv')

'../data/laptops.csv'

In [11]:
def load_data_to_data_frame(file_path:str)->Optional[DataFrame]:
    '''
    Load data from a local CSV file path.
    Args:
        - file_path (str): The path to the file to be loaded
    Returns:
        - Optional[DataFrame]: A pandas DataFrame containing the loaded data,
          or None if the file could not be read or parsed
    '''
    try:
        return pd.read_csv(file_path)
    # OSError covers missing/unreadable files; ValueError covers pandas'
    # ParserError / EmptyDataError and text-decoding failures. A bare
    # `except:` would also swallow KeyboardInterrupt and genuine bugs.
    except (OSError, ValueError) as e:
        print(f"Error loading data: {e}")
        return None

In [23]:
#!wget -P ../data/ https://raw.githubusercontent.com/alexeygrigorev/datasets/master/laptops.csv

In [21]:
# load_data_to_data_frame returns None when the CSV cannot be read, so a
# failed load only surfaces when df is first used in a later cell.
df = load_data_to_data_frame('../data/laptops.csv')

In [22]:
df.head()

Unnamed: 0,Laptop,Status,Brand,Model,CPU,RAM,Storage,Storage type,GPU,Screen,Touch,Final Price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [26]:
def standardize_column_names(df:DataFrame)->DataFrame:
    '''
    Standardize the column names of a DataFrame.

    Each name is stripped of surrounding whitespace, lower-cased, and has its
    spaces replaced with underscores. The input DataFrame is not modified.

    Args:
        df (DataFrame): The DataFrame whose column names should be standardized
    Returns:
        DataFrame: A new DataFrame with standardized column names
    '''
    clean_names = df.columns.str.strip().str.lower().str.replace(' ', '_')
    # set_axis returns a new object; assigning to df.columns would silently
    # rename the caller's frame as a side effect.
    return df.set_axis(clean_names, axis=1)

In [27]:
df = standardize_column_names(df)
df.head()

Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
0,ASUS ExpertBook B1 B1502CBA-EJ0436X Intel Core...,New,Asus,ExpertBook,Intel Core i5,8,512,SSD,,15.6,No,1009.0
1,Alurin Go Start Intel Celeron N4020/8GB/256GB ...,New,Alurin,Go,Intel Celeron,8,256,SSD,,15.6,No,299.0
2,ASUS ExpertBook B1 B1502CBA-EJ0424X Intel Core...,New,Asus,ExpertBook,Intel Core i3,8,256,SSD,,15.6,No,789.0
3,MSI Katana GF66 12UC-082XES Intel Core i7-1270...,New,MSI,Katana,Intel Core i7,16,1000,SSD,RTX 3050,15.6,No,1199.0
4,HP 15S-FQ5085NS Intel Core i5-1235U/16GB/512GB...,New,HP,15S,Intel Core i5,16,512,SSD,,15.6,No,669.01


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2160 entries, 0 to 2159
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   laptop        2160 non-null   object 
 1   status        2160 non-null   object 
 2   brand         2160 non-null   object 
 3   model         2160 non-null   object 
 4   cpu           2160 non-null   object 
 5   ram           2160 non-null   int64  
 6   storage       2160 non-null   int64  
 7   storage_type  2118 non-null   object 
 8   gpu           789 non-null    object 
 9   screen        2156 non-null   float64
 10  touch         2160 non-null   object 
 11  final_price   2160 non-null   float64
dtypes: float64(2), int64(2), object(8)
memory usage: 202.6+ KB


### Q1. Pandas version

In [29]:
pd.__version__

'2.2.2'

### Q2. Records count

In [30]:
def shape(df:DataFrame)->tuple:
    '''
    Report the dimensions of a DataFrame.
    Args:
        df (DataFrame): The DataFrame to measure
    Returns:
        tuple: The frame's ``shape`` attribute, i.e. (row count, column count)
    '''
    dimensions = df.shape
    return dimensions
shape(df)

(2160, 12)

### Q3. Laptop brands
How many laptop brands are presented in the dataset?

In [37]:
def number_unique_brands(df:DataFrame,columns:str)->int:
    '''
    Count the distinct values found in one column of a DataFrame.
    Args:
        df (DataFrame): The DataFrame to inspect
        columns (str): Name of the column whose distinct values are counted
    Returns:
        int: The number of unique values in that column, as reported by
        ``nunique()``
    Raises:
        ValueError: If the requested column is not in the DataFrame
    '''
    # Guard clause: reject unknown column names before touching the data.
    if columns not in df.columns:
        raise ValueError(f'{columns} not in DataFrame columns')
    selected = df[columns]
    return selected.nunique()
number_unique_brands(df, 'brand')

27

### Q4. Missing values
How many columns in the dataset have missing values?

In [39]:
def count_missing_values(df:DataFrame)->int:
    '''
    Count how many columns of a DataFrame contain at least one missing value.
    Args:
        df (DataFrame): The DataFrame to inspect for missing values
    Returns:
        int: The number of columns with missing values.
    '''
    # One boolean per column: True when that column holds any null entry.
    has_missing = df.isnull().any()
    return int(has_missing.sum())
count_missing_values(df)


3

### Q5. Maximum final price
What's the maximum final price of Dell notebooks in the dataset?



In [40]:
def max_price_brand(df:DataFrame, brand:str)->float:
    '''
    Find the highest final price among the rows of one brand.
    Args:
        df (DataFrame): Frame with 'brand' and 'final_price' columns
        brand (str): Brand value to match exactly against the 'brand' column
    Returns:
        float: The maximum 'final_price' over the matching rows
    '''
    is_brand = df['brand'] == brand
    brand_prices = df.loc[is_brand, 'final_price']
    return brand_prices.max()

In [44]:
# Interpolate the value directly; the original f-string had no placeholders.
print(f"The maximum final price of Dell notebooks is: {max_price_brand(df, 'Dell')}")

The maximum final price of Dell notebooks is: 3936.0


### Q6. Median value of Screen
<ol>
<li>Find the median value of Screen column in the dataset.
<li>Next, calculate the most frequent value of the same Screen column.
<li>Use fillna method to fill the missing values in Screen column with the most frequent value from the previous step.
<li>Now, calculate the median value of Screen once again.
</ol>

Has it changed?

Hint: refer to existing mode and median functions to complete the task.

<ul>
<li>Yes</li>
<li>No</li>
</ul>


In [36]:
# Find the median value of the screen column in the dataset
df['screen'].median()

15.6

In [37]:
# calculate the most frequent value in the screen column
df['screen'].mode()

0    15.6
Name: screen, dtype: float64

In [38]:
# Use fillna to fill the missing values in the screen column with the most
# frequent value. mode() returns a Series, so [0] takes its first entry.
# NOTE: this overwrites df['screen'] in place for every cell run afterwards.
df['screen'] = df['screen'].fillna(df['screen'].mode()[0])

In [42]:
# check if the missing values have been filled
df['screen'].isnull().sum()

0

In [44]:
# Recalculate the median (and the mode) of the screen column after filling
df['screen'].median(), df['screen'].mode()

(15.6,
 0    15.6
 Name: screen, dtype: float64)

### Q7. Sum of weights
<ol>
<li>Select all the "Innjoo" laptops from the dataset.
<li>Select only columns RAM, Storage, Screen.
<li>Get the underlying NumPy array. Let's call it X.
<li>Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
<li>Compute the inverse of XTX.
<li>Create an array y with values [1100, 1300, 800, 900, 1000, 1100].
<li>Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
<li>What's the sum of all the elements of the result?
</ol>
<p>Note: You just implemented linear regression. We'll talk about it in the next lesson.

<ul>
<li>0.43</li>
<li>45.29</li>
<li>45.58</li>
<li>91.30</li>
</ul>


In [46]:
# Select all the "Innjoo" laptops from the dataset
# (the comparison is an exact, case-sensitive match on the brand column)
innjoo_laptops = df[df['brand'] == 'Innjoo']
innjoo_laptops


Unnamed: 0,laptop,status,brand,model,cpu,ram,storage,storage_type,gpu,screen,touch,final_price
1478,InnJoo Voom Excellence Intel Celeron N4020/8GB...,New,Innjoo,Voom,Intel Celeron,8,256,SSD,,15.6,No,311.37
1479,InnJoo Voom Excellence Pro Intel Celeron N4020...,New,Innjoo,Voom,Intel Celeron,8,512,SSD,,15.6,No,392.55
1480,Innjoo Voom Intel Celeron N3350/4GB/64GB eMMC/...,New,Innjoo,Voom,Intel Celeron,4,64,eMMC,,14.1,No,251.4
1481,Innjoo Voom Laptop Max Intel Celeron N3350/6GB...,New,Innjoo,Voom,Intel Celeron,6,64,eMMC,,14.1,No,383.61
1482,Innjoo Voom Laptop Pro Intel Celeron N3350/6GB...,New,Innjoo,Voom,Intel Celeron,6,128,SSD,,14.1,No,317.02
1483,Innjoo Voom Pro Intel Celeron N3350/6GB/128GB ...,New,Innjoo,Voom,Intel Celeron,6,128,eMMC,,14.1,No,431.38


In [149]:
# Select only  columns ram,storage, screen
columns_name = ['ram','storage','screen']
df_new = innjoo_laptops[columns_name]
df_new.head()

Unnamed: 0,ram,storage,screen
1478,8,256,15.6
1479,8,512,15.6
1480,4,64,14.1
1481,6,64,14.1
1482,6,128,14.1


In [151]:
# Get the underlying NumPy array. Let's call it X
X = df_new.values

In [165]:
X.shape

(6, 3)

In [152]:
# Compute matrix-matrix multiplication between the transpose of X and X. Let's call the result XTX
XTX = X.T @ X

In [153]:
# Compute the inverse of XTX.
# Uses np.linalg.inv via the `np` import from the first cell, so this cell
# no longer needs its own mid-notebook import.
XTX_inv = np.linalg.inv(XTX)
XTX_inv

array([[ 2.78025381e-01, -1.51791334e-03, -1.00809855e-01],
       [-1.51791334e-03,  1.58286725e-05,  4.48052175e-04],
       [-1.00809855e-01,  4.48052175e-04,  3.87214888e-02]])

In [154]:
# Create an array y with values [1100, 1300, 800, 900, 1000, 1100].
y = np.array([1100, 1300, 800, 900, 1000, 1100])
y.shape

(6,)

In [155]:
# Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
# In matrix form: w = inverse(XTX) · X.T · y; the parentheses only make the evaluation order explicit.
w = (XTX_inv @ X.T) @ y


In [156]:
w

array([45.58076606,  0.42783519, 45.29127938])

In [167]:
# what is the sum of all the values of w?
w_sum =  np.sum(w)
w_sum.round(3)

91.3