In [30]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import seaborn as sb
import requests


In [31]:
print(pd.__version__)


2.3.3


In [32]:
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
r = requests.get(url)

with open("car_fuel_efficiency.csv", "wb") as f:
    f.write(r.content)

In [33]:
df = pd.read_csv("car_fuel_efficiency.csv")
print(df.shape) # shows number of rows and columns
print(df.head()) # shows first 5 rows as an example

(9704, 11)
   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  170            3.0       159.0     3413.433759   
1                  130            5.0        97.0     3149.664934   
2                  170            NaN        78.0     3079.038997   
3                  220            4.0         NaN     2542.392402   
4                  210            1.0       140.0     3460.870990   

   acceleration  model_year  origin fuel_type         drivetrain  num_doors  \
0          17.7        2003  Europe  Gasoline    All-wheel drive        0.0   
1          17.8        2007     USA  Gasoline  Front-wheel drive        0.0   
2          15.1        2018  Europe  Gasoline  Front-wheel drive        0.0   
3          20.2        2009     USA    Diesel    All-wheel drive        2.0   
4          14.4        2009  Europe  Gasoline    All-wheel drive        2.0   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  

In [34]:
fuel_types = df["fuel_type"].unique()
print(fuel_types)

['Gasoline' 'Diesel']


In [35]:
# Check for missing values
missing_counts = df.isnull().sum()
missing_counts

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [36]:
# Check all unique values in the Origin column
origin_values = df["origin"].unique()
print(origin_values)

# Filter only Asian cars
asia_cars = df[df["origin"] == 'Asia']

# Find the maximum fuel efficiency (mpg)
max_mpg_asia = asia_cars["fuel_efficiency_mpg"].max()
print("Maximum fuel efficiency (Asia):", max_mpg_asia)

['Europe' 'USA' 'Asia']
Maximum fuel efficiency (Asia): 23.759122836520497


In [37]:
'''
Q6. Median value of horsepower
1. Find the median value of horsepower column in the dataset.
2. Next, calculate the most frequent value of the same horsepower column.
3. Use fillna method to fill the missing values in horsepower column with the most frequent value from the previous step.
4. Now, calculate the median value of horsepower once again.
5. Has it changed?
- Yes, it increased
- Yes, it decreased
- No
'''

med_horsepower = np.nanmedian(df["horsepower"])
print("The median value of horsepower column", med_horsepower)

most_frequent_hp = df["horsepower"].mode()[0]
print("Most frequent horsepower value:", most_frequent_hp)

df["horsepower"].fillna(most_frequent_hp, inplace=True)
print(df["horsepower"].isnull().sum())

med_horsepower_updated = np.median(df["horsepower"])
print("The updated median value of horsepower column", med_horsepower_updated)


The median value of horsepower column 149.0
Most frequent horsepower value: 152.0
0
The updated median value of horsepower column 152.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["horsepower"].fillna(most_frequent_hp, inplace=True)


In [None]:
'''
Q7. Sum of weights
1. Select all the cars from Asia
2. Select only columns vehicle_weight and model_year
3. Select the first 7 values
4. Get the underlying NumPy array. Let's call it X.
5. Compute matrix-matrix multiplication between the transpose of X and X. To get the transpose, use X.T. Let's call the result XTX.
6. Invert XTX.
7. Create an array y with values [1100, 1300, 800, 900, 1000, 1100, 1200].
8. Multiply the inverse of XTX with the transpose of X, and then multiply the result by y. Call the result w.
9. What's the sum of all the elements of the result?
Note: You just implemented linear regression. We'll talk about it in the next lesson.

0.051
0.51
5.1
51
'''

In [38]:
# selecting all the cars from Asia
asian_cars = df[df["origin"] == "Asia"].copy()

In [39]:
# selecting only columns vehicle_weight and model_year
asian_cars = asia_cars[["vehicle_weight", "model_year"]]

In [41]:
# selecting the first 7 values
asian_cars_head_7 = asian_cars.head(7)
print(asian_cars_head_7)

    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
34     2844.227534        2014
38     3761.994038        2019


In [43]:
# underlying NumPy array X
X = asian_cars_head_7.to_numpy()  # shape (7, 2)
print(X)

[[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]


In [44]:
# XTX = X^T X
XTX = X.T @ X
print(XTX)

[[62248334.33150761 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]


In [45]:
# initialising y
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])

In [47]:
# inverting XTX
XTX_inv = np.linalg.inv(XTX)
print(XTX_inv)

[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]


In [49]:
# getting w = (X^T X)^{-1} X^T y
w = XTX_inv @ X.T @ y  # shape (2,)
print(w)

[0.01386421 0.5049067 ]


In [50]:
# final result
w_sum = w.sum()
print("w:", w)
print("Sum of elements of w:", w_sum)

w: [0.01386421 0.5049067 ]
Sum of elements of w: 0.5187709081074006
