# Machine Learning Zoomcamp - Intro

In [2]:
!pip install numpy
!pip install pandas



In [3]:
import pandas as pd
import numpy as np

In [4]:
#Q1
# Report which pandas release this kernel has loaded.
pandas_version = pd.__version__
print(pandas_version)

2.3.2


In [5]:
#Q2
# Read the car fuel-efficiency CSV into `df` and preview the first rows.
dataset_path = "data/car_fuel_efficiency.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [6]:
# Number of rows, taken from the first entry of the frame's shape.
record_count = df.shape[0]
print("Number of records:", record_count)

Number of records: 9704


In [9]:
# (rows, columns) of the loaded frame.
dataset_shape = df.shape
print("Shape of the dataset:", dataset_shape)

Shape of the dataset: (9704, 11)


In [10]:
# Print a per-column summary (dtype and non-null count) plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9704 entries, 0 to 9703
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   engine_displacement  9704 non-null   int64  
 1   num_cylinders        9222 non-null   float64
 2   horsepower           8996 non-null   float64
 3   vehicle_weight       9704 non-null   float64
 4   acceleration         8774 non-null   float64
 5   model_year           9704 non-null   int64  
 6   origin               9704 non-null   object 
 7   fuel_type            9704 non-null   object 
 8   drivetrain           9704 non-null   object 
 9   num_doors            9202 non-null   float64
 10  fuel_efficiency_mpg  9704 non-null   float64
dtypes: float64(6), int64(2), object(3)
memory usage: 834.1+ KB


In [14]:
#Q3
# Pull the column out once; both summaries below are derived from it.
fuel_type_column = df['fuel_type']

# Distinct fuel types, in order of first appearance, and how many there are
fuel_types_list = fuel_type_column.unique()
print(f"fuel types : {fuel_types_list}")
print(f"Count of fuel types : {len(fuel_types_list)}")

# Number of rows per fuel type
fuel_type_counts = fuel_type_column.value_counts()
print(fuel_type_counts)


fuel types : ['Gasoline' 'Diesel']
Count of fuel types : 2
fuel_type
Gasoline    4898
Diesel      4806
Name: count, dtype: int64


In [16]:
#Q4
# Count the nulls in every column once and reuse that Series below.
missing_per_column = df.isnull().sum()

print("Missing values per column:")
print(missing_per_column)

# Grand total of nulls across the whole frame
total_missing = missing_per_column.sum()
print(f"\nTotal missing values: {total_missing}")

# How many columns contain at least one null
missing_columns_count = (missing_per_column > 0).sum()
print(f"Number of columns with missing values: {missing_columns_count}")


Missing values per column:
engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

Total missing values: 2622
Number of columns with missing values: 4


In [17]:
#Q5
# Boolean mask marking the rows whose origin is Asia
is_from_asia = df['origin'] == 'Asia'
asia_cars = df[is_from_asia]

# Highest fuel efficiency among those rows
asia_fuel_efficiency = asia_cars['fuel_efficiency_mpg']
max_fuel_eff_asia = asia_fuel_efficiency.max()

print(f"Maximum fuel efficiency of cars from Asia: {max_fuel_eff_asia}")


Maximum fuel efficiency of cars from Asia: 23.759122836520497


In [19]:
#Q6
# The imputed values go into a separate Series instead of being written back
# into df. That keeps this cell idempotent: if it wrote to df['horsepower'],
# re-running it would compute the "before" median on already-filled data.
# df itself is left unchanged; use `horsepower_filled` wherever the imputed
# column is needed.
horsepower_raw = df['horsepower']

# 1. Median value of horsepower before filling missing values
#    (median() ignores NaN by default)
median_before = horsepower_raw.median()
print(f"Median horsepower before filling: {median_before}")

# 2. Most frequent (mode) value of horsepower
#    mode() returns a Series because there can be ties; take its first entry.
most_frequent = horsepower_raw.mode()[0]
print(f"Most frequent horsepower value: {most_frequent}")

# 3. Fill missing values with the most frequent value (returns a new Series)
horsepower_filled = horsepower_raw.fillna(most_frequent)

# 4. Median value of horsepower after filling
median_after = horsepower_filled.median()
print(f"Median horsepower after filling: {median_after}")

# 5. Check if median changed
if median_after > median_before:
    print("Yes, it increased")
elif median_after < median_before:
    print("Yes, it decreased")
else:
    print("No")


Median horsepower before filling: 152.0
Most frequent horsepower value: 152.0
Median horsepower after filling: 152.0
No


In [20]:
#Q7
# 1-3. Narrow the frame step by step: Asian cars only, then the two feature
#      columns, then the first 7 rows.
asia_cars = df.loc[df['origin'] == 'Asia']
asia_subset = asia_cars.loc[:, ['vehicle_weight', 'model_year']]
asia_first7 = asia_subset.head(7)

# 4. Feature matrix as a plain NumPy array
X = asia_first7.to_numpy()
print("X:\n", X)

# 5. Gram matrix X^T X
XTX = np.matmul(X.T, X)
print("\nXTX:\n", XTX)

# 6. Inverse of the Gram matrix
XTX_inv = np.linalg.inv(XTX)
print("\nXTX inverse:\n", XTX_inv)

# 7. Target vector, one value per selected row
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
print("\ny:", y)

# 8. Weights from the normal equation, evaluated left to right:
#    (XTX_inv @ X.T) first, then multiplied by y
w = (XTX_inv @ X.T) @ y
print("\nw:", w)

# 9. Sum of the weight vector's elements
sum_w = np.sum(w)
print(f"\nSum of all elements in w: {sum_w}")


X:
 [[2714.21930965 2016.        ]
 [2783.86897424 2010.        ]
 [3582.68736772 2007.        ]
 [2231.8081416  2011.        ]
 [2659.43145076 2016.        ]
 [2844.22753389 2014.        ]
 [3761.99403819 2019.        ]]

XTX:
 [[62248334.33150762 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]

XTX inverse:
 [[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]

y: [1100 1300  800  900 1000 1100 1200]

w: [0.01386421 0.5049067 ]

Sum of all elements in w: 0.5187709081074016
