In [2]:
import pandas as pd

In [3]:
pd.__version__


'2.3.2'

In [4]:
df = pd.read_csv('https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv')

In [5]:
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [6]:
df.count()

engine_displacement    9704
num_cylinders          9222
horsepower             8996
vehicle_weight         9704
acceleration           8774
model_year             9704
origin                 9704
fuel_type              9704
drivetrain             9704
num_doors              9202
fuel_efficiency_mpg    9704
dtype: int64

In [7]:
len(df)

9704

In [8]:
# extract the feul types
unique_fuel_types = df['fuel_type'].unique()
print(unique_fuel_types)

# You can also count how many times each type appears using value_counts()
print("\n--- Counts of Each Fuel Type ---")
print(df['fuel_type'].value_counts())

['Gasoline' 'Diesel']

--- Counts of Each Fuel Type ---
fuel_type
Gasoline    4898
Diesel      4806
Name: count, dtype: int64


In [9]:
# Step 1: Check for Missing Values (Boolean Mask) ---
# The .isna() method returns a Boolean DataFrame of the same shape.
# True means the value is missing (NaN); False means it's present.
missing_mask = df.isna()
print("--- Boolean Mask (True = Missing) ---")
print(missing_mask)
print("\n" + "="*70 + "\n")

# Step 2: Get the Count of Missing Values per Column ---
# We chain .sum() after .isna(). Since True is treated as 1 and False as 0,
# summing down the column (axis=0, which is default) gives the total count of NaNs.
missing_counts_per_column = df.isna().sum()
print("--- Total Missing Values per Column ---")
print(missing_counts_per_column)
print("\n" + "="*70 + "\n")

# Total Missing Values in the Entire DataFrame ---
# By chaining .sum() twice, we sum the missing counts from the previous step.
total_missing_values = df.isna().sum().sum()
print(f"--- Total Missing Values in Entire Dataset: {total_missing_values} ---")
print("\n" + "="*70 + "\n")

# Check if ANY Missing Values Exist (Quick Check) ---
# This returns a single Boolean: True if there is at least one NaN anywhere.
has_any_missing = df.isna().any().any()
print(f"--- Does the DataFrame contain ANY missing values? {has_any_missing} ---")


--- Boolean Mask (True = Missing) ---
      engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                   False          False       False           False   
1                   False          False       False           False   
2                   False           True       False           False   
3                   False          False        True           False   
4                   False          False       False           False   
...                   ...            ...         ...             ...   
9699                False          False       False           False   
9700                False           True       False           False   
9701                False          False       False           False   
9702                False          False       False           False   
9703                False          False       False           False   

      acceleration  model_year  origin  fuel_type  drivetrain  num_doors  \
0            False   

In [10]:
# find the max feul effeciency for cars from esia 

# Filter for Asian Cars ---
# We create a new DataFrame containing only cars where the 'Origin' column is 'Asia'.
asian_cars_df = df[df['origin'] == 'Asia']

# Check for missing values in the fuel_efficiency_mpg column
if df['fuel_efficiency_mpg'].isnull().any():
    print("\nWarning: The 'fuel_efficiency_mpg' column contains missing (NaN) values.")
    # Drop rows with NaN in 'MPG' for accurate max calculation, although .max() typically ignores them.
    df_cleaned = asian_cars_df.dropna(subset=['fuel_efficiency_mpg'])
else:
    df_cleaned = asian_cars_df

# --- Find the Maximum Fuel Efficiency ---
# Calculate the maximum value in the 'MPG' column using the .max() method.
max_mpg = df_cleaned['fuel_efficiency_mpg'].max()

# --- Display the Result ---
print("\n" + "="*50)
print(f"The Maximum Fuel Efficiency (MPG) in the dataset is: {max_mpg:.2f}")
print("="*50)

# Optional: Find the vehicle that achieved this maximum MPG
max_car = df_cleaned[df_cleaned['fuel_efficiency_mpg'] == max_mpg]
print("\nVehicle(s) with the maximum MPG:")
# Reset index for cleaner display, ignoring the old index
print(max_car.reset_index(drop=True))



The Maximum Fuel Efficiency (MPG) in the dataset is: 23.76

Vehicle(s) with the maximum MPG:
   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  330            3.0       136.0     1223.298226   

   acceleration  model_year origin fuel_type         drivetrain  num_doors  \
0           NaN        2001   Asia  Gasoline  Front-wheel drive        1.0   

   fuel_efficiency_mpg  
0            23.759123  


In [11]:
# --- Initial Calculations (Before Imputation) ---

# Check the count of missing values in 'horsepower'
missing_count = df['horsepower'].isna().sum()
print(f"\nTotal missing values in 'horsepower': {missing_count}")

# a. Find the initial median value
initial_median = df['horsepower'].median()
print(f"1. Initial Median Horsepower (before fillna): {initial_median:.2f}")

# b. Calculate the most frequent value (Mode)
# .mode() returns a Series, so we use .iloc[0] to get the first (and most frequent) value.
mode_value = df['horsepower'].mode().iloc[0]
print(f"2. Most Frequent Horsepower (Mode): {mode_value:.2f}")

# --- Impute Missing Values ---

# Create a copy of the DataFrame to avoid the SettingWithCopyWarning
df_filled = df.copy()

# Use fillna to replace missing 'horsepower' values with the mode
df_filled['horsepower'].fillna(mode_value, inplace=True)

# Verification check: Missing values should now be 0
filled_missing_count = df_filled['horsepower'].isna().sum()
print(f"3. Missing values in 'horsepower' after fillna: {filled_missing_count}")


# --- Final Calculation (After Imputation) ---

# c. Find the median value after imputation
new_median = df_filled['horsepower'].median()
print(f"4. New Median Horsepower (after fillna): {new_median:.2f}")

# --- Comparison and Result ---
print("\n" + "="*50)
if new_median > initial_median:
    change = "increased"
elif new_median < initial_median:
    change = "decreased"
else:
    change = "did not change"

print(f"Comparison: Initial Median ({initial_median:.2f}) -> New Median ({new_median:.2f})")
print(f"The median value of horsepower has {change}.")
print("="*50)



Total missing values in 'horsepower': 708
1. Initial Median Horsepower (before fillna): 149.00
2. Most Frequent Horsepower (Mode): 152.00
3. Missing values in 'horsepower' after fillna: 0
4. New Median Horsepower (after fillna): 152.00

Comparison: Initial Median (149.00) -> New Median (152.00)
The median value of horsepower has increased.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_filled['horsepower'].fillna(mode_value, inplace=True)


In [13]:
import numpy as np
# --- First make some Data Filtering and Selection ---

# a. Select all the cars from Asia
asian_cars_df = df[df['origin'] == 'Asia']

# b. Select only columns 'vehicle_weight' and 'model_year'
selected_cols_df = asian_cars_df[['vehicle_weight', 'model_year']]

# c. Select the first 7 values
X_df = selected_cols_df.head(7)
print("\n[STEP 1] Data Matrix X (First 7 Asian cars, weight & year):")
print(X_df)

# d. Get the underlying NumPy array. Let's call it X.
X = X_df.values

# --- Linear Algebra Calculations ---

# e. Compute XTX (Transpose of X multiplied by X)
# X.T @ X performs matrix-matrix multiplication
XTX = X.T @ X
print("\n[STEP 2] XTX Matrix (2x2):")
print(XTX)

# f. Invert XTX
# np.linalg.inv computes the matrix inverse
XTX_inv = np.linalg.inv(XTX)
print("\n[STEP 3] Inverse of XTX:")
print(XTX_inv)

# g. Create an array y
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
print(f"\n[STEP 4] Target Vector y (shape: {y.shape}): {y}")

# h. Compute w = (XTX_inv) @ (X.T) @ y
# This computes the coefficients (w) for the linear regression model
w = XTX_inv @ X.T @ y
print("\n[STEP 5] Coefficient Vector w:")
print(w)

# --- Final Result ---

# i. What's the sum of all the elements of the result?
sum_w = w.sum()

print("\n" + "="*50)
print(f"The Sum of all elements in the coefficient vector (w) is: {sum_w:.3f}")
print("="*50)



[STEP 1] Data Matrix X (First 7 Asian cars, weight & year):
    vehicle_weight  model_year
8      2714.219310        2016
12     2783.868974        2010
14     3582.687368        2007
20     2231.808142        2011
21     2659.431451        2016
34     2844.227534        2014
38     3761.994038        2019

[STEP 2] XTX Matrix (2x2):
[[62248334.33150762 41431216.5073268 ]
 [41431216.5073268  28373339.        ]]

[STEP 3] Inverse of XTX:
[[ 5.71497081e-07 -8.34509443e-07]
 [-8.34509443e-07  1.25380877e-06]]

[STEP 4] Target Vector y (shape: (7,)): [1100 1300  800  900 1000 1100 1200]

[STEP 5] Coefficient Vector w:
[0.01386421 0.5049067 ]

The Sum of all elements in the coefficient vector (w) is: 0.519
