In [None]:
import numpy as np
import pandas as pd

In [None]:
# Show the version string of the imported pandas package (handy when reproducing results).
pd.__version__


: 

In [None]:
# Load the car fuel-efficiency dataset directly from its hosted CSV file.
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(DATA_URL)

: 

In [None]:
# Preview the first five rows (5 is also head()'s default) to inspect columns and values.
df.head(n=5)

: 

In [None]:
# Number of non-missing entries in each column (axis=0 is the default, spelled out here).
df.count(axis=0)

In [None]:
# Total number of rows in the dataset.
df.shape[0]

In [None]:
# Extract the distinct fuel types.
# The column is selected once and reused for both summaries below.
fuel_type_column = df['fuel_type']
unique_fuel_types = fuel_type_column.unique()
print(unique_fuel_types)

# value_counts() reports how many rows carry each fuel type.
print("\n--- Counts of Each Fuel Type ---")
print(fuel_type_column.value_counts())

In [None]:
# --- Missing-value overview ---
# All summaries are derived from a single boolean mask so df.isna() is evaluated only once.

# Boolean DataFrame with the same shape as df: True marks a missing (NaN) cell.
missing_mask = df.isna()

# Per-column NaN counts: True sums as 1 and False as 0 down each column (axis=0 default).
missing_counts_per_column = missing_mask.sum()

# Grand total of NaN cells: add up the per-column counts.
total_missing_values = missing_counts_per_column.sum()

# Single boolean: True if at least one NaN exists anywhere in the frame.
has_any_missing = missing_mask.any().any()

# Visual separator printed between the report sections.
section_break = "\n" + "="*70 + "\n"

print("--- Boolean Mask (True = Missing) ---")
print(missing_mask)
print(section_break)

print("--- Total Missing Values per Column ---")
print(missing_counts_per_column)
print(section_break)

print(f"--- Total Missing Values in Entire Dataset: {total_missing_values} ---")
print(section_break)

print(f"--- Does the DataFrame contain ANY missing values? {has_any_missing} ---")


In [None]:
# Find the maximum fuel efficiency for cars from Asia.

# --- Filter for Asian cars ---
# Keep only the rows whose 'origin' column equals 'Asia'.
asian_cars_df = df[df['origin'] == 'Asia']

# --- Handle missing target values ---
# Inspect the filtered subset (not the full dataset): only these rows feed the calculation,
# so a NaN elsewhere in df should not trigger the warning.
if asian_cars_df['fuel_efficiency_mpg'].isna().any():
    print("\nWarning: The 'fuel_efficiency_mpg' column contains missing (NaN) values.")

# .max() already skips NaN; dropping those rows simply keeps df_cleaned free of them.
# When nothing is missing this is equivalent to using asian_cars_df as-is.
df_cleaned = asian_cars_df.dropna(subset=['fuel_efficiency_mpg'])

# --- Find the maximum fuel efficiency ---
max_mpg = df_cleaned['fuel_efficiency_mpg'].max()

# --- Display the result ---
print("\n" + "="*50)
print(f"The Maximum Fuel Efficiency (MPG) for cars from Asia is: {max_mpg:.2f}")
print("="*50)

# Optional: show the vehicle(s) that achieve this maximum.
max_car = df_cleaned[df_cleaned['fuel_efficiency_mpg'] == max_mpg]
print("\nVehicle(s) with the maximum MPG:")
# Reset the index for a cleaner display, discarding the original row labels.
print(max_car.reset_index(drop=True))


In [None]:
# --- Initial Calculations (Before Imputation) ---

# Count the missing values in 'horsepower'.
missing_count = df['horsepower'].isna().sum()
print(f"\nTotal missing values in 'horsepower': {missing_count}")

# a. Median before imputation.
initial_median = df['horsepower'].median()
print(f"1. Initial Median Horsepower (before fillna): {initial_median:.2f}")

# b. Most frequent value (mode).
# .mode() returns a Series, so .iloc[0] picks its first entry.
mode_value = df['horsepower'].mode().iloc[0]
print(f"2. Most Frequent Horsepower (Mode): {mode_value:.2f}")

# --- Impute Missing Values ---

# Work on a copy so the original df stays untouched for the other cells.
df_filled = df.copy()

# Assign the filled column back explicitly. Calling fillna(..., inplace=True) on
# df_filled['horsepower'] operates on a temporary column selection: it raises a
# FutureWarning on pandas 2.x and does not modify df_filled under Copy-on-Write.
df_filled['horsepower'] = df_filled['horsepower'].fillna(mode_value)

# Verification check: missing values should now be 0.
filled_missing_count = df_filled['horsepower'].isna().sum()
print(f"3. Missing values in 'horsepower' after fillna: {filled_missing_count}")


# --- Final Calculation (After Imputation) ---

# c. Median after imputation.
new_median = df_filled['horsepower'].median()
print(f"4. New Median Horsepower (after fillna): {new_median:.2f}")

# --- Comparison and Result ---
print("\n" + "="*50)
if new_median > initial_median:
    change = "increased"
elif new_median < initial_median:
    change = "decreased"
else:
    change = "did not change"

print(f"Comparison: Initial Median ({initial_median:.2f}) -> New Median ({new_median:.2f})")
print(f"The median value of horsepower has {change}.")
print("="*50)


In [None]:
# --- Data Filtering and Selection ---
# This cell relies on numpy being imported as `np` in the notebook's import cell.

# a. Select all the cars from Asia.
asian_cars_df = df[df['origin'] == 'Asia']

# b. Keep only the 'vehicle_weight' and 'model_year' columns.
selected_cols_df = asian_cars_df[['vehicle_weight', 'model_year']]

# c. Take the first 7 rows.
X_df = selected_cols_df.head(7)
print("\n[STEP 1] Data Matrix X (First 7 Asian cars, weight & year):")
print(X_df)

# d. Get the underlying NumPy array, called X.
# .to_numpy() is the recommended accessor; it yields the same array as .values here.
X = X_df.to_numpy()

# --- Linear Algebra Calculations ---

# e. Compute XTX (transpose of X multiplied by X); `@` is matrix multiplication.
XTX = X.T @ X
print("\n[STEP 2] XTX Matrix (2x2):")
print(XTX)

# f. Invert XTX with np.linalg.inv.
XTX_inv = np.linalg.inv(XTX)
print("\n[STEP 3] Inverse of XTX:")
print(XTX_inv)

# g. Create the target array y (one entry per row of X).
y = np.array([1100, 1300, 800, 900, 1000, 1100, 1200])
print(f"\n[STEP 4] Target Vector y (shape: {y.shape}): {y}")

# h. Compute w = (XTX_inv) @ (X.T) @ y, the normal-equation solution
# for the linear regression coefficients.
w = XTX_inv @ X.T @ y
print("\n[STEP 5] Coefficient Vector w:")
print(w)

# --- Final Result ---

# i. Sum of all the elements of w.
sum_w = w.sum()

print("\n" + "="*50)
print(f"The Sum of all elements in the coefficient vector (w) is: {sum_w:.3f}")
print("="*50)
