# Question 1 Solution

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime

# Load the training dataset from a path relative to the notebook's working directory
df = pd.read_csv('data/train.csv')
# Preview the first rows; as the cell's last expression it is rendered as the cell output
df.head()

## a) Look for the missing values in all the columns and either impute them or drop them. Justify your action.

In [None]:
# Count the missing entries in every column
missing_values = df.isna().sum()
print("Missing values per column:")
print(missing_values)

# Express the same counts as a share of the total number of rows
missing_percent = missing_values.div(len(df)).mul(100)
print("\nPercentage of missing values:")
print(missing_percent)

**Justification:**
- `New_Price`: This column has over 86% missing values. Imputing such a large portion of data would introduce significant bias and is not reliable. Therefore, we will drop this column.
- `Mileage`, `Engine`, `Power`, `Seats`: These columns each have a very small percentage of missing values (less than 1%), so we drop the affected rows rather than impute them. Removing these few rows will not significantly impact the dataset size or the analysis.

In [None]:
# Remove the New_Price column when present; errors='ignore' makes an absent column a no-op
df = df.drop(columns=['New_Price'], errors='ignore')

# Discard every row that still has a missing value in any column
df = df.dropna(axis=0, how='any')

# Confirm that no missing values remain
print(df.isnull().sum())

## b) Remove the units from some of the attributes and only keep the numerical values.

In [None]:
# Function to extract numerical values
def clean_currency(x):
    """Return the leading number of a unit-suffixed string such as '18.2 kmpl'.

    Non-string inputs are returned unchanged. Strings whose first
    whitespace-separated token is not a number (for example 'null bhp'),
    as well as empty or blank strings, yield NaN instead of raising.
    """
    if not isinstance(x, str):
        return x
    tokens = x.split()
    if not tokens:
        # Empty or whitespace-only text carries no value.
        return np.nan
    try:
        return float(tokens[0])
    except ValueError:
        # Placeholder text in front of the unit, e.g. 'null'.
        return np.nan

def _leading_number(value, unit):
    """Return the number in front of `unit` in `value`, or NaN when it cannot be parsed.

    `value` is converted to text first, so entries that do not contain `unit`
    (including missing values) and placeholders such as 'null bhp' both
    come back as NaN.
    """
    text = str(value)
    if unit not in text:
        return np.nan
    try:
        return float(text.split()[0])
    except ValueError:
        return np.nan


# Unit marker expected in the text of each column
unit_by_column = {'Mileage': 'km', 'Engine': 'CC', 'Power': 'bhp'}
for column, unit in unit_by_column.items():
    df[column] = df[column].apply(_leading_number, args=(unit,))

# Drop any rows that became NaN during conversion (e.g. 'null bhp')
df = df.dropna()

print(df[['Mileage', 'Engine', 'Power']].head())
# DataFrame.info() prints its report itself and returns None,
# so wrapping it in print() would emit a stray "None" line.
df.info()

## c) Change the categorical variables (“Fuel_Type” and “Transmission”) into numerical one hot encoded value.

In [None]:
# Encode the two categorical columns as 0/1 indicator columns.
# drop_first=True omits the first level of each variable;
# dtype=int yields integers (0 and 1) instead of booleans.
categorical_columns = ['Fuel_Type', 'Transmission']
df = pd.get_dummies(df, columns=categorical_columns, drop_first=True, dtype=int)

print(df.head())

## d) Create one more feature and add this column to the dataset.

In [None]:
# Derive each car's age as the calendar year at run time minus its 'Year' value
current_year = datetime.date.today().year
df = df.assign(Current_Age=current_year - df['Year'])

print(df.loc[:, ['Year', 'Current_Age']].head())

## e) Perform select, filter, rename, mutate, arrange and summarize with group by operations.

In [None]:
# 1. Select: keep only the columns used in the steps below
selected_df = df.loc[:, ['Name', 'Location', 'Current_Age', 'Price']]

# 2. Filter: cars with Price > 10 Lakh
filtered_df = selected_df.query('Price > 10')

# 3. Rename: give the name and price columns more descriptive labels
column_renames = {'Name': 'Car_Name', 'Price': 'Price_Lakhs'}
renamed_df = filtered_df.rename(columns=column_renames)

# 4. Mutate: derive the price in thousands (Price_Lakhs * 100)
mutated_df = renamed_df.assign(
    Price_Thousands=lambda frame: frame['Price_Lakhs'] * 100
)

# 5. Arrange: most expensive cars first
arranged_df = mutated_df.sort_values('Price_Lakhs', ascending=False)

# 6. Summarize with group by: average price per location
summary_df = arranged_df.groupby('Location', as_index=False)['Price_Lakhs'].mean()

print("Summary of Average Price by Location (for cars > 10 Lakh):")
print(summary_df)