In [1]:
# Look for missing values in all the columns and either impute them (replace with the mean, median, or mode) or drop them. Justify your action for this task.

import pandas as pd
import datetime
import re

# Load the dataset into a DataFrame.
# NOTE(review): the path is relative, so 'train.csv' is looked up in the
# current working directory. The printed output of this cell shows an
# "Unnamed: 0" column, which suggests the CSV carries a saved index column;
# consider index_col=0 if that column is not wanted (confirm against the file).
data = pd.read_csv('train.csv')

def extract_numeric_value(text):
    """Return the first unsigned number found in *text* as a float.

    The value is converted with ``str()`` before searching, so numeric
    inputs are accepted as well as strings such as ``"19.67 kmpl"``.
    Returns ``None`` when *text* is missing (``None``/NaN) or contains no
    digits at all.
    """
    # Guard clause: missing values have nothing to extract.
    if pd.isna(text):
        return None

    # Decimal alternative comes first so "19.67" is not truncated to "19".
    found = re.search(r'(\d+\.\d+|\d+)', str(text))
    return float(found.group(0)) if found else None

# Count missing values per column; kept in a variable so it can be inspected
# in the notebook when deciding between dropping and imputing.
missing_values = data.isnull().sum()

# Columns with a significant share of missing values can be dropped instead
# of imputed. "New_Price" is treated that way here and removed entirely.
data.drop(columns=["New_Price"], inplace=True)

# Task
# Remove the units from some of the attributes and only keep the numerical
# values (for example remove kmpl from "Mileage", CC from "Engine", bhp from
# "Power").
#
# This has to happen BEFORE imputation, because a median can only be computed
# on numeric data. extract_numeric_value accepts both integer and decimal
# readings ("100 bhp" as well as "88.76 bhp"); a decimals-only pattern would
# turn every integer reading into NaN and silently replace it by the median.
# Entries without any digits (e.g. "null bhp") come back as None and end up
# as NaN after the float cast, so they are imputed below.
for column in ("Mileage", "Engine", "Power"):
    data[column] = data[column].apply(extract_numeric_value).astype(float)

# Impute the remaining gaps with the column median (robust against outliers).
# The result is assigned back instead of calling fillna(inplace=True) on the
# selected column: the in-place form operates on an intermediate object, is
# deprecated in pandas 2.x and does not update the frame under Copy-on-Write.
for column in ("Mileage", "Engine", "Power", "Seats"):
    data[column] = data[column].fillna(data[column].median())

# Change the categorical variables ("Fuel_Type" and "Transmission") into
# numerical one hot encoded values. dtype=int keeps the indicator columns as
# 0/1 on every pandas version (pandas 2.x would otherwise produce booleans).
data = pd.get_dummies(data, columns=["Fuel_Type", "Transmission"], dtype=int)

# Create one more feature and add this column to the dataset: the current age
# of the car, i.e. the current calendar year minus the "Year" value.
# Note that the result depends on the date the script is run.
current_year = datetime.datetime.now().year
data['Current_Age'] = current_year - data['Year']

# Display the modified dataset
print(data.head())

# Save the modified dataset to a new CSV file if needed
data.to_csv("modified_used_cars_data.csv", index=False)

   Unnamed: 0                              Name    Location  Year  \
0           1  Hyundai Creta 1.6 CRDi SX Option        Pune  2015   
1           2                      Honda Jazz V     Chennai  2011   
2           3                 Maruti Ertiga VDI     Chennai  2012   
3           4   Audi A4 New 2.0 TDI Multitronic  Coimbatore  2013   
4           6            Nissan Micra Diesel XV      Jaipur  2013   

   Kilometers_Driven Owner_Type  Mileage  Engine   Power  Seats  Price  \
0              41000      First    19.67  1582.0  126.20    5.0  12.50   
1              46000      First    18.19  1199.0   88.70    5.0   4.50   
2              87000      First    20.77  1248.0   88.76    7.0   6.00   
3              40670     Second    15.20  1968.0  140.80    5.0  17.74   
4              86999      First    23.08  1461.0   63.10    5.0   3.50   

   Fuel_Type_Diesel  Fuel_Type_Electric  Fuel_Type_Petrol  \
0                 1                   0                 0   
1                 