In [1]:
import pandas as pd

# Read dataset
df = pd.read_csv("data/Toyota.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [2]:
# a. Remove duplicate records from dataset and display concise summary
df_no_duplicates = df.drop_duplicates()  # Remove duplicate records
summary = df_no_duplicates.describe()  # Concise summary of the data
summary

Unnamed: 0.1,Unnamed: 0,Price,Age,MetColor,Automatic,CC,Weight
count,1436.0,1436.0,1336.0,1286.0,1436.0,1436.0,1436.0
mean,717.5,10730.824513,55.672156,0.674961,0.05571,1566.827994,1072.45961
std,414.681806,3626.964585,18.589804,0.468572,0.229441,187.182436,52.64112
min,0.0,4350.0,1.0,0.0,0.0,1300.0,1000.0
25%,358.75,8450.0,43.0,0.0,0.0,1400.0,1040.0
50%,717.5,9900.0,60.0,1.0,0.0,1600.0,1070.0
75%,1076.25,11950.0,70.0,1.0,0.0,1600.0,1085.0
max,1435.0,32500.0,80.0,1.0,1.0,2000.0,1615.0


In [3]:
# b. Create Subset selecting columns 'Price', 'Age', 'FuelType' and initial 10 records
subset = df_no_duplicates[['Price', 'Age', 'FuelType']].head(10)
subset

Unnamed: 0,Price,Age,FuelType
0,13500,23.0,Diesel
1,13750,23.0,Diesel
2,13950,24.0,Diesel
3,14950,26.0,Diesel
4,13750,30.0,Diesel
5,12950,32.0,Diesel
6,16900,27.0,Diesel
7,18600,30.0,
8,21500,27.0,Petrol
9,12950,23.0,Diesel


In [4]:
# c. Transpose of this subset
subset_transposed = subset.T  # Transpose the subset
subset_transposed 


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
Price,13500,13750,13950,14950,13750,12950,16900,18600.0,21500,12950
Age,23.0,23.0,24.0,26.0,30.0,32.0,27.0,30.0,27.0,23.0
FuelType,Diesel,Diesel,Diesel,Diesel,Diesel,Diesel,Diesel,,Petrol,Diesel


In [6]:
# d. Apply mean-max normalization on HP column

# Convert HP column to numeric, setting errors='coerce' to convert invalid parsing to NaN
df_no_duplicates['HP'] = pd.to_numeric(df_no_duplicates['HP'], errors='coerce')

# Handle missing values (NaN) by filling with the mean of the column
df_no_duplicates['HP'].fillna(df_no_duplicates['HP'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_no_duplicates['HP'].fillna(df_no_duplicates['HP'].mean(), inplace=True)


In [7]:
# Apply min-max normalization
max_HP = df_no_duplicates['HP'].max()  # Find maximum value in 'HP' column
min_HP = df_no_duplicates['HP'].min()  # Find minimum value in 'HP' column
df_no_duplicates['HP_normalized'] = (df_no_duplicates['HP'] - min_HP) / (max_HP - min_HP)  # Apply min-max normalization


In [9]:
df_no_duplicates[['HP', 'HP_normalized']].head()

Unnamed: 0,HP,HP_normalized
0,90.0,0.170732
1,90.0,0.170732
2,90.0,0.170732
3,90.0,0.170732
4,90.0,0.170732
