In [1]:
import pandas as pd
from scipy import stats

# Read the dataset
df = pd.read_csv("data/Toyota.csv")
df.head(5)

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
2,2,13950,24.0,41711,Diesel,90,,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170


In [3]:
#a. Remove missing values
df_cleaned = df.dropna()  # Removes rows with any missing values
df_cleaned  # Display the cleaned data

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
5,5,12950,32.0,61000,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1425,1425,7950,80.0,??,Petrol,86,1.0,0,1300,4,1000
1429,1429,8950,78.0,24000,Petrol,86,1.0,1,1300,5,1065
1430,1430,8450,80.0,23000,Petrol,86,0.0,0,1300,3,1015
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015


In [4]:
# b. Display datatypes and concise summary of all numeric variables
df_dtypes = df_cleaned.dtypes  # Data types of all columns
df_summary = df_cleaned.describe()  # Summary statistics for numeric columns
df_dtypes  # Display data types
df_summary  # Display summary statistics

Unnamed: 0.1,Unnamed: 0,Price,Age,MetColor,Automatic,CC,Weight
count,1111.0,1111.0,1111.0,1111.0,1111.0,1111.0,1111.0
mean,707.868587,10774.9973,55.585959,0.675068,0.053105,1569.463546,1073.870387
std,414.02375,3699.695538,18.829188,0.468561,0.224344,185.935691,52.610513
min,0.0,4350.0,1.0,0.0,0.0,1300.0,1000.0
25%,347.5,8450.0,43.0,0.0,0.0,1400.0,1045.0
50%,703.0,9900.0,60.0,1.0,0.0,1600.0,1070.0
75%,1069.5,11950.0,70.0,1.0,0.0,1600.0,1090.0
max,1435.0,31275.0,80.0,1.0,1.0,2000.0,1615.0


In [5]:
# c. Remove all duplicate records
df_no_duplicates = df_cleaned.drop_duplicates()  # Removes all duplicate rows
df_no_duplicates  # Display data without duplicates

Unnamed: 0.1,Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,0,13500,23.0,46986,Diesel,90,1.0,0,2000,three,1165
1,1,13750,23.0,72937,Diesel,90,1.0,0,2000,3,1165
3,3,14950,26.0,48000,Diesel,90,0.0,0,2000,3,1165
4,4,13750,30.0,38500,Diesel,90,0.0,0,2000,3,1170
5,5,12950,32.0,61000,Diesel,90,0.0,0,2000,3,1170
...,...,...,...,...,...,...,...,...,...,...,...
1425,1425,7950,80.0,??,Petrol,86,1.0,0,1300,4,1000
1429,1429,8950,78.0,24000,Petrol,86,1.0,1,1300,5,1065
1430,1430,8450,80.0,23000,Petrol,86,0.0,0,1300,3,1015
1432,1432,10845,72.0,??,Petrol,86,0.0,0,1300,3,1015


In [6]:
# d. Apply Z-score Normalization on Price column
df_no_duplicates['Price_zscore'] = stats.zscore(df_no_duplicates['Price'])  # Z-score normalization
df_no_duplicates[['Price', 'Price_zscore']]  # Display Price and Z-score normalized Price

Unnamed: 0,Price,Price_zscore
0,13500,0.736880
1,13750,0.804483
3,14950,1.128980
4,13750,0.804483
5,12950,0.588152
...,...,...
1425,7950,-0.763919
1429,8950,-0.493505
1430,8450,-0.628712
1432,10845,0.018930


In [7]:
# e. Shape and reshape using pivot_table
df_pivoted = df_no_duplicates.pivot_table(values='Price', index='FuelType', columns='Age', aggfunc='mean')
df_pivoted  # Display the pivot table

Age,1.0,2.0,4.0,6.0,7.0,8.0,9.0,10.0,11.0,12.0,...,71.0,72.0,73.0,74.0,75.0,76.0,77.0,78.0,79.0,80.0
FuelType,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CNG,,,,,,,,,,,...,6450.0,,,,,,,,5250.0,7460.0
Diesel,,,31137.5,,22950.0,24963.333333,,,23000.0,,...,7050.0,,7266.666667,7087.5,7900.0,7275.0,7625.0,6475.0,7450.0,7825.0
Petrol,18020.0,21125.0,,22500.0,19050.0,20714.285714,18546.666667,18450.0,18625.0,20000.0,...,8026.333333,7792.5,8073.333333,7589.285714,7993.8,8061.111111,8056.37931,7893.939394,8471.315789,7731.023256
