In [1]:
## Installing library

!pip install klib

In [2]:
## Importing libraries

import pandas as pd
import numpy as np
import klib

In [3]:
## Loading Data

# Location of the automobile "imports-85" data file in the UCI repository
auto_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'

# Column labels, passed explicitly to the reader via `names=` (in file order)
column_names = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    'city-mpg', 'highway-mpg', 'price',
]

# Read the file into a DataFrame and turn every '?' cell into NaN in one chain
df = pd.read_csv(auto_data_url, names=column_names).replace('?', np.nan)

In [4]:
## Converting columns to Numeric type
# Columns expected to hold numbers (some may have been parsed as strings).
numeric_cols = ['normalized-losses', 'engine-size', 'horsepower', 'peak-rpm', 'bore', 'stroke', 'price']

# Convert column by column (DataFrame.apply's default axis=0) so each column keeps
# its own dtype. A row-wise apply (axis=1) would call pd.to_numeric once per row
# and give all selected values in that row a shared dtype, upcasting integer
# columns to float. errors='coerce' turns unparseable values into NaN.
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

In [5]:
## Getting dataframe column information (non-null counts and dtypes per column)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  164 non-null    float64
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       203 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [6]:
## Getting the list of column labels of the dataframe
df.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price'],
      dtype='object')

## klib.clean - functions for cleaning datasets

In [7]:
## Cleaning dataframe
## 1. Remove Duplicates
## 2. Remove Empty Rows and Columns
## 3. Remove columns having a single/constant value
## 4. Adjust column data types (to reduce memory of a dataframe)
## 5. Standardize column names (e.g. 'normalized-losses' -> 'normalized_losses', see output)

df_cleaned = klib.data_cleaning(df) # performs data cleaning (drop duplicates & empty rows/cols, adjust dtypes,...)
# Inspect the cleaned frame: column names, non-null counts and the adjusted dtypes
df_cleaned.info()

Shape of cleaned data: (205, 26)Remaining NAs: 59

Changes:
Dropped rows: 0
     of which 0 duplicates. (Rows: [])
Dropped columns: 0
     of which 0 single valued.     Columns: []
Dropped missing values: 0
Reduced memory by at least: 0.02 MB (-50.0%)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   symboling          205 non-null    int8    
 1   normalized_losses  164 non-null    Float32 
 2   make               205 non-null    string  
 3   fuel_type          205 non-null    category
 4   aspiration         205 non-null    category
 5   num_of_doors       203 non-null    category
 6   body_style         205 non-null    category
 7   drive_wheels       205 non-null    category
 8   engine_location    205 non-null    category
 9   wheel_base         205 non-null    Float32 
 10  length             205 non-null    Float32 
 11  

In [8]:
## If we want, we can separately call the function for standardizing the column names
## By default this is already taken care of inside the 'data_cleaning()' function
## NOTE(review): recorded outputs of later cells show underscore-style names on frames
## derived from `df`, which suggests `df` itself gets renamed in place by this call
## (or by data_cleaning) - confirm; pass `df.copy()` if the original labels must be kept.

df_changed_column_names = klib.clean_column_names(df) # cleans and standardizes column names, also called inside data_cleaning()
# Show the standardized column labels
df_changed_column_names.columns

Index(['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration',
       'num_of_doors', 'body_style', 'drive_wheels', 'engine_location',
       'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_type',
       'num_of_cylinders', 'engine_size', 'fuel_system', 'bore', 'stroke',
       'compression_ratio', 'horsepower', 'peak_rpm', 'city_mpg',
       'highway_mpg', 'price'],
      dtype='object')

In [9]:
## If we want, we can separately call the function for converting the column data types
## By default this is already taken care of inside the 'data_cleaning()' function

df_changed_data_types = klib.convert_datatypes(df) # converts existing to more efficient dtypes, also called inside data_cleaning()
# Inspect the resulting dtypes (the recorded output shows e.g. int8, Float32, string, category)
df_changed_data_types.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype   
---  ------             --------------  -----   
 0   symboling          205 non-null    int8    
 1   normalized_losses  164 non-null    Float32 
 2   make               205 non-null    string  
 3   fuel_type          205 non-null    category
 4   aspiration         205 non-null    category
 5   num_of_doors       203 non-null    category
 6   body_style         205 non-null    category
 7   drive_wheels       205 non-null    category
 8   engine_location    205 non-null    category
 9   wheel_base         205 non-null    Float32 
 10  length             205 non-null    Float32 
 11  width              205 non-null    Float32 
 12  height             205 non-null    Float32 
 13  curb_weight        205 non-null    int16   
 14  engine_type        205 non-null    category
 15  num_of_cylinders   205 non-null    category
 16  engine_s

In [10]:
## If we want, we can separately call the function for dropping the missing values
## By default this is already taken care of inside the 'data_cleaning()' function

df_subset_missing = klib.drop_missing(df) # drops missing values, also called in data_cleaning()
# Compare the shape with the original: the recorded output (205, 26) shows that
# nothing was dropped with the default settings
df_subset_missing.shape

(205, 26)

In [11]:
## Dropping columns having a high ratio of missing values

df_subset_columns = klib.mv_col_handling(df) # drops features with high ratio of missing vals based on informational content
# The recorded output (205, 26) shows that no column was dropped with the default settings
df_subset_columns.shape

(205, 26)

In [12]:
## Identify a subset of columns with duplicated value combinations and pool them
## (in the recorded output, the result keeps a few original columns plus a new 'pooled_vars' column)
df_column_subset_duplicates = klib.pool_duplicate_subsets(df) # pools subset of cols based on duplicates with min. loss of information
# Preview the first 50 rows of the pooled frame
df_column_subset_duplicates.head(50)

Unnamed: 0,body_style,curb_weight,city_mpg,highway_mpg,price,pooled_vars
0,convertible,2548,21,27,13495.0,0
1,convertible,2548,21,27,16500.0,0
2,hatchback,2823,19,26,16500.0,2
3,sedan,2337,24,30,13950.0,3
4,sedan,2824,18,22,17450.0,4
5,sedan,2507,19,25,15250.0,5
6,sedan,2844,19,25,17710.0,6
7,wagon,2954,19,25,18920.0,7
8,sedan,3086,17,20,23875.0,8
9,hatchback,3053,16,22,,9
