## PHASE 2: DATA PREPROCESSING

### This is the Car Evaluation Dataset from UCI Machine Learning Repository

In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


##### Fetch the dataset from the UCI Machine Learning Repository   
Ensure that you have ucimlrepo installed. If not, install it using the following command:   
```!pip install ucimlrepo```   
Then, fetch the dataset as follows below

In [2]:
from ucimlrepo import fetch_ucirepo

# Download the Car Evaluation dataset (UCI repository id 19).
car_evaluation = fetch_ucirepo(id=19)

# Feature and target tables (as pandas dataframes).
X, y = car_evaluation.data.features, car_evaluation.data.targets

# Show the dataset-level metadata first, then the per-variable information.
print(car_evaluation.metadata)
print(car_evaluation.variables)

{'uci_id': 19, 'name': 'Car Evaluation', 'repository_url': 'https://archive.ics.uci.edu/dataset/19/car+evaluation', 'data_url': 'https://archive.ics.uci.edu/static/public/19/data.csv', 'abstract': 'Derived from simple hierarchical decision model, this database may be useful for testing constructive induction and structure discovery methods.', 'area': 'Other', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 1728, 'num_features': 6, 'feature_types': ['Categorical'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5JP48', 'creators': ['Marko Bohanec'], 'intro_paper': {'title': 'Knowledge acquisition and explanation for multi-attribute decision making', 'authors': 'M. Bohanec, V. Rajkovič', 'published_in': '8th Intl Workshop on Expert Systems and their Applications, Avignon, France', 'yea

##### Creating the dataframe

In [3]:
# Wrap the features and the target in DataFrames, then join them side by side
# so that each row carries its attributes together with its class label.
df = pd.DataFrame(data=X, columns=car_evaluation.data.feature_names)
df_y = pd.DataFrame(data=y, columns=car_evaluation.data.target_names)
car_df = pd.concat(objs=[df, df_y], axis="columns")
car_df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


##### Descriptive Statistics

In [4]:
# Summary statistics per column. All columns are object dtype here, so the
# summary reports count / unique / top / freq rather than numeric moments.
car_df.describe()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
count,1728,1728,1728,1728,1728,1728,1728
unique,4,4,4,3,3,3,4
top,vhigh,vhigh,2,2,small,low,unacc
freq,432,432,432,576,576,576,1210


Check the data types of the columns

In [5]:
# Column dtypes, non-null counts and memory footprint of the combined frame.
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   object
 3   persons   1728 non-null   object
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   class     1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


Check for missing values and outliers

In [6]:
# Number of missing values in each column.
car_df.isna().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
class       0
dtype: int64

In [7]:
# Every column except the last one ('class') is a predictor. Listing the
# distinct levels of each predictor also exposes any unexpected category.
feature_names = car_df.columns[:-1].tolist()
for feature in feature_names:
    levels = car_df[feature]
    print("{}: {} values, {}".format(feature, levels.nunique(), levels.unique()))

buying: 4 values, ['vhigh' 'high' 'med' 'low']
maint: 4 values, ['vhigh' 'high' 'med' 'low']
doors: 4 values, ['2' '3' '4' '5more']
persons: 3 values, ['2' '4' 'more']
lug_boot: 3 values, ['small' 'med' 'big']
safety: 3 values, ['low' 'med' 'high']


Some graphs to visualize the data

Barplots

In [None]:
# Row count per 'buying' level. Title and axis labels let the figure stand on
# its own; the trailing semicolon suppresses the Axes repr in the cell output.
ax = car_df['buying'].value_counts().plot(kind='bar', title="Rows per 'buying' level")
ax.set(xlabel='buying', ylabel='count');

In [None]:
# Row count per 'maint' level. Title and axis labels let the figure stand on
# its own; the trailing semicolon suppresses the Axes repr in the cell output.
ax = car_df['maint'].value_counts().plot(kind='bar', title="Rows per 'maint' level")
ax.set(xlabel='maint', ylabel='count');

Pie Plots

In [None]:
# Share of rows per 'doors' level.
# The labels are taken from the index of the same value_counts() result that
# supplies the wedge sizes. unique() returns levels in order of appearance,
# whereas value_counts() orders by frequency, so pairing the two could attach
# a label to the wrong wedge.
values = car_df['doors'].value_counts()
labels = values.index.tolist()
fig, ax = plt.subplots()
ax.pie(values, labels=labels, autopct='%1.1f%%')
ax.set_title("Distribution of 'doors'");

In [None]:
# Share of rows per 'persons' level.
# The labels are taken from the index of the same value_counts() result that
# supplies the wedge sizes. unique() returns levels in order of appearance,
# whereas value_counts() orders by frequency, so pairing the two could attach
# a label to the wrong wedge.
values = car_df['persons'].value_counts()
labels = values.index.tolist()
fig, ax = plt.subplots()
ax.pie(values, labels=labels, autopct='%1.1f%%')
ax.set_title("Distribution of 'persons'");

### In this phase:
1. We will create dummy variables for the categorical columns then create a new dataframe with the dummy variables. 
2. We will use ordinality to encode the variables in another dataframe
### Then we will compare the accuracy of both

### 1. Encoding the categorical variables as numeric dummy variables. Each dummy variable is either 0 or 1

In [12]:
# One-hot encode every predictor. The first level of each predictor is dropped
# and the indicators are stored as floats. 'class' is not listed in `columns`,
# so it is carried over as-is.
df_encoded = pd.get_dummies(
    data=car_df,
    columns=feature_names,
    drop_first=True,
    dtype=float,
)
df_encoded.tail(5)

Unnamed: 0,class,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
1723,good,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
1724,vgood,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1725,unacc,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1726,good,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1727,vgood,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


Encoding the target variable as integer codes 0–3 (one code per class)

In [13]:
# Integer-encode the target. The codes are computed from the original string
# labels in car_df rather than from df_encoded['class'], so re-running this
# cell yields the same codes instead of factorizing already-encoded integers
# (which would also overwrite class_uniques with integers).
# NOTE: factorize numbers the labels in order of first appearance, so the
# codes are nominal, not ordinal. class_uniques[code] recovers the label.
df_encoded['class'], class_uniques = pd.factorize(car_df['class'])
df_encoded.tail()

Unnamed: 0,class,buying_low,buying_med,buying_vhigh,maint_low,maint_med,maint_vhigh,doors_3,doors_4,doors_5more,persons_4,persons_more,lug_boot_med,lug_boot_small,safety_low,safety_med
1723,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
1724,2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1725,0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
1726,3,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0
1727,2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# Row count per encoded class. Title and axis labels let the figure stand on
# its own; the trailing semicolon suppresses the Axes repr in the cell output.
ax = df_encoded['class'].value_counts().plot(kind='bar', title='Rows per encoded class')
ax.set(xlabel='class code', ylabel='count');

### 2.An alternative way to make the data numerical without using dummy variables. We will use ordinality.

In [15]:
# Work on an independent (deep) copy so the ordinal encoding below leaves
# car_df untouched.
df_alternate = car_df.copy(deep=True)
df_alternate.head(5)

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


Using dictionaries to map the categorical variables to numerical values

In [16]:
# Ordinal encoding: each level is replaced by an integer rank (0 = lowest).
# A plain dict + Series.map is used instead of casting to an ordered
# categorical and calling .replace(): changing categories through replace() is
# deprecated in pandas, and the categorical cast added nothing to the result.
ordinal_maps = {
    'buying':   {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3},
    'maint':    {'low': 0, 'med': 1, 'high': 2, 'vhigh': 3},
    'doors':    {'2': 0, '3': 1, '4': 2, '5more': 3},
    'persons':  {'2': 0, '4': 1, 'more': 2},
    'lug_boot': {'small': 0, 'med': 1, 'big': 2},
    'safety':   {'low': 0, 'med': 1, 'high': 2},
    'class':    {'unacc': 0, 'acc': 1, 'good': 2, 'vgood': 3},
}

for column, level_to_rank in ordinal_maps.items():
    # Map from the original string labels in car_df (df_alternate is a copy of
    # it), so the cell can be re-run without turning encoded values into NaN.
    encoded = car_df[column].map(level_to_rank)
    if encoded.isna().any():
        # A level missing from the dict would silently become NaN; fail loudly.
        unmapped = car_df.loc[encoded.isna(), column].unique().tolist()
        raise ValueError("Unmapped levels in column {!r}: {}".format(column, unmapped))
    df_alternate[column] = encoded.astype('int64')

Change all columns to floats

In [None]:
# Cast every column to float, then show the resulting dtypes and a preview.
all_columns = df_alternate.columns
df_alternate[all_columns] = df_alternate[all_columns].astype(float)
for summary in (df_alternate.dtypes, df_alternate.head()):
    print(summary)

##### Correlation Matrix with dummy variables

In [None]:
# Pairwise Pearson correlations between all columns of the dummy-encoded
# frame, including the integer-coded 'class' column.
correlation_matrix = df_encoded.corr(method='pearson')
correlation_matrix

##### Correlation Matrix with ordinality

In [None]:
# Pairwise Pearson correlations between all columns of the ordinally encoded
# frame.
correlation_matrix_alternate = df_alternate.corr(method='pearson')
correlation_matrix_alternate

In [None]:
# Heatmap of the ordinal-encoding correlation matrix. Drawn on an explicit
# Axes so it can be titled; the trailing semicolon suppresses the repr output.
fig, ax = plt.subplots()
sns.heatmap(correlation_matrix_alternate, annot=True, cmap='Blues', linewidths=2, ax=ax)
ax.set_title('Correlation matrix (ordinal encoding)');

##### Heatmap to visualize the correlation between the columns

In [None]:
# Large canvas so the annotated cells of the dummy-variable correlation matrix
# stay readable.
fig, ax = plt.subplots(figsize=(18, 14))
sns.heatmap(df_encoded.corr(), annot=True, cmap='Blues', ax=ax)
plt.show()

Boxplots to visualize the distribution of the data

In [None]:

# Box plot of the 'buying' dummy columns. The title names the feature group,
# consistent with the other box-plot cells of this notebook.
buying_dummies = ['buying_low', 'buying_med', 'buying_vhigh']
ax = df_encoded[buying_dummies].plot(kind='box', title='Boxplot of buying features')
plt.show()

In [None]:
# Box plot of the 'maint' dummy columns.
maint_dummies = ['maint_low', 'maint_med', 'maint_vhigh']
ax = df_encoded[maint_dummies].plot.box(title='Boxplot of maintenance features')

In [None]:
# Box plot of the 'doors' dummy columns.
doors_dummies = ['doors_3', 'doors_4', 'doors_5more']
ax = df_encoded[doors_dummies].plot.box(title='Boxplot of doors features')

In [None]:
# Box plot of the 'persons' dummy columns.
persons_dummies = ['persons_4', 'persons_more']
ax = df_encoded[persons_dummies].plot.box(title='Boxplot of persons features')

In [None]:
# Box plot of the 'lug_boot' dummy columns.
lug_boot_dummies = ['lug_boot_med', 'lug_boot_small']
ax = df_encoded[lug_boot_dummies].plot.box(title='Boxplot of lug_boot features')

In [None]:
# Box plot of the 'safety' dummy columns.
safety_dummies = ['safety_low', 'safety_med']
ax = df_encoded[safety_dummies].plot.box(title='Boxplot of safety features')

In [None]:
# Distribution of the integer class codes. The codes are consecutive integers
# starting at 0, so one bin is centred on each code instead of spreading a few
# discrete values over 20 mostly empty bins.
class_codes = df_encoded['class']
n_classes = int(class_codes.max()) + 1
fig, ax = plt.subplots()
ax.hist(class_codes, bins=np.arange(n_classes + 1) - 0.5, rwidth=0.8)
ax.set_xticks(range(n_classes))
ax.set(title='Distribution of encoded class labels', xlabel='class code', ylabel='count');

### PHASE 3: FEATURE ENGINEERING

##### 1.USING THE DUMMY VARIABLES DATAFRAME

Feature Selection

In [29]:
## Selecting the best 10 features using correlation matrix in relation to the class.
# 'class' correlates perfectly with itself, so it is removed from the ranking
# first; otherwise it would occupy one of the 10 slots and leave only 9 features.
n_best = 10
best_features = correlation_matrix['class'].abs().drop('class')
# Keep 'class' as the first column, followed by the selected features in
# decreasing order of absolute correlation.
best_features_i = best_features.nlargest(n_best).index.insert(0, 'class')
best_df = df_encoded[best_features_i]

Feature Transformation of the best features using StandardScaler

In [30]:
# Standardize the predictors only (zero mean, unit variance per column).
# 'class' holds integer label codes, so it is carried over unscaled rather
# than being turned into z-scores. Columns and index match best_df.
feature_columns = best_df.columns.drop('class')
scaled_features = StandardScaler().fit_transform(best_df[feature_columns].values)
scaled_features_df = best_df.copy()
scaled_features_df[feature_columns] = scaled_features

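Box-Cox normalization of the best features using PowerTransformer. Box-Cox only accepts strictly positive values, so the features are first rescaled to the range [1, 2] with MinMaxScaler.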

In [32]:
# Box-Cox requires strictly positive input, so the predictors are first
# rescaled into [1, 2]; PowerTransformer then applies Box-Cox and standardizes.
power_features = best_df.drop('class', axis=1)
min_max_scaler = MinMaxScaler(feature_range=(1, 2))
power_transformer = PowerTransformer(method='box-cox', standardize=True)
pipeline = Pipeline(steps=[('s', min_max_scaler), ('p', power_transformer)])
data = pipeline.fit_transform(power_features)
# Label the result with the columns that were actually transformed. 'class' is
# not the last column of best_df, so best_df.columns[:-1] would keep 'class'
# as a label and drop a real feature name, shifting every column label.
power_df = pd.DataFrame(data, index=power_features.index, columns=power_features.columns)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
