### import numpy, pandas

In [1]:
import numpy as np
import pandas as pd

### load data

In [2]:
data= pd.read_csv("../DataSets/cars93.csv")

## Step 1: First look at data

### 1. Shape & columns

In [5]:
data.shape

(93, 10)

In [4]:
data.columns

Index(['Manufacturer', 'Model', 'Type', 'Price', 'MPG.city', 'AirBags',
       'Horsepower', 'Passengers', 'Rear.seat.room', 'Luggage.room'],
      dtype='object')

### 2. Sample rows

In [10]:
data.head()

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room
0,Acura,Integra,Small,15.9,25,Driver only,140,5,26.5,11.0
1,Acura,Legend,Midsize,33.9,18,Driver & Passenger,200,5,30.0,15.0
2,Audi,90,Compact,29.1,20,Driver only,172,5,28.0,14.0
3,Audi,100,Midsize,37.7,19,Driver & Passenger,172,6,31.0,17.0
4,BMW,535i,Midsize,30.0,22,Driver only,208,4,27.0,13.0


In [8]:
data.tail()

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room
88,Volkswagen,Eurovan,Van,19.7,17,Driver & Passenger,109,7,34.0,
89,Volkswagen,Passat,Compact,20.0,21,Driver & Passenger,134,5,31.5,14.0
90,Volkswagen,Corrado,Sporty,23.3,18,Driver & Passenger,178,4,26.0,15.0
91,Volvo,240,Compact,22.7,21,Driver only,114,5,29.5,14.0
92,Volvo,850,Midsize,26.7,20,Driver & Passenger,168,5,30.0,15.0


In [9]:
data.sample(5)

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room
48,Lexus,ES300,Midsize,28.0,18,Driver only,185,5,27.5,14.0
30,Ford,Festiva,Small,7.4,31,Driver only,63,4,26.0,12.0
86,Toyota,Previa,Van,22.7,18,Driver only,138,7,35.0,
17,Chevrolet,Caprice,Large,18.8,17,Driver only,170,6,29.5,20.0
82,Suzuki,Swift,Small,8.6,39,,70,4,27.5,10.0


### 3. Data info (MOST IMPORTANT)

In [23]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Manufacturer    93 non-null     object 
 1   Model           93 non-null     object 
 2   Type            93 non-null     object 
 3   Price           93 non-null     float64
 4   MPG.city        93 non-null     int64  
 5   AirBags         93 non-null     object 
 6   Horsepower      93 non-null     int64  
 7   Passengers      93 non-null     int64  
 8   Rear.seat.room  91 non-null     float64
 9   Luggage.room    82 non-null     float64
dtypes: float64(3), int64(3), object(4)
memory usage: 7.4+ KB


## Step 2: Identify missing values (DON’T DROP

### Pandas methods

In [13]:
data.isna().sum()

Manufacturer       0
Model              0
Type               0
Price              0
MPG.city           0
AirBags           14
Horsepower         0
Passengers         0
Rear.seat.room     2
Luggage.room      11
dtype: int64

In [14]:
data.isnull().sum()

Manufacturer       0
Model              0
Type               0
Price              0
MPG.city           0
AirBags           14
Horsepower         0
Passengers         0
Rear.seat.room     2
Luggage.room      11
dtype: int64

### Percentage of missing data

In [15]:
(data.isna().sum() / len(data)) * 100

Manufacturer       0.000000
Model              0.000000
Type               0.000000
Price              0.000000
MPG.city           0.000000
AirBags           15.053763
Horsepower         0.000000
Passengers         0.000000
Rear.seat.room     2.150538
Luggage.room      11.827957
dtype: float64

## Step 3: Handle missing values (WITHOUT removing rows)

In [18]:
pd.crosstab(data['Type'], data['AirBags'])

AirBags,Driver & Passenger,Driver only
Type,Unnamed: 1_level_1,Unnamed: 2_level_1
Compact,3,10
Large,4,7
Midsize,8,13
Small,0,14
Sporty,4,9
Van,3,4


In [24]:
airbag_mode_by_type = (
    data.groupby('Type')['AirBags']
      .agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
)

In [25]:
data['AirBags'] = data['AirBags'].fillna(
    data['Type'].map(airbag_mode_by_type)
)

In [26]:
data['AirBags'] = data['AirBags'].fillna(data['AirBags'].mode()[0])

In [27]:
data['AirBags'].isna().sum()

np.int64(0)

In [39]:
data["Luggage.room"]=data["Luggage.room"].fillna(0)

In [40]:
data

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room
0,Acura,Integra,Small,15.9,25,Driver only,140,5,26.5,11.0
1,Acura,Legend,Midsize,33.9,18,Driver & Passenger,200,5,30.0,15.0
2,Audi,90,Compact,29.1,20,Driver only,172,5,28.0,14.0
3,Audi,100,Midsize,37.7,19,Driver & Passenger,172,6,31.0,17.0
4,BMW,535i,Midsize,30.0,22,Driver only,208,4,27.0,13.0
...,...,...,...,...,...,...,...,...,...,...
88,Volkswagen,Eurovan,Van,19.7,17,Driver & Passenger,109,7,34.0,0.0
89,Volkswagen,Passat,Compact,20.0,21,Driver & Passenger,134,5,31.5,14.0
90,Volkswagen,Corrado,Sporty,23.3,18,Driver & Passenger,178,4,26.0,15.0
91,Volvo,240,Compact,22.7,21,Driver only,114,5,29.5,14.0


In [41]:
data[data["Type"]=="Van"]

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room
15,Chevrolet,Lumina_APV,Van,16.3,18,Driver only,170,7,30.5,0.0
16,Chevrolet,Astro,Van,16.6,15,Driver only,165,8,33.5,0.0
25,Dodge,Caravan,Van,19.0,17,Driver only,142,7,26.5,0.0
35,Ford,Aerostar,Van,19.9,15,Driver only,145,7,30.0,0.0
55,Mazda,MPV,Van,19.1,18,Driver & Passenger,155,7,27.5,0.0
65,Nissan,Quest,Van,19.1,17,Driver only,151,7,27.0,0.0
69,Oldsmobile,Silhouette,Van,19.5,18,Driver & Passenger,170,7,30.5,0.0
86,Toyota,Previa,Van,22.7,18,Driver only,138,7,35.0,0.0
88,Volkswagen,Eurovan,Van,19.7,17,Driver & Passenger,109,7,34.0,0.0


In [43]:
data["Luggage.room"].isna().sum()

np.int64(0)

In [44]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 93 entries, 0 to 92
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Manufacturer    93 non-null     object 
 1   Model           93 non-null     object 
 2   Type            93 non-null     object 
 3   Price           93 non-null     float64
 4   MPG.city        93 non-null     int64  
 5   AirBags         93 non-null     object 
 6   Horsepower      93 non-null     int64  
 7   Passengers      93 non-null     int64  
 8   Rear.seat.room  91 non-null     float64
 9   Luggage.room    93 non-null     float64
dtypes: float64(3), int64(3), object(4)
memory usage: 7.4+ KB


In [47]:
data['Rear.seat.room'].isna().sum()

np.int64(2)

In [49]:
data=data.dropna()

In [50]:
data

Unnamed: 0,Manufacturer,Model,Type,Price,MPG.city,AirBags,Horsepower,Passengers,Rear.seat.room,Luggage.room
0,Acura,Integra,Small,15.9,25,Driver only,140,5,26.5,11.0
1,Acura,Legend,Midsize,33.9,18,Driver & Passenger,200,5,30.0,15.0
2,Audi,90,Compact,29.1,20,Driver only,172,5,28.0,14.0
3,Audi,100,Midsize,37.7,19,Driver & Passenger,172,6,31.0,17.0
4,BMW,535i,Midsize,30.0,22,Driver only,208,4,27.0,13.0
...,...,...,...,...,...,...,...,...,...,...
88,Volkswagen,Eurovan,Van,19.7,17,Driver & Passenger,109,7,34.0,0.0
89,Volkswagen,Passat,Compact,20.0,21,Driver & Passenger,134,5,31.5,14.0
90,Volkswagen,Corrado,Sporty,23.3,18,Driver & Passenger,178,4,26.0,15.0
91,Volvo,240,Compact,22.7,21,Driver only,114,5,29.5,14.0


In [51]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 0 to 92
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Manufacturer    91 non-null     object 
 1   Model           91 non-null     object 
 2   Type            91 non-null     object 
 3   Price           91 non-null     float64
 4   MPG.city        91 non-null     int64  
 5   AirBags         91 non-null     object 
 6   Horsepower      91 non-null     int64  
 7   Passengers      91 non-null     int64  
 8   Rear.seat.room  91 non-null     float64
 9   Luggage.room    91 non-null     float64
dtypes: float64(3), int64(3), object(4)
memory usage: 7.8+ KB


## Step 4: Fix incorrect data types

### Check dtypes

### Pandas methods

In [52]:
data.dtypes

Manufacturer       object
Model              object
Type               object
Price             float64
MPG.city            int64
AirBags            object
Horsepower          int64
Passengers          int64
Rear.seat.room    float64
Luggage.room      float64
dtype: object

In [53]:
data.duplicated().sum()

np.int64(0)