# Insurance Risk Data Analysis Project 🚀
#### To begin the project, I'll clean the provided dataset and perform exploratory data analysis (EDA) on it. This includes checking for data quality issues, visualizing the data, and extracting insights from it.

In [3]:
import io

import numpy as np
import pandas as pd

# Constants
# Location of the raw CSV, relative to the notebook's working directory.
file_path = '../data/imports-85.csv'

# Column labels applied to the CSV when it is loaded, in file order.
columns = [
    # risk rating and identity
    'symboling', 'normalized-losses', 'make',
    # categorical descriptors
    'fuel-type', 'aspiration', 'num-of-doors', 'body-style',
    'drive-wheels', 'engine-location',
    # dimensions and weight
    'wheel-base', 'length', 'width', 'height', 'curb-weight',
    # engine characteristics
    'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system',
    'bore', 'stroke', 'compression-ratio', 'horsepower', 'peak-rpm',
    # fuel economy and price
    'city-mpg', 'highway-mpg', 'price',
]

In [8]:
# Load data
# `names=columns` supplies the column labels, so pandas reads every line of
# the file as data instead of treating the first line as a header.
data = pd.read_csv(file_path, names=columns)

# Show basic information of the data
# DataFrame.info() writes its report to a stream and returns None, so
# `df_info = data.info()` followed by `print(df_info)` printed a stray "None".
# Capturing the report in a buffer gives `df_info` the actual summary text.
info_buffer = io.StringIO()
data.info(buf=info_buffer)
df_info = info_buffer.getvalue()
df_head = data.head()
print(df_info)
print(df_head)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

### Cleaning the Data

In [9]:
# Identify null values: per-column count of cells pandas already sees as missing
missing_values = data.isna().sum()
print(missing_values)

# Identify ? , | , - and single-space placeholder cells in the data:
# convert every cell that is exactly one of these tokens to NaN in a single
# pass, then recount the missing cells per column.
data = data.replace(['?', '-', '|', ' '], np.nan)
symbols_values = data.isna().sum()
print(symbols_values)

symboling            0
normalized-losses    0
make                 0
fuel-type            0
aspiration           0
num-of-doors         0
body-style           0
drive-wheels         0
engine-location      0
wheel-base           0
length               0
width                0
height               0
curb-weight          0
engine-type          0
num-of-cylinders     0
engine-size          0
fuel-system          0
bore                 0
stroke               0
compression-ratio    0
horsepower           0
peak-rpm             0
city-mpg             0
highway-mpg          0
price                0
dtype: int64
symboling             0
normalized-losses    41
make                  0
fuel-type             0
aspiration            0
num-of-doors          2
body-style            0
drive-wheels          0
engine-location       0
wheel-base            0
length                0
width                 0
height                0
curb-weight           0
engine-type           0
num-of-cylinders      0
engin

In [None]:
# Clean data

## Replace every cell equal to '?' with NaN (mutates `data` in place)
data.replace(to_replace='?', value=np.nan, inplace=True)