In [1]:
## Installing Library

!pip install sweetviz

In [2]:
## Importing libraries

import pandas as pd
import numpy as np
import sweetviz as sv

In [3]:
## Loading Data

# Data URL from UCI repository
auto_data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data'

# List of columns
column_names = ['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration', 'num-of-doors', 'body-style', 
                'drive-wheels', 'engine-location', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 
                'engine-type', 'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke', 'compression-ratio', 
                'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

# Reading data as pandas dataframe
df = pd.read_csv(auto_data_url, names=column_names)
df = df.replace({'?': None})

In [4]:
## Generating EDA report (or Analyzing the dataframe)
my_report = sv.analyze(df)

                                             |                                             | [  0%]   00:00 ->…

In [5]:
## Downloading and showing the generated EDA report as HTML in external tab

my_report.show_html(filepath='EDA_Report.html', 
                    open_browser=True, 
                    layout='widescreen', 
                    scale=None)

Report EDA_Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [6]:
## Displating EDA report in notebook itself

my_report.show_notebook(w=None, 
                        h=None, 
                        scale=None,
                        layout='widescreen',
                        filepath=None)

In [7]:
## Creating a copy of dataframe
df_new = df.copy()

## Replacing None value in 'price' column to -999
df_new['price'] = df_new['price'].replace({None: '-999'})

## Converting price column from string to int
df_new['price'] = df_new['price'].astype(str).astype(int)

## Replacing -999 value in price column to Null
df_new.loc[df_new['price'] == -999, 'price'] = np.nan

## Splitting  the dataframe into 2 dataframes (Train and Test)
df_train = df_new[0:150]
df_test = df_new[150:]

## Imputing missing values in "price" column with mean
df_train['price'] = df_train['price'].fillna(df_train['price'].mean()).astype(int)
df_test['price'] = df_test['price'].fillna(df_test['price'].mean()).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train['price'] = df_train['price'].fillna(df_train['price'].mean()).astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['price'] = df_test['price'].fillna(df_test['price'].mean()).astype(int)


In [8]:
## Comparing two dataframes (e.g. Test vs Training sets)
comparison_report = sv.compare([df_train,'Train'], [df_test,'Test'], target_feat='price')

                                             |                                             | [  0%]   00:00 ->…

In [9]:
comparison_report.show_notebook()