In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy
import seaborn as sns

In [None]:
# Load the cars dataset. Later cells in this notebook treat '?' as the file's
# missing-value placeholder, so register it as an NA marker at read time:
# affected numeric columns then load as numbers with NaN instead of strings,
# and df.dtypes / df.describe() report them correctly.
df = pd.read_csv('cars_data.csv', na_values='?')

# 1. Understanding the Dataset


In [None]:
# Check the size of the dataset: shape is a (number of rows, number of columns) tuple.
df.shape

In [None]:
# Preview the first rows (head() returns five by default) to see the columns and typical values.
df.head()

In [None]:
# Inspect the dtype pandas inferred for each column. A column that should be
# numeric but shows up as object contains at least one non-numeric entry.
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the dataset.
# For a frame with mixed dtypes, describe() reports only the numeric columns
# by default, so object-dtype columns do not appear in this table.
df.describe()

# 2. Descriptive analysis - Univariate analysis

In [None]:
# Coerce price to a numeric dtype; entries that cannot be parsed (such as '?') become NaN
df['price'] = pd.to_numeric(df['price'], errors='coerce')

# Keep only the known prices for this plot
price_clean = df['price'].dropna()

# Histogram of car prices (30 bins) with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(price_clean, kde=True, bins=30, ax=ax)
ax.set(title='Distribution of Car Prices', xlabel='Price', ylabel='Frequency')
plt.show()

In [None]:
# Tally how many rows each make has (value_counts sorts most frequent first)
make_counts = df['make'].value_counts()

# Horizontal bar chart: one bar per make, bar length = number of rows
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=make_counts.values, y=make_counts.index, orient='h', ax=ax)
ax.set(title='Frequency of Car Makes', xlabel='Frequency', ylabel='Make')
plt.show()

# 3. Descriptive analysis - Bivariate analysis

In [None]:
# Make sure both plotted columns are numeric; unparseable entries such as '?' become NaN
for column in ('price', 'engine-size'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Keep only the rows where both values are present
df_clean = df.dropna(subset=['price', 'engine-size'])

# Scatter plot of price against engine size
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_clean, x='engine-size', y='price', ax=ax)
ax.set(title='Engine Size vs. Price', xlabel='Engine Size', ylabel='Price')
plt.show()

In [None]:
# Make sure both plotted columns are numeric; unparseable entries such as '?' become NaN
for column in ('horsepower', 'peak-rpm'):
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Keep only the rows where both values are present
df_clean = df.dropna(subset=['horsepower', 'peak-rpm'])

# Scatter plot of peak RPM against horsepower
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=df_clean, x='horsepower', y='peak-rpm', ax=ax)
ax.set(title='Horsepower vs. Peak RPM', xlabel='Horsepower', ylabel='Peak RPM')
plt.show()

# 4. Descriptive analysis - Multivariate analysis

In [None]:
# Columns to correlate. The frame is rebuilt from df instead of reusing
# df_clean: at this point df_clean was only filtered on horsepower/peak-rpm,
# so it can still hold NaN prices and is not the NaN-free subset this
# analysis is meant to use.
corr_columns = ['horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price']

# Coerce every selected column to numeric (non-numeric entries such as '?'
# become NaN) and keep only rows complete across all five columns, so every
# coefficient is computed from the same set of rows.
corr_data = df[corr_columns].apply(pd.to_numeric, errors='coerce').dropna()
correlation_matrix = corr_data.corr()

# Annotated heatmap of the pairwise correlation coefficients
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, ax=ax)
ax.set_title('Correlation Matrix for Selected Car Attributes')
plt.show()

In [None]:
# Build the plotting frame from df rather than reusing df_clean: that frame
# was filtered on horsepower/peak-rpm, which this chart does not use, and it
# was never filtered on price. Here rows are dropped only when one of the
# three plotted columns is missing.
price_by_body = (
    df.assign(price=pd.to_numeric(df['price'], errors='coerce'))
      .dropna(subset=['price', 'body-style', 'drive-wheels'])
)

# Bar height is the mean price (barplot's default estimator) for each body
# style, with one bar per drive-wheels category.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='body-style', y='price', hue='drive-wheels', data=price_by_body, ax=ax)
ax.set(title='Price by Body Style and Drive Wheels', xlabel='Body Style', ylabel='Price')
ax.legend(title='Drive Wheels')
plt.show()