In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the dataset
# The file is whitespace-delimited and has no header row.
df = pd.read_csv('auto-mpg.data', sep=r'\s+', header=None)

# Step 2: Assign column names
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name'
]
df.columns = column_names

# Step 3: Review the data
# Only the last expression of a cell is rendered, so a bare `df.head()` in
# the middle of the cell shows nothing; print the preview explicitly.
print(df.head())

# Step 4: Handle missing values in 'horsepower'
# Missing readings are encoded as '?', which forces the column to be read as
# strings. Turn them into NaN, convert to float, then impute with the median.
df['horsepower'] = df['horsepower'].replace('?', np.nan).astype(float)
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

# Step 5: Convert 'origin' codes to region names
# Auto MPG encodes origin as 1 = American, 2 = European, 3 = Japanese.
# `map` turns every code that is not in the dictionary (and any missing
# value) into NaN, so a single `fillna` labels all of them 'Unknown'.
origin_labels = {1: 'USA', 2: 'Europe', 3: 'Asia'}
df['origin'] = df['origin'].map(origin_labels).fillna('Unknown')

# Step 6: Bar chart - Distribution of cylinders
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='cylinders', data=df, hue='origin', palette='viridis', ax=ax)
ax.set_xlabel("Number of Cylinders")
ax.set_ylabel("Count")
ax.set_title("Distribution of Cylinders by Origin")
plt.show()

# Step 7: Scatter plot - Relationship between horsepower and weight for each origin country
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    x='horsepower', y='weight', data=df,
    hue='origin', style='origin', palette='Set1', alpha=0.7, ax=ax,
)
ax.set_xlabel("Horsepower")
ax.set_ylabel("Weight")
ax.set_title("Horsepower vs Weight by Origin")
ax.grid(True)  # Add gridlines
ax.legend(title='Origin')  # Add legend with title
plt.show()

# Step 8: Answer an interesting question
# Example: How does MPG vary with the number of cylinders?
fig, ax = plt.subplots(figsize=(8, 5))
# hue mirrors x so that `palette` can be used without a seaborn warning.
sns.boxplot(x='cylinders', y='mpg', data=df, hue='cylinders', palette='coolwarm', ax=ax)
ax.set_xlabel("Number of Cylinders")
ax.set_ylabel("Miles Per Gallon (MPG)")
ax.set_title("MPG Distribution by Number of Cylinders")
plt.show()


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Step 1: Load the dataset
# The file is whitespace-delimited and has no header row.
df = pd.read_csv('auto-mpg.data', sep=r'\s+', header=None)

# Step 2: Assign column names
column_names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model_year', 'origin', 'car_name'
]
df.columns = column_names

# Step 3: Review the data and show the dataset information
print("Dataset Information:")
df.info()  # Column dtypes and non-null counts before any cleaning

# Step 4: Handle missing values in 'horsepower'
# `errors='coerce'` turns every non-numeric entry (including the '?'
# placeholder) into NaN, so no separate replace step is needed before it.
df['horsepower'] = pd.to_numeric(df['horsepower'], errors='coerce')
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())  # Impute with the median

# Verify the data type change after conversion
print("\nData Types After Conversion:")
df.info()  # 'horsepower' should now be reported as a float column

# Step 5: Convert 'origin' codes to region names
# Auto MPG encodes origin as 1 = American, 2 = European, 3 = Japanese.
# `map` turns every code that is not in the dictionary (and any missing
# value) into NaN, so a single `fillna` labels all of them 'Unknown'.
origin_labels = {1: 'USA', 2: 'Europe', 3: 'Asia'}
df['origin'] = df['origin'].map(origin_labels).fillna('Unknown')

# Step 6: Bar chart - Distribution of cylinders
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(x='cylinders', data=df, hue='origin', palette='viridis', ax=ax)
ax.set_xlabel("Number of Cylinders")
ax.set_ylabel("Count")
ax.set_title("Distribution of Cylinders by Origin")
plt.show()

# Step 7: Scatter plot - Relationship between horsepower and weight for each origin country
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(
    x='horsepower', y='weight', data=df,
    hue='origin', style='origin', palette='Set1', alpha=0.7, ax=ax,
)
ax.set_xlabel("Horsepower")
ax.set_ylabel("Weight")
ax.set_title("Horsepower vs Weight by Origin")
ax.grid(True)  # Add gridlines
ax.legend(title='Origin')  # Add legend with title
plt.show()

# Step 8: Answer an interesting question
# Example: How does MPG vary with the number of cylinders?
fig, ax = plt.subplots(figsize=(8, 5))
# hue mirrors x so that `palette` can be used without a seaborn warning.
sns.boxplot(x='cylinders', y='mpg', data=df, hue='cylinders', palette='coolwarm', ax=ax)
ax.set_xlabel("Number of Cylinders")
ax.set_ylabel("Miles Per Gallon (MPG)")
ax.set_title("MPG Distribution by Number of Cylinders")
plt.show()
