<a href="https://colab.research.google.com/github/angelo54425/practice-repo/blob/main/MLPipeline_Week1_Session1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import requests
from io import StringIO



In [2]:
""" Loading the data """

# Download the automobile dataset and load it into a DataFrame.
url = "https://pynative.com/wp-content/uploads/2019/01/Automobile_data.csv"

# The request is sent with a browser-like User-Agent header (presumably the
# host rejects the default client string -- confirm). The timeout keeps the
# cell from hanging indefinitely if the server never answers.
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers, timeout=30)
# Fail loudly on an HTTP error status (403, 404, 5xx, ...) instead of handing
# an error page to the CSV parser.
response.raise_for_status()

# Parse the CSV text, then summarise columns, dtypes and non-null counts.
df = pd.read_csv(StringIO(response.text))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61 entries, 0 to 60
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   index             61 non-null     int64  
 1   company           61 non-null     object 
 2   body-style        61 non-null     object 
 3   wheel-base        61 non-null     float64
 4   length            61 non-null     float64
 5   engine-type       61 non-null     object 
 6   num-of-cylinders  61 non-null     object 
 7   horsepower        61 non-null     int64  
 8   average-mileage   61 non-null     int64  
 9   price             58 non-null     float64
dtypes: float64(3), int64(3), object(4)
memory usage: 4.9+ KB


In [3]:
# Show the name of the first column and the names of the last five columns
column_labels = df.columns
print("The first column is:", column_labels[0])
print("The last 5 columns are:", column_labels[-5:])

The first column is: index
The last 5 columns are: Index(['engine-type', 'num-of-cylinders', 'horsepower', 'average-mileage',
       'price'],
      dtype='object')


In [4]:
# Find the row with the smallest price (idxmin ignores missing prices by
# default) and report its body style, company, price and 'index' value.
cheapest_label = df['price'].idxmin()
most_afforadable_car = df.loc[cheapest_label]

report_fields = ('body-style', 'company', 'price', 'index')
print("The most affordable car is a {} from {} priced at ${:.2f}; Index number {}".format(
    *(most_afforadable_car[field] for field in report_fields)
))

The most affordable car is a hatchback from chevrolet priced at $5151.00; Index number 16


In [5]:
# Summary statistics of the price column, followed by its number of missing values
price_column = df['price']
print(price_column.describe())
print("\n", price_column.isna().sum())

count       58.000000
mean     15387.000000
std      11320.259841
min       5151.000000
25%       6808.500000
50%      11095.000000
75%      18120.500000
max      45400.000000
Name: price, dtype: float64

 3


In [6]:
# Identify missing values, then impute them.
# Count the missing entries in every column before imputation.
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Fill each missing price with the mean price of the SAME company (not the
# column-wide mean), so an imputed value stays within that manufacturer's
# price range.
overall_mean_price = df['price'].mean()  # computed from the observed prices only
company_mean_price = df.groupby('company')['price'].transform('mean')
df['price'] = (
    df['price']
    .fillna(company_mean_price)
    # A company without any known price has a NaN group mean; fall back to the
    # overall mean so that no missing value is silently left behind.
    .fillna(overall_mean_price)
)

# Confirm that no missing values remain.
missing_values_after = df.isnull().sum()
print("Missing values after replacement:\n", missing_values_after)

Missing values in each column:
 index               0
company             0
body-style          0
wheel-base          0
length              0
engine-type         0
num-of-cylinders    0
horsepower          0
average-mileage     0
price               3
dtype: int64
Missing values after replacement:
 index               0
company             0
body-style          0
wheel-base          0
length              0
engine-type         0
num-of-cylinders    0
horsepower          0
average-mileage     0
price               0
dtype: int64


In [7]:
# Locate the row holding the maximum price and report its body style,
# company name and price.
priciest_label = df['price'].idxmax()
most_expensive_car = df.loc[priciest_label]

expensive_template = "The most expensive car is a {} from {} priced at ${:.2f}"
print(expensive_template.format(
    most_expensive_car['body-style'],
    most_expensive_car['company'],
    most_expensive_car['price'],
))

The most expensive car is a hardtop from mercedes-benz priced at $45400.00


In [8]:
# Report the cheapest car: body style, company name, price and 'index' value.
most_afforadable_car = df.loc[df['price'].idxmin()]
affordable_template = "The most affordable car is a {} from {} priced at ${:.2f}; Index number {}"
print(affordable_template.format(
    most_afforadable_car['body-style'],
    most_afforadable_car['company'],
    most_afforadable_car['price'],
    most_afforadable_car['index'],
))

# Look up the car whose 'index' column equals 63 and print its price.
# NOTE: if that row's price was missing in the raw data, the value printed
# here is whatever an earlier cell wrote into df['price'].
car_x = df.loc[df['index'] == 63]
if car_x.empty:
    print("No car found with index 63.")
else:
    print("Price of the car with index 63:", car_x.iloc[0]['price'])

The most affordable car is a hatchback from chevrolet priced at $5151.00; Index number 16
Price of the car with index 63: 35528.0


In [9]:
# Number of cars listed for each company, most frequent company first
company_column = df['company']
car_count_per_company = company_column.value_counts()
print("Total cars per company:\n", car_count_per_company)

Total cars per company:
 company
toyota           7
bmw              6
mazda            5
nissan           5
mercedes-benz    4
audi             4
volkswagen       4
mitsubishi       4
chevrolet        3
jaguar           3
isuzu            3
honda            3
porsche          3
alfa-romero      3
dodge            2
volvo            2
Name: count, dtype: int64


In [10]:
# For every company, pick the row label of its most expensive car and show
# that car's company, body style and price.
price_by_company = df.groupby('company')['price']
highest_priced_cars = price_by_company.idxmax()
highest_columns = ['company', 'body-style', 'price']
print("Highest priced cars per company:\n", df.loc[highest_priced_cars, highest_columns])


Highest priced cars per company:
           company   body-style    price
1     alfa-romero  convertible  16500.0
6            audi        wagon  18920.0
11            bmw        sedan  41315.0
15      chevrolet        sedan   6575.0
16          dodge    hatchback   6377.0
19          honda        sedan  12945.0
21          isuzu        sedan   6785.0
26         jaguar        sedan  36000.0
31          mazda        sedan  18344.0
35  mercedes-benz      hardtop  45400.0
39     mitsubishi        sedan   8189.0
44         nissan        sedan  13499.0
46        porsche  convertible  37028.0
54         toyota        wagon  15750.0
58     volkswagen        sedan   9995.0
60          volvo        wagon  13415.0


In [11]:
# For every company, pick the row label of its cheapest car and show that
# car's company, body style and price.
grouped_prices = df.groupby('company')['price']
lowest_priced_cars = grouped_prices.idxmin()
lowest_columns = ['company', 'body-style', 'price']
print("\nLowest priced cars per company:\n", df.loc[lowest_priced_cars, lowest_columns])


Lowest priced cars per company:
           company   body-style    price
0     alfa-romero  convertible  13495.0
3            audi        sedan  13950.0
7             bmw        sedan  16430.0
13      chevrolet    hatchback   5151.0
17          dodge    hatchback   6229.0
18          honda        wagon   7295.0
21          isuzu        sedan   6785.0
24         jaguar        sedan  32250.0
27          mazda    hatchback   5195.0
32  mercedes-benz        sedan  25552.0
36     mitsubishi    hatchback   5389.0
41         nissan        sedan   6649.0
45        porsche      hardtop  34028.0
48         toyota    hatchback   5348.0
55     volkswagen        sedan   7775.0
59          volvo        sedan  12940.0


In [18]:
# Mean of the 'average-mileage' column per company, printed to three decimals
mileage_by_company = df.groupby('company')['average-mileage']
average_mileage_per_company = mileage_by_company.mean()
print("\nAverage mileage per company:\n", average_mileage_per_company.round(3))


Average mileage per company:
 company
alfa-romero      20.333
audi             20.000
bmw              19.000
chevrolet        41.000
dodge            31.000
honda            26.333
isuzu            33.333
jaguar           14.333
mazda            28.000
mercedes-benz    18.000
mitsubishi       29.500
nissan           31.400
porsche          17.000
toyota           28.714
volkswagen       31.750
volvo            23.000
Name: average-mileage, dtype: float64


In [24]:
# Order all cars from cheapest to most expensive and show the ten cheapest
sorted_cars_ascending = df.sort_values('price')
cheapest_preview_columns = ['company', 'body-style', 'price']
print("\nSorted cars by price:\n", sorted_cars_ascending[cheapest_preview_columns].head(10))


Sorted cars by price:
        company body-style   price
13   chevrolet  hatchback  5151.0
27       mazda  hatchback  5195.0
48      toyota  hatchback  5348.0
36  mitsubishi  hatchback  5389.0
28       mazda  hatchback  6095.0
37  mitsubishi  hatchback  6189.0
17       dodge  hatchback  6229.0
14   chevrolet  hatchback  6295.0
49      toyota  hatchback  6338.0
16       dodge  hatchback  6377.0


In [25]:
# Order all cars from most expensive to cheapest and show the ten priciest
sorted_cars_descending = df.sort_values('price', ascending=False)
priciest_preview_columns = ['company', 'body-style', 'price']
print("\nSorted cars by price:\n", sorted_cars_descending[priciest_preview_columns].head(10))


Sorted cars by price:
           company   body-style    price
35  mercedes-benz      hardtop  45400.0
11            bmw        sedan  41315.0
34  mercedes-benz        sedan  40960.0
46        porsche  convertible  37028.0
12            bmw        sedan  36880.0
26         jaguar        sedan  36000.0
25         jaguar        sedan  35550.0
47        porsche    hatchback  35528.0
45        porsche      hardtop  34028.0
24         jaguar        sedan  32250.0
