In [1]:
import numpy as np
import pandas as pd


In [3]:

# Absolute path of the mtcars CSV file.
address = '/content/sample_data/mtcars.csv'

# Load the file into a DataFrame.
cars = pd.read_csv(address)

# Overwrite the header with explicit names (one per column in the file).
cars.columns = [
    'car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt',
    'qsec', 'vs', 'am', 'gear', 'carb', 'extra',
]

# Label the rows by car name. The 'car_names' column is kept as a regular
# column too, so the name appears both in the index and in the data.
cars.index = cars['car_names']

# Preview: positional slice of every row and the first 12 columns
# (positions 0-11), limited to at most the first 15 rows.
cars.iloc[:, :12].head(15)

Unnamed: 0_level_0,car_names,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
car_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
Mazda RX4,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2
Valiant,Valiant,18.1,6,225.0,105,2.76,3.46,20.22,1,0,3,1
Duster 360,Duster 360,14.3,8,360.0,245,3.21,3.57,15.84,0,0,3,4
Merc 240D,Merc 240D,24.4,4,146.7,62,3.69,3.19,20.0,1,0,4,2
Merc 230,Merc 230,22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2
Merc 280,Merc 280,19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4


In [4]:
# Pull the 'carb' column out of `cars` as a Series, then tally how many
# rows share each distinct value.
carb = cars['carb']
carb.value_counts()

carb
4    10
2    10
1     7
3     3
6     1
8     1
Name: count, dtype: int64

# Extracting Specific Columns and Analysing

In [5]:

# Keep only the five columns analysed below, then show the first five rows.
cars_cat = cars.loc[:, ['cyl', 'vs', 'am', 'gear', 'carb']]
cars_cat.head(5)

Unnamed: 0_level_0,cyl,vs,am,gear,carb
car_names,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Mazda RX4,6,0,1,4,4
Mazda RX4 Wag,6,0,1,4,4
Datsun 710,4,1,1,4,1
Hornet 4 Drive,6,1,0,3,1
Hornet Sportabout,8,0,0,3,2


In [7]:
# Build a GroupBy object that partitions cars_cat by the 'gear' column.
gears_group = cars_cat.groupby(by='gear')


In [8]:
# For every gear group, report summary statistics (count, mean, std, min,
# quartiles, max) of each of the other columns.
gears_group.describe()

Unnamed: 0_level_0,cyl,cyl,cyl,cyl,cyl,cyl,cyl,cyl,vs,vs,...,am,am,carb,carb,carb,carb,carb,carb,carb,carb
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
gear,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
3,15.0,7.466667,1.187234,4.0,8.0,8.0,8.0,8.0,15.0,0.2,...,0.0,0.0,15.0,2.666667,1.175139,1.0,2.0,3.0,4.0,4.0
4,12.0,4.666667,0.984732,4.0,4.0,4.0,6.0,6.0,12.0,0.833333,...,1.0,1.0,12.0,2.333333,1.302678,1.0,1.0,2.0,4.0,4.0
5,5.0,6.0,2.0,4.0,4.0,6.0,8.0,8.0,5.0,0.2,...,1.0,1.0,5.0,4.4,2.607681,2.0,2.0,4.0,6.0,8.0


# The categorical data type in pandas is used for:
- Columns with a limited set of possible values (like gear numbers in this case).
- More efficient storage for repeated string values.
- Improved performance for certain operations.


In [10]:
# Add a new column named 'group' to the cars DataFrame.
# cars['gear'] selects the 'gear' column; .astype('category') returns a copy
# of it stored with the categorical dtype, which is assigned to 'group'.
cars['group'] = cars['gear'].astype('category')

# Display the new column.
cars['group']

car_names
Mazda RX4              4
Mazda RX4 Wag          4
Datsun 710             4
Hornet 4 Drive         3
Hornet Sportabout      3
Valiant                3
Duster 360             3
Merc 240D              4
Merc 230               4
Merc 280               4
Merc 280C              4
Merc 450SE             3
Merc 450SL             3
Merc 450SLC            3
Cadillac Fleetwood     3
Lincoln Continental    3
Chrysler Imperial      3
Fiat 128               4
Honda Civic            4
Toyota Corolla         4
Toyota Corona          3
Dodge Challenger       3
AMC Javelin            3
Camaro Z28             3
Pontiac Firebird       3
Fiat X1-9              4
Porsche 914-2          5
Lotus Europa           5
Ford Pantera L         5
Ferrari Dino           5
Maserati Bora          5
Volvo 142E             4
Name: group, dtype: category
Categories (3, int64): [3, 4, 5]

In [11]:
# Inspect the dtype of the 'group' column (its categories and ordering flag).
cars['group'].dtype

CategoricalDtype(categories=[3, 4, 5], ordered=False)

In [14]:
# Count how often each distinct value occurs in the Series
# (here, the new categorical 'group' column derived from gear).
cars.loc[:, 'group'].value_counts()

group
3    15
4    12
5     5
Name: count, dtype: int64

In [15]:
# Cross-tabulate (contingency table) transmission type against gear count:
# rows are the 'am' values, columns are the 'gear' values, and each cell is
# the number of cars with that combination.
pd.crosstab(index=cars['am'], columns=cars['gear'])

gear,3,4,5
am,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,15,4,0
1,0,8,5


In [None]:
"""
gear   3   4   5
am
0     15   4   0
1      0   8   5

Interpretation of the Example Output
Rows: Represent the 'am' column (transmission type).
        0 means automatic transmission.
        1 means manual transmission.
Columns: Represent the 'gear' column (number of gears).
        3, 4, 5 are the possible numbers of gears.
For example, there are 15 cars with automatic transmission and 3 gears.
There are 8 cars with manual transmission and 4 gears.

"""