In [3]:
import pandas as pd


## The two main data types
1. Series -> a 1-dimensional labelled array (like a single column)
2. DataFrame -> a 2-dimensional labelled table (like a dictionary of Series, one per column)

In [4]:
# Series 1: a one-dimensional labelled column built from a Python list
colors = pd.Series(data=['Red', 'Blue', 'Green'])
colors

0      Red
1     Blue
2    Green
dtype: object

In [5]:
# Series 2: car brands, indexed 0..2 by default
cars = pd.Series(data=['Mercedes', 'Tesla', 'Honda'])
cars


0    Mercedes
1       Tesla
2       Honda
dtype: object

In [7]:
# DataFrame = two Series side by side, one per named column
car_data = pd.DataFrame(dict(Brand=cars, Colours=colors))
car_data

Unnamed: 0,Brand,Colours
0,Mercedes,Red
1,Tesla,Blue
2,Honda,Green



## Importing Data (CSV) into DataFrames

In [17]:
# Load the CSV file into a DataFrame; pandas adds a default integer row index
car_sales = pd.read_csv('car-sales.csv')
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"



## Anatomy of a DataFrame

![](pandas-anatomy-of-a-dataframe.png)


## Exporting a DataFrame
A DataFrame can be exported to .csv format using `.to_csv()` or to a spreadsheet format using `.to_excel()`.

In [10]:
# By default to_csv also writes the row index as an unnamed first column
car_sales.to_csv('exported-car-sales.csv')

In [11]:
# Re-import the exported CSV file; the saved index comes back as an extra 'Unnamed: 0' column
reimport = pd.read_csv('exported-car-sales.csv')
reimport

Unnamed: 0.1,Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,0,Toyota,White,150043,4,"$4,000.00"
1,1,Honda,Red,87899,4,"$5,000.00"
2,2,Toyota,Blue,32549,3,"$7,000.00"
3,3,BMW,Black,11179,5,"$22,000.00"
4,4,Nissan,White,213095,4,"$3,500.00"
5,5,Toyota,Green,99213,4,"$4,500.00"
6,6,Honda,Blue,45698,4,"$7,500.00"
7,7,Honda,Blue,54738,4,"$7,000.00"
8,8,Toyota,White,60000,4,"$6,250.00"
9,9,Nissan,White,31600,4,"$9,700.00"


As you can see above, exporting added the index as an extra column in the CSV file by default. We can export the DataFrame without this additional index column.

In [13]:
# Exporting the DataFrame without the additional index column:
# index=False tells to_csv not to write the row index
car_sales.to_csv('new-exported-carSales.csv', index=False)

In [16]:
# Re-import the file written with index=False: no extra index column this time
exported_without_index = pd.read_csv('new-exported-carSales.csv')
exported_without_index

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"



## Data from URLs

If we don't have the CSV file stored locally, pandas can also read it directly from a URL.
1. Use the direct URL (or Google Sheets link) if the file is hosted online.
2. If reading the CSV file from GitHub, use the 'Raw' link of the file.

In [4]:
# read_csv accepts a URL as well as a local path (requires network access);
# this dataset has missing values in several columns
missing_car_sales = pd.read_csv('https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/refs/heads/master/data/car-sales-missing-data.csv')
missing_car_sales

Unnamed: 0,Make,Colour,Odometer,Doors,Price
0,Toyota,White,150043.0,4.0,"$4,000"
1,Honda,Red,87899.0,4.0,"$5,000"
2,Toyota,Blue,,3.0,"$7,000"
3,BMW,Black,11179.0,5.0,"$22,000"
4,Nissan,White,213095.0,4.0,"$3,500"
5,Toyota,Green,,4.0,"$4,500"
6,Honda,,,4.0,"$7,500"
7,Honda,Blue,,4.0,
8,Toyota,White,60000.0,,
9,,White,31600.0,4.0,"$9,700"



## Describing Data
How to describe the data in a DataFrame (data types, columns, index and summary statistics)

In [9]:
# `dtypes` is an attribute, so it is accessed without parentheses
car_sales.dtypes

# Methods (functions) are called with parentheses, e.g. car_sales.describe()

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [10]:
# `columns` is an attribute holding the Index of column labels
car_columns = car_sales.columns
car_columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [11]:
# The row index of the DataFrame (a default RangeIndex here)
car_sales.index

RangeIndex(start=0, stop=10, step=1)

In [13]:
car_sales.describe()
# describe() only summarises the numeric columns by default

Unnamed: 0,Odometer (KM),Doors
count,10.0,10.0
mean,78601.4,4.0
std,61983.471735,0.471405
min,11179.0,3.0
25%,35836.25,4.0
50%,57369.0,4.0
75%,96384.5,4.0
max,213095.0,5.0


In [14]:
# info() prints a concise summary: index range, per-column non-null counts and dtypes, memory usage
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 532.0+ bytes


In [20]:
# sum() adds up numeric columns but concatenates the strings in object columns
# (Make, Colour, Price); use car_sales.sum(numeric_only=True) to total only the numeric ones
car_sales.sum()

Make             ToyotaHondaToyotaBMWNissanToyotaHondaHondaToyo...
Colour               WhiteRedBlueBlackWhiteGreenBlueBlueWhiteWhite
Odometer (KM)                                               786014
Doors                                                           40
Price            $4,000.00$5,000.00$7,000.00$22,000.00$3,500.00...
dtype: object

In [21]:
# Total of a single numeric column
car_sales['Doors'].sum()

np.int64(40)

In [22]:
# Average of a single numeric column
car_sales['Doors'].mean()

np.float64(4.0)

In [23]:
# len() of a DataFrame is its number of rows
len(car_sales)

10