IMPORTING PANDAS

In [2]:
# Importing pandas library as pd
import pandas as pd

WORKING WITH SERIES AND DATAFRAMES

In [3]:
# A Series is pandas' 1-dimensional labelled array
series = pd.Series(data=["BMW", "Toyota", "Honda"])
series

0       BMW
1    Toyota
2     Honda
dtype: object

In [4]:
# A second Series, this time holding colour names
colors = pd.Series(data=["Red", "Blue", "White"])
colors

0      Red
1     Blue
2    White
dtype: object

In [5]:
# A DataFrame is 2-dimensional: each Series passed in becomes one named column
car_data = pd.DataFrame(
    {
        "Car make": series,
        "Color": colors,
    }
)
car_data

Unnamed: 0,Car make,Color
0,BMW,Red
1,Toyota,Blue
2,Honda,White


IMPORTING AND EXPORTING DATAFRAMES

In [6]:
# Importing car sales
# pd.read_csv() loads the CSV file (path is relative to the notebook's working directory)
# into a DataFrame
car_sales = pd.read_csv("car-sales.csv")
car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [7]:
# Exporting car sales to a new CSV file
car_sales.to_csv("exported-car-sales.csv", index=False)

# index=False leaves the row index out of the file, so reading the file back in
# does not produce an extra, unnamed index column.

In [8]:
# Importing exported-car-sales.csv (the file written by the export cell)
# to check that it round-trips with the same columns and rows
exported_car_sales = pd.read_csv("exported-car-sales.csv")
exported_car_sales

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [9]:
# pd.read_csv() also accepts a URL, so a CSV can be loaded straight from the web.
# To get the link: go to course content, find heart disease data, select raw and copy the url
heart_disease_url = "https://raw.githubusercontent.com/mrdbourke/zero-to-mastery-ml/master/data/heart-disease.csv"
heart_disease = pd.read_csv(heart_disease_url)

# .head(n) shows the first n rows of a DataFrame (5 when n is omitted)
heart_disease.head(3)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1


ATTRIBUTES VS FUNCTIONS

A variable stored in an instance or class is called an attribute. A function stored in an instance or class is called a method.

Functions (methods) carry out some type of operation on the data, while attributes hold "meta" information about it.

\
**Attributes** are NOT followed by parentheses.

```
car_sales.dtypes
```

\
**Functions** are followed by parentheses.

```
car_sales.to_csv()
```

ATTRIBUTES

In [10]:
# .dtypes
# returns the dtype of each column
# (text columns show up as object; Price is object too because its values
# are formatted strings such as "$4,000.00", not numbers)
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object

In [11]:
# .columns
# returns the column labels as a pandas Index (not a plain Python list)
car_sales.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [12]:
# .index
# returns the row index of the DataFrame; here it is a RangeIndex,
# whose repr shows its start, stop and step
car_sales.index

RangeIndex(start=0, stop=10, step=1)

FUNCTIONS

In [13]:
# .describe()
# returns summary statistics (count, mean, std, min, 25%/50%/75% percentiles, max)
# for our numeric columns; the non-numeric columns are left out
car_sales.describe()

Unnamed: 0,Odometer (KM),Doors
count,10.0,10.0
mean,78601.4,4.0
std,61983.471735,0.471405
min,11179.0,3.0
25%,35836.25,4.0
50%,57369.0,4.0
75%,96384.5,4.0
max,213095.0,5.0


In [16]:
# The 25%, 50% and 75% rows of .describe() are percentiles.
# A percentile is the value at or below which the given percentage of the
# population falls, so the 75th percentile is an upper bound ("at most"),
# not a lower bound, for 75% of the rows.
import numpy as np
print(f'75% of the cars in our dataset have at most {np.percentile(car_sales["Doors"], 75)} doors')

75% of the cars in our dataset have at least 4.0 doors


In [17]:
# .info()
# prints a summary of the DataFrame: index range, each column's non-null count
# and dtype, and the memory usage
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 528.0+ bytes


In [18]:
# .mean()
# returns the average of the numerical columns
# numeric_only=True is required on pandas >= 2.0: the default is now False, and
# averaging the string columns (Make, Colour, Price) would raise a TypeError.
car_sales.mean(numeric_only=True)

Odometer (KM)    78601.4
Doors                4.0
dtype: float64

In [19]:
# A Series has .mean() too: it averages the Series' own values
car_prices = pd.Series(data=[3000, 1500, 111250])
car_prices.mean()

38583.333333333336

In [16]:
# .sum()
# returns the sum of each column's values
# numeric columns are added up; string (object) columns are concatenated,
# which is why Make, Colour and Price come out as one long string
car_sales.sum()

Make             ToyotaHondaToyotaBMWNissanToyotaHondaHondaToyo...
Colour               WhiteRedBlueBlackWhiteGreenBlueBlueWhiteWhite
Odometer (KM)                                               786014
Doors                                                           40
Price            $4,000.00$5,000.00$7,000.00$22,000.00$3,500.00...
dtype: object

In [17]:
# Selecting a single column first gives the sum of just that column
car_sales["Doors"].sum()

40

In [18]:
# len()
# Python's built-in len() returns the number of rows in the DataFrame
len(car_sales)

10