# Introduction To Pandas

## What is Pandas?

Pandas is used in data science to aid with the visualisation of data but, more importantly, with modifying and cleaning data for use with other tools, such as NumPy.

In [4]:
# --- Import pandas
import pandas as pd

## Data Types

There are two main data types in Pandas:
- Series: One-dimensional (basically a list/array of single values)
- DataFrame: Two-dimensional (rows and columns)

In [7]:
# --- Build a one-dimensional pandas Series holding three car brands:
series = pd.Series(
    data=["BMW", "Toyota", "Honda"],
)

# --- Print the Series (index labels on the left, values on the right):
print(series)

0       BMW
1    Toyota
2     Honda
dtype: object


In [8]:
# --- Build a second Series, this time holding three colours:
colours = pd.Series(
    data=["Red", "Blue", "White"],
)

# --- Print the Series (index labels on the left, values on the right):
print(colours)

0      Red
1     Blue
2    White
dtype: object


In [9]:
# --- Create a dataframe from the two series created previously
# --- (each dictionary key becomes a column name):
car_data = pd.DataFrame({"car": series, "colour": colours})
print(car_data)

      car colour
0     BMW    Red
1  Toyota   Blue
2   Honda  White


## Importing Data

Rather than creating a DF from a number of series, you can import data from a file, such as a CSV file, and create a DF from that file's contents.

In [5]:
# --- Read a CSV file (the path is relative to the current working directory),
# --- create a DF from it and display the values:
car_sales = pd.read_csv("./car-sales.csv")
print(car_sales)

     Make Colour  Odometer (KM)  Doors       Price
0  Toyota  White         150043      4   $4,000.00
1   Honda    Red          87899      4   $5,000.00
2  Toyota   Blue          32549      3   $7,000.00
3     BMW  Black          11179      5  $22,000.00
4  Nissan  White         213095      4   $3,500.00
5  Toyota  Green          99213      4   $4,500.00
6   Honda   Blue          45698      4   $7,500.00
7   Honda   Blue          54738      4   $7,000.00
8  Toyota  White          60000      4   $6,250.00
9  Nissan  White          31600      4   $9,700.00


## Exporting Data

You can export to a number of formats, including CSV, Excel, HTML and JSON.

In [13]:
# --- Export a DF to a CSV file (index=False leaves the row index out of the file):
car_sales.to_csv("./car-sales-export.csv", index=False)

## Describing Data

You can describe a number of features of a DF, including the data types and column names, to name just a few.

In [8]:
# --- Show each column's data type:
print(car_sales.dtypes)

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price            object
dtype: object


In [7]:
# --- Show all the column names (returned as a pandas Index object):
print(car_sales.columns)

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [9]:
# --- Show the index (the row labels) used by the DF:
print(car_sales.index)

RangeIndex(start=0, stop=10, step=1)


In [14]:
# --- Compute summary statistics (count, mean, std, min, quartiles, max) for the numeric columns in the DF
# --- (Price is stored as text, so it is not included):
print(car_sales.describe())

       Odometer (KM)      Doors
count      10.000000  10.000000
mean    78601.400000   4.000000
std     61983.471735   0.471405
min     11179.000000   3.000000
25%     35836.250000   4.000000
50%     57369.000000   4.000000
75%     96384.500000   4.000000
max    213095.000000   5.000000


In [16]:
# --- Show a concise summary of the DF: the index range, each column's non-null count and dtype,
# --- and the memory usage. The non-null counts make missing values easy to spot at a glance.
# --- info() prints the report itself and returns None, so it is called without print()
# --- to avoid a stray "None" line at the end of the output:
car_sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Make           10 non-null     object
 1   Colour         10 non-null     object
 2   Odometer (KM)  10 non-null     int64 
 3   Doors          10 non-null     int64 
 4   Price          10 non-null     object
dtypes: int64(2), object(3)
memory usage: 528.0+ bytes
None


In [35]:
# --- Get the mean of every numeric column (numeric_only=True skips text columns such as Price):
car_sales.mean(numeric_only=True)

Odometer (KM)    78601.4
Doors                4.0
dtype: float64

In [34]:
# --- Alternatively, you can get the mean for the Odometer (KM) and Doors columns
# --- by specifying the column name to use:
print(f'Odometer (KM): {car_sales["Odometer (KM)"].mean()}')
print(f'Doors: {car_sales["Doors"].mean()}')

Odometer (KM): 78601.4
Doors: 4.0


In [37]:
# --- Sum the values of each numeric column (numeric_only=True skips text columns such as Price):
car_sales.sum(numeric_only=True)

Odometer (KM)    786014
Doors                40
dtype: int64

In [40]:
# --- Show the number of rows in the DF:
print(f'Total Rows: {len(car_sales)}')

Total Rows: 10


## Viewing and Selecting Data

In [56]:
# --- Show the first five rows of a dataframe (pass a number, e.g. head(10), to change how many rows are shown):
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [44]:
# --- Show the last five rows of a dataframe (pass a number, e.g. tail(10), to change how many rows are shown):
car_sales.tail()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
5,Toyota,Green,99213,4,"$4,500.00"
6,Honda,Blue,45698,4,"$7,500.00"
7,Honda,Blue,54738,4,"$7,000.00"
8,Toyota,White,60000,4,"$6,250.00"
9,Nissan,White,31600,4,"$9,700.00"


In [46]:
# --- Build a Series whose items carry explicit index labels
# --- (note that the label 5 is deliberately used twice):
pets = pd.Series(
    data=["cat", "dog", "bird", "panda", "snake"],
    index=[0, 2, 5, 8, 5],
)
pets

0      cat
2      dog
5     bird
8    panda
5    snake
dtype: object

In [48]:
# --- Locate and display any items in pets that have an index label of 5.
# --- This is useful for checking whether an index label is duplicated
# --- (label 5 was used twice when pets was created, so two items are returned):
pets.loc[5]

5     bird
5    snake
dtype: object

In [None]:
# --- Find every item in pets whose index label occurs more than once.
# --- Unlike pets.loc[5], this does not require knowing the duplicated label in advance;
# --- keep=False marks all occurrences of a repeated label, not just the later ones:
pets[pets.index.duplicated(keep=False)]

In [49]:
# --- Locate and display the row in the car_sales DF that has an index label of 5:
car_sales.loc[5]

Make                Toyota
Colour               Green
Odometer (KM)        99213
Doors                    4
Price            $4,500.00
Name: 5, dtype: object

In [55]:
# --- Locate and display the item in the pets series at index position 1 (positions start at 0):
pets.iloc[1]

'dog'

In [59]:
# --- Locate and display the row in the car_sales DF at index position 1 (positions start at 0):
car_sales.iloc[1]

Make                 Honda
Colour                 Red
Odometer (KM)        87899
Doors                    4
Price            $5,000.00
Name: 1, dtype: object

In [57]:
# --- Slice the pets series up to (but not including) position 3, which shows positions 0 - 2:
pets.iloc[:3]

0     cat
2     dog
5    bird
dtype: object

In [66]:
# --- Show only the contents of the Make column (column names are case-sensitive):
car_sales["Make"]

# --- Or you can use dot notation (note: this does not work with column names that contain spaces):
car_sales.Make

0    Toyota
1     Honda
2    Toyota
3       BMW
4    Nissan
5    Toyota
6     Honda
7     Honda
8    Toyota
9    Nissan
Name: Make, dtype: object

In [67]:
# --- Show all the cars where the Make column is equal to Toyota:
car_sales[car_sales["Make"] == "Toyota"]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
5,Toyota,Green,99213,4,"$4,500.00"
8,Toyota,White,60000,4,"$6,250.00"


In [68]:
# --- Show all the cars where the odometer reading is greater than 100000:
car_sales[car_sales["Odometer (KM)"] > 100000]

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
4,Nissan,White,213095,4,"$3,500.00"
