# Intro to data tools

## Hello world

In [1]:
print('Hello world')

Hello world


## Creating a variable

In [2]:
x = 2

# Working with Pandas

## Importing the library

In [3]:
import pandas as pd

## Pandas series
### Creating a series

In [4]:
flower_list = ['Sunflower', 'Orchid', 'Cup milk', 'Lily', 'Rose', 'Violet']
flower_series = pd.Series(flower_list)
flower_series.name = 'Flowers'
flower_series

0    Sunflower
1       Orchid
2     Cup milk
3         Lily
4         Rose
5       Violet
Name: Flowers, dtype: object

## Pandas dataframe

### Creating a list

In [5]:
pizza_list = [['Margherita', 'Y', 22.0], ['Pepperoni', 'N', 27.0], ['Ham and cheese', 'N', 20.0], [], ['Hawaiian', 'N', 25.0]]
pizza_list.append(['Four cheese', 'Y', 27.0])
pizza_list.append(['', '', ''])
pizza_list.append(['Mushroom', 'Y', 25.0])
pizza_list

[['Margherita', 'Y', 22.0],
 ['Pepperoni', 'N', 27.0],
 ['Ham and cheese', 'N', 20.0],
 [],
 ['Hawaiian', 'N', 25.0],
 ['Four cheese', 'Y', 27.0],
 ['', '', ''],
 ['Mushroom', 'Y', 25.0]]

### Turning the list into a dataframe

In [6]:
pizza_dataframe = pd.DataFrame(pizza_list)
pizza_dataframe

Unnamed: 0,0,1,2
0,Margherita,Y,22.0
1,Pepperoni,N,27.0
2,Ham and cheese,N,20.0
3,,,
4,Hawaiian,N,25.0
5,Four cheese,Y,27.0
6,,,
7,Mushroom,Y,25.0


### Adding names to the columns

In [7]:
pizza_dataframe.columns = ['Flavor', 'Is it vegetarian?', 'Price']
pizza_dataframe

Unnamed: 0,Flavor,Is it vegetarian?,Price
0,Margherita,Y,22.0
1,Pepperoni,N,27.0
2,Ham and cheese,N,20.0
3,,,
4,Hawaiian,N,25.0
5,Four cheese,Y,27.0
6,,,
7,Mushroom,Y,25.0


### Showing the dataframe info

.head(n) shows the first n data

In [8]:
pizza_dataframe.head(3)

Unnamed: 0,Flavor,Is it vegetarian?,Price
0,Margherita,Y,22.0
1,Pepperoni,N,27.0
2,Ham and cheese,N,20.0


.tail(n) shows the last n data

In [9]:
pizza_dataframe.tail(3)

Unnamed: 0,Flavor,Is it vegetarian?,Price
5,Four cheese,Y,27.0
6,,,
7,Mushroom,Y,25.0


.shape shows the number of columns and rows

In [10]:
pizza_dataframe.shape

(8, 3)

.info() shows a compilation of the dataframe information

In [11]:
pizza_dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Flavor             7 non-null      object
 1   Is it vegetarian?  7 non-null      object
 2   Price              7 non-null      object
dtypes: object(3)
memory usage: 320.0+ bytes


OBS:

The null value refers to the line of index 3, that displays 'None'

```
['', ''] is not the same as []
```
['',''] is a list with string values, but with no characters

[] is an empty list

### Slicing and dicing the dataframe

#### Selecting a column

In [12]:
pizza_dataframe['Price']

0    22.0
1    27.0
2    20.0
3    None
4    25.0
5    27.0
6        
7    25.0
Name: Price, dtype: object

#### Selecting multiple columns

In [13]:
pizza_dataframe[['Price', 'Flavor']]

Unnamed: 0,Price,Flavor
0,22.0,Margherita
1,27.0,Pepperoni
2,20.0,Ham and cheese
3,,
4,25.0,Hawaiian
5,27.0,Four cheese
6,,
7,25.0,Mushroom


#### Selecting a specific location inside the dataframe

In [14]:
print(pizza_dataframe.iloc[0,0])
print(pizza_dataframe.iloc[5,2])

Margherita
27.0


#### Selecting a range inside the dataframe

##### Index location: .iloc() - Uses indexes to set ranges of data

Restricting both the columns and rows

In [15]:
pizza_dataframe.iloc[2:5, 0:2]

Unnamed: 0,Flavor,Is it vegetarian?
2,Ham and cheese,N
3,,
4,Hawaiian,N


In [16]:
pizza_dataframe.iloc[2:5, [0,2]]

Unnamed: 0,Flavor,Price
2,Ham and cheese,20.0
3,,
4,Hawaiian,25.0


Restricting only the columns

In [17]:
pizza_dataframe.iloc[:, 0:2]

Unnamed: 0,Flavor,Is it vegetarian?
0,Margherita,Y
1,Pepperoni,N
2,Ham and cheese,N
3,,
4,Hawaiian,N
5,Four cheese,Y
6,,
7,Mushroom,Y


Restricting only the rows

In [18]:
pizza_dataframe.iloc[1:4, :]

Unnamed: 0,Flavor,Is it vegetarian?,Price
1,Pepperoni,N,27.0
2,Ham and cheese,N,20.0
3,,,


With no restrictions

In [19]:
pizza_dataframe.iloc[:,:]

Unnamed: 0,Flavor,Is it vegetarian?,Price
0,Margherita,Y,22.0
1,Pepperoni,N,27.0
2,Ham and cheese,N,20.0
3,,,
4,Hawaiian,N,25.0
5,Four cheese,Y,27.0
6,,,
7,Mushroom,Y,25.0


##### Location: .loc() - Uses indexes and columns names to set ranges of data

Selecting all the rows and restricting to the column 'Flavor'

In [20]:
pizza_dataframe.loc[:, 'Flavor']

0        Margherita
1         Pepperoni
2    Ham and cheese
3              None
4          Hawaiian
5       Four cheese
6                  
7          Mushroom
Name: Flavor, dtype: object

Restricting the rows and restricting to the column 'Flavor'

In [21]:
pizza_dataframe.loc[3:7, 'Flavor']

3           None
4       Hawaiian
5    Four cheese
6               
7       Mushroom
Name: Flavor, dtype: object

Restricting the rows and restricting to the columns 'Price' and 'Flavor'

In [22]:
pizza_dataframe.loc[3:7, ['Price', 'Flavor']]

Unnamed: 0,Price,Flavor
3,,
4,25.0,Hawaiian
5,27.0,Four cheese
6,,
7,25.0,Mushroom


### Dealing with .csv files

#### Importing the file

In [23]:
airports_df = pd.read_csv('./data/airports.csv')
airports_df

Unnamed: 0,Name,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,Heathrow,London,United Kingdom
3,Schiphol,Amsterdam,Netherlands
4,Changi,Singapore,Singapore
5,Pearson,Toronto,Canada
6,Narita,Tokyo,Japan


#### Skipping rows with errors

In [24]:
airports_df_invalid_row = pd.read_csv('./data/airports_invalid_row.csv', on_bad_lines = 'skip')
airports_df_invalid_row

Unnamed: 0,Seattle-Tacoma,Seattle,USA
0,Dulles,Washington,USA
1,Schiphol,Amsterdam,Netherlands
2,Changi,Singapore,Singapore
3,Pearson,Toronto,Canada
4,Narita,Tokyo,Japan


#### Dealing with files without header

Using indexes for the columns

In [25]:
airports_df_no_header = pd.read_csv('./data/airports_no_header.csv', header = None)
airports_df_no_header

Unnamed: 0,0,1,2
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,Heathrow,London,United Kingdom
3,Schiphol,Amsterdam,Netherlands
4,Changi,Singapore,Singapore
5,Pearson,Toronto,Canada
6,Narita,Tokyo,Japan


Adding the columns name during the importation

In [26]:
airports_df_with_header = pd.read_csv('./data/airports_no_header.csv', header = None, names=['Airport', 'City', 'Country'])
airports_df_with_header

Unnamed: 0,Airport,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,Heathrow,London,United Kingdom
3,Schiphol,Amsterdam,Netherlands
4,Changi,Singapore,Singapore
5,Pearson,Toronto,Canada
6,Narita,Tokyo,Japan


#### Missing values: NaN

In [27]:
airports_df_nan = pd.read_csv('./data/airports_nan.csv')
airports_df_nan

Unnamed: 0,Name,City,Country
0,Seattle-Tacoma,Seattle,USA
1,Dulles,Washington,USA
2,Heathrow,London,United Kingdom
3,Schiphol,Amsterdam,Netherlands
4,Changi,,Singapore
5,Pearson,Toronto,Canada
6,Narita,Tokyo,Japan


#### Exporting a dataframe as .csv

Exporting the dataframe with the index

In [28]:
airports_df_with_header.to_csv('./data/airports_with_header.csv')

Exporting the dataframe without the index

In [29]:
airports_df_with_header.to_csv('./data/airports_with_header_no_index.csv', index = False)