## Essential Operations with Data Frames

In [1]:
import numpy as np
import pandas as pd

### Loading data from files

In [2]:
# Show the raw file using plain Python; the `type` shell command only
# prints file contents on Windows, so `!type` fails on Linux/macOS.
with open('data.csv', encoding='utf-8') as f:
    print(f.read())

"NAME","AGE","LANGUAGE"
"Alice",30,"English"
"Bob",25,"Spanish"
"Charlie",35,"French"


In [3]:
# Parse the CSV into a DataFrame; the header row supplies the column names.
data = pd.read_csv('data.csv')
data

Unnamed: 0,NAME,AGE,LANGUAGE
0,Alice,30,English
1,Bob,25,Spanish
2,Charlie,35,French


In [4]:
# Show the raw file using plain Python; the `type` shell command only
# prints file contents on Windows, so `!type` fails on Linux/macOS.
with open('movie.json', encoding='utf-8') as f:
    print(f.read())

{
    "title": "Fight Club",
    "watched": true,
    "year": 1999,
    "actors": [
        "Brad Pitt",
        "Edward Norton",
        "Helena Bonham Carter"
    ]
}


In [5]:
# A single JSON object: the scalar fields are repeated for every entry
# of the "actors" list, giving one row per actor.
data = pd.read_json('movie.json')
data

Unnamed: 0,actors,title,watched,year
0,Brad Pitt,Fight Club,True,1999
1,Edward Norton,Fight Club,True,1999
2,Helena Bonham Carter,Fight Club,True,1999


In [6]:
# Show the raw file using plain Python; the `type` shell command only
# prints file contents on Windows, so `!type` fails on Linux/macOS.
with open('movies-90s.jsonl', encoding='utf-8') as f:
    print(f.read())

{"title": "Fight Club", "year": 1999, "actors": ["Brad Pitt", "Edward Norton", "Helena Bonham Carter"], "watched": true}
{"title": "Goodfellas", "year": 1990, "actors": ["Robert De Niro", "Ray Liotta", "Joe Pesci"], "watched": true}
{"title": "Forrest Gump", "year": 1994, "actors": ["Tom Hanks", "Robin Wright"], "watched": true}



In [7]:
# lines=True reads JSON Lines: each line of the file is one record (one row),
# so list-valued fields such as "actors" stay as lists inside a single cell.
data = pd.read_json('movies-90s.jsonl', lines=True)
data

Unnamed: 0,actors,title,watched,year
0,"[Brad Pitt, Edward Norton, Helena Bonham Carter]",Fight Club,True,1999
1,"[Robert De Niro, Ray Liotta, Joe Pesci]",Goodfellas,True,1990
2,"[Tom Hanks, Robin Wright]",Forrest Gump,True,1994


### Reindexing

Reindexing creates a new object whose data is conformed to a new index: labels that already exist keep their values, and labels that were not in the original index are filled with `NaN`.

In [8]:
# A small Series with a deliberately unordered, gappy index ('c' is missing).
data = pd.Series(data=[3, 1, 2], index=['b', 'a', 'd'])
data

b    3
a    1
d    2
dtype: int64

In [9]:
# 'c' is not in the original index, so it gets NaN (and the dtype becomes float64).
new_data = data.reindex(['a', 'b', 'c', 'd'])
new_data

a    1.0
b    3.0
c    NaN
d    2.0
dtype: float64

### Applying a function



In [10]:
# Two labelled rows of perfect squares, reused by the apply and sort examples.
data = pd.DataFrame(
    data=[[4, 36, 1], [9, 25, 16]],
    index=['Red', 'Blue'],
    columns=['A', 'B', 'C'],
)
data

Unnamed: 0,A,B,C
Red,4,36,1
Blue,9,25,16


In [11]:
# NumPy ufuncs are applied element-wise to a DataFrame and the result keeps
# the row and column labels. (numpy is imported with the other imports at the
# top of the notebook so later sections do not depend on this cell having run.)
np.sqrt(data)

Unnamed: 0,A,B,C
Red,2.0,6.0,1.0
Blue,3.0,5.0,4.0


In [12]:
def double_up(x):
    """Return ``x`` multiplied by two."""
    doubled = x * 2
    return doubled

# Call double_up on every individual element of the frame.
# NOTE: pandas 2.1+ deprecates applymap in favour of DataFrame.map.
data.applymap(double_up)

Unnamed: 0,A,B,C
Red,8,72,2
Blue,18,50,32


In [13]:
# The original frame is unchanged: applymap returned a new object.
data

Unnamed: 0,A,B,C
Red,4,36,1
Blue,9,25,16


In [14]:
def difference(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    highest = x.max()
    lowest = x.min()
    return highest - lowest

# By default the function receives each column in turn, giving one result per column.
data.apply(difference)

A     5
B    11
C    15
dtype: int64

In [15]:
# axis=1 passes each row instead, giving one result per row label.
data.apply(difference, axis=1)

Red     35
Blue    16
dtype: int64

### Sorting

In [16]:
# The frame used in the sorting examples below.
data

Unnamed: 0,A,B,C
Red,4,36,1
Blue,9,25,16


In [17]:
data.sort_index()  # sort by row labels (axis=0 is the default), ascending

Unnamed: 0,A,B,C
Blue,9,25,16
Red,4,36,1


In [18]:
# Only the column order changes; the rows stay where they were.
data.sort_index(axis=1,           # sort by column labels
                ascending=False)  # descending

Unnamed: 0,C,B,A
Red,1,36,4
Blue,16,25,9


In [19]:
# Still in its original order: sort_index returned new objects.
data

Unnamed: 0,A,B,C
Red,4,36,1
Blue,9,25,16


In [20]:
# Order the rows by the values in column 'B' (ascending by default).
data.sort_values(by='B')

Unnamed: 0,A,B,C
Blue,9,25,16
Red,4,36,1


In [21]:
# With axis=1, `by` names a row label: order the columns by the values in row 'Blue'.
data.sort_values(by='Blue', axis=1)

Unnamed: 0,A,C,B
Red,4,1,36
Blue,9,16,25


### Handling missing data

In [22]:
# np.nan marks the missing entries (positions 2 and 4); its presence makes the Series float64.
data = pd.Series(data=[1, 2, np.nan, 3, np.nan])

data

0    1.0
1    2.0
2    NaN
3    3.0
4    NaN
dtype: float64

In [23]:
# Comparing with None does NOT detect missing values: every entry is False.
data == None

0    False
1    False
2    False
3    False
4    False
dtype: bool

In [24]:
# True exactly where the value is missing (NaN).
data.isnull()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [25]:
# The inverse mask: True where a value is present.
data.notnull()

0     True
1     True
2    False
3     True
4    False
dtype: bool

#### Filtering out missing data

In [26]:
# Drop the missing entries; the surviving rows keep their original labels (0, 1, 3).
data.dropna()

0    1.0
1    2.0
3    3.0
dtype: float64

In [27]:
# Same result as dropna(), written as boolean indexing with the notnull() mask.
data[data.notnull()]

0    1.0
1    2.0
3    3.0
dtype: float64

#### Filling in missing data

In [28]:
# Replace every NaN with the same constant.
data.fillna(0)

0    1.0
1    2.0
2    0.0
3    3.0
4    0.0
dtype: float64

In [29]:
# mean() ignores the NaN entries, so the fill value is (1 + 2 + 3) / 3 = 2.0.
data.fillna(data.mean())

0    1.0
1    2.0
2    2.0
3    3.0
4    2.0
dtype: float64

In [30]:
# A dict maps index label -> fill value, so each gap can get its own replacement.
data.fillna({2: 100, 4: 500})

0      1.0
1      2.0
2    100.0
3      3.0
4    500.0
dtype: float64

#### Notice

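All the functions discussed here return a *new* pandas object; the original is left untouched.

To keep a result, assign it to a name (e.g. `data = data.fillna(0)`). Some of these methods (for example `sort_index`, `sort_values`, `dropna` and `fillna`) also accept `inplace=True` to modify the object in place; others, such as `apply`, `applymap` and `reindex`, do not.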