# Data structures
### Series

In [1]:
import pandas as pd

# No index is passed, so the five values receive the default integer labels
# (visible in the printed output).
s = pd.Series(data=[45, 34, 78, 65, 89])
print(s)

0    45
1    34
2    78
3    65
4    89
dtype: int64


I can directly access the index and the values of my series s

In [4]:
# .index holds the labels of the Series, .values the underlying data array.
print(s.index)
print(s.values)

RangeIndex(start=0, stop=5, step=1)
[45 34 78 65 89]


Comparison with numpy arrays

In [9]:
import numpy as np

# The same five numbers as in the Series `s`, this time as a plain NumPy array.
x = np.array([45, 34, 78, 65, 89])
print(x)
print(s.values)

print(type(s.values), type(x))  # s.values and x are of the same class, numpy.ndarray

[45 34 78 65 89]
[45 34 78 65 89]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In pandas, I can use arbitrary indices, as shown in the cell below.

In [12]:
fruits = ['mangoes', 'apples', 'oranges', 'guavas']
quantities = [20, 10, 45, 32]
quantities1 = [17, 13, 31, 32]

# Both Series use the fruit names as their index labels.
x = pd.Series(quantities, index=fruits)
x1 = pd.Series(quantities1, index=fruits)

# Addition is aligned on the index: the result keeps the labels, and the
# values that belong to the same label are added.
print(x + x1)

# Series.sum() is the vectorised reduction (instead of iterating over the
# Series with the built-in sum()).
print(f"Sum of x = {x.sum()}")

mangoes    37
apples     23
oranges    76
guavas     64
dtype: int64
Some of x = 107


The indices do not have to be identical for Series addition: the result contains the union of both indices, and a label that is missing from either Series gets the value NaN.

In [15]:
# Two label lists that differ only in their first entry ('peaches' vs 'mangoes').
f1 = ['peaches', 'apples', 'oranges', 'lemon']
f2 = ['mangoes', 'apples', 'oranges', 'lemon']
q1 = [32, 10, 5, 6]
q2 = [21, 54, 70, 3]

s1 = pd.Series(data=q1, index=f1)
s2 = pd.Series(data=q2, index=f2)

# Series.add is the method form of `+`; labels that occur in only one of the
# two operands come out as NaN.
print(s1.add(s2))

apples     64.0
lemon       9.0
mangoes     NaN
oranges    75.0
peaches     NaN
dtype: float64


### Indexing
It is possible to access single and multiple values of a Series object.

In [19]:
# Label-based lookup: one label yields the stored value, a list of labels
# yields a sub-Series in the requested order.
print(s1.loc['apples'])
print(s2.loc[['mangoes', 'apples', 'oranges']])

10
mangoes    21
apples     54
oranges    70
dtype: int64


Similar to numpy, I can use scalar operations and mathematical functions on Series object.

In [22]:
# Arithmetic with a scalar is applied to every element of the Series.
print((s1 + 3) * 2)
# NumPy functions accept a Series and return a Series with the same index.
print(np.sin(s2))

peaches    70
apples     26
oranges    16
lemon      18
dtype: int64
mangoes    0.836656
apples    -0.558789
oranges    0.773891
lemon      0.141120
dtype: float64


### pandas.Series.apply

In [29]:
# apply() runs the given function on the values; here the natural logarithm.
s1.apply(np.log) 

peaches    3.465736
apples     2.302585
oranges    1.609438
lemon      1.791759
dtype: float64

In [24]:
# Values greater than 50 are kept as they are; every other value is raised by 10.
s2.apply(lambda x: x if x > 50 else x + 10)

mangoes    31
apples     54
oranges    70
lemon      13
dtype: int64

### Filtering with a boolean array

In [27]:
print(s1)
# Boolean mask: keep only the entries whose value is below 20.
s1.loc[s1.lt(20)]

peaches    32
apples     10
oranges     5
lemon       6
dtype: int64


apples     10
oranges     5
lemon       6
dtype: int64

In [30]:
# `in` checks the index labels of the Series, not its values.
"apples" in s1

True

### Creating Series objects from dictionaries

In [3]:
import pandas as pd

# City name -> population.
cities = {
    "London": 8_615_246,
    "Berlin": 3_562_166,
    "Madrid": 3_165_235,
    "Rome": 2_874_038,
    "Paris": 2_273_305,
    "Vienna": 1_805_681,
    "Bucharest": 1_803_425,
    "Hamburg": 1_760_433,
    "Budapest": 1_754_000,
    "Warsaw": 1_740_119,
    "Barcelona": 1_602_386,
    "Munich": 1_493_900,
    "Milan": 1_350_680,
}

# The dictionary keys become the index labels, the dictionary values the data.
y = pd.Series(cities)
print(y)

London       8615246
Berlin       3562166
Madrid       3165235
Rome         2874038
Paris        2273305
Vienna       1805681
Bucharest    1803425
Hamburg      1760433
Budapest     1754000
Warsaw       1740119
Barcelona    1602386
Munich       1493900
Milan        1350680
dtype: int64


### NaN - Missing data

In [4]:
my_cities = ["London", "Paris", "Zurich", "Berlin", "Stuttgart", "Hamburg"]

# An explicit index selects exactly these labels from the dict; labels that
# are not keys of `cities` ("Zurich", "Stuttgart") are filled with NaN.
my_city_series = pd.Series(cities, index=my_cities)
my_city_series  # because of the NaN values, the populations of the other cities are turned into floats

London       8615246.0
Paris        2273305.0
Zurich             NaN
Berlin       3562166.0
Stuttgart          NaN
Hamburg      1760433.0
dtype: float64

### The methods isnull() and notnull()
I can check for the missing values with the methods isnull() and notnull()

In [5]:
# True marks the entries that are missing (NaN).
print(my_city_series.isnull())

London       False
Paris        False
Zurich        True
Berlin       False
Stuttgart     True
Hamburg      False
dtype: bool


In [6]:
# The inverse check: True marks the entries that hold a value.
print(my_city_series.notnull())

London        True
Paris         True
Zurich       False
Berlin        True
Stuttgart    False
Hamburg       True
dtype: bool


### Connection between NaN and None
We also get NaN if the dictionary value is None

In [7]:
# The entry for 'b' is None; it shows up as NaN in the printed Series.
d = dict(a=4, b=None, c=5, d=20)
s = pd.Series(d)
print(s)
# Method form of pd.isnull(s): flags the missing entry.
s.isnull()

a     4.0
b     NaN
c     5.0
d    20.0
dtype: float64


a    False
b     True
c    False
d    False
dtype: bool

### Filtering out missing data

In [8]:
# dropna() returns a Series without the NaN entries; my_city_series is not reassigned here.
print(my_city_series.dropna())

London     8615246.0
Paris      2273305.0
Berlin     3562166.0
Hamburg    1760433.0
dtype: float64


### Filling in missing data

In [9]:
# Every missing value is replaced by the same constant, 0.
print(my_city_series.fillna(0))

London       8615246.0
Paris        2273305.0
Zurich             0.0
Berlin       3562166.0
Stuttgart          0.0
Hamburg      1760433.0
dtype: float64


If I call fillna with a dict, I can provide the appropriate data, i.e. the population of Zurich and Stuttgart:

In [10]:
# Label -> replacement value; each NaN is filled with the value stored under its own label.
missing_cities = {"Stuttgart":597939, "Zurich":378884}
print(my_city_series.fillna(missing_cities))

London       8615246.0
Paris        2273305.0
Zurich        378884.0
Berlin       3562166.0
Stuttgart     597939.0
Hamburg      1760433.0
dtype: float64


In [11]:
# Fill the gaps first, then cast the float values back to integers.
# NOTE(review): astype(int) appears to pick the platform's default integer
# width (the recorded output shows int32) - confirm if a fixed width is needed.
my_city_series = my_city_series.fillna(missing_cities).astype(int)
print(my_city_series)

London       8615246
Paris        2273305
Zurich        378884
Berlin       3562166
Stuttgart     597939
Hamburg      1760433
dtype: int32


### DataFrame
Series can be arranged and concatenated into a DataFrame

In [12]:
# Three Series that share the same index of years (2014-2017).
years = range(2014, 2018)
s1 = pd.Series(data=[2409.14, 2941.01, 3496.83, 3119.55], index=years)
s2 = pd.Series(data=[1203.45, 3441.62, 3007.83, 3619.53], index=years)
s3 = pd.Series(data=[3412.12, 3491.16, 3457.19, 1963.10], index=years)

# Concatenating along the column axis places the Series side by side;
# the columns receive the default labels 0, 1, 2.
s_df = pd.concat([s1, s2, s3], axis="columns")
s_df

Unnamed: 0,0,1,2
2014,2409.14,1203.45,3412.12
2015,2941.01,3441.62,3491.16
2016,3496.83,3007.83,3457.19
2017,3119.55,3619.53,1963.1


In [13]:
# NOTE: this rebinds `cities`, which held the population dict in earlier cells.
cities = ['Kisumu', 'Mombasa', 'Nairobi']
# Replace the default column labels 0, 1, 2 with the city names.
s_df.columns = cities
s_df

Unnamed: 0,Kisumu,Mombasa,Nairobi
2014,2409.14,1203.45,3412.12
2015,2941.01,3441.62,3491.16
2016,3496.83,3007.83,3457.19
2017,3119.55,3619.53,1963.1


In [14]:
# The result of the concatenation is a DataFrame.
print(type(s_df))

<class 'pandas.core.frame.DataFrame'>


### DataFrames from dictionaries

In [15]:
# One (city, population, country) record per line keeps related values together.
_city_rows = [
    ("London", 8615246, "England"),
    ("Berlin", 3562166, "Germany"),
    ("Madrid", 3165235, "Spain"),
    ("Rome", 2874038, "Italy"),
    ("Paris", 2273305, "France"),
    ("Vienna", 1805681, "Austria"),
    ("Bucharest", 1803425, "Romania"),
    ("Hamburg", 1760433, "Germany"),
    ("Budapest", 1754000, "Hungary"),
    ("Warsaw", 1740119, "Poland"),
    ("Barcelona", 1602386, "Spain"),
    ("Munich", 1493900, "Germany"),
    ("Milan", 1350680, "Italy"),
]

# Column-oriented dict: each key becomes a column of the DataFrame.
cities = {
    "City": [name for name, _, _ in _city_rows],
    "population": [population for _, population, _ in _city_rows],
    "country": [country for _, _, country in _city_rows],
}

df = pd.DataFrame(cities)
df

Unnamed: 0,City,population,country
0,London,8615246,England
1,Berlin,3562166,Germany
2,Madrid,3165235,Spain
3,Rome,2874038,Italy
4,Paris,2273305,France
5,Vienna,1805681,Austria
6,Bucharest,1803425,Romania
7,Hamburg,1760433,Germany
8,Budapest,1754000,Hungary
9,Warsaw,1740119,Poland


### Retrieving the column names

In [16]:
# The column labels are available as an Index object.
df.columns

Index(['City', 'population', 'country'], dtype='object')

### Custom index

In [17]:
# One row label per city, in the same order as the lists inside `cities`.
# NOTE(review): "eigth" and "twelvth" are misspelled ("eighth", "twelfth");
# they are index labels, so correcting them changes the displayed output.
ordinals = ["first", "second", "third", "fourth",
            "fifth", "sixth", "seventh", "eigth",
            "ninth", "tenth", "eleventh", "twelvth",
            "thirteenth"]
city_df = pd.DataFrame(cities, index=ordinals)
city_df

Unnamed: 0,City,population,country
first,London,8615246,England
second,Berlin,3562166,Germany
third,Madrid,3165235,Spain
fourth,Rome,2874038,Italy
fifth,Paris,2273305,France
sixth,Vienna,1805681,Austria
seventh,Bucharest,1803425,Romania
eigth,Hamburg,1760433,Germany
ninth,Budapest,1754000,Hungary
tenth,Warsaw,1740119,Poland


### Rearranging the order of the columns

In [18]:
# The `columns` argument fixes the order in which the dict keys appear as columns.
df = pd.DataFrame(cities, columns=['City', 'country', 'population'])
df

Unnamed: 0,City,country,population
0,London,England,8615246
1,Berlin,Germany,3562166
2,Madrid,Spain,3165235
3,Rome,Italy,2874038
4,Paris,France,2273305
5,Vienna,Austria,1805681
6,Bucharest,Romania,1803425
7,Hamburg,Germany,1760433
8,Budapest,Hungary,1754000
9,Warsaw,Poland,1740119


In [19]:
# reindex returns a DataFrame with the columns in the requested order; the result is not assigned here.
df.reindex(columns=['country', 'City', 'population'])

Unnamed: 0,country,City,population
0,England,London,8615246
1,Germany,Berlin,3562166
2,Spain,Madrid,3165235
3,Italy,Rome,2874038
4,France,Paris,2273305
5,Austria,Vienna,1805681
6,Romania,Bucharest,1803425
7,Germany,Hamburg,1760433
8,Hungary,Budapest,1754000
9,Poland,Warsaw,1740119


Use the df.rename() function to rename the various columns

### Existing columns as the index of the DataFrame

In [20]:
# Use the country list as row labels and keep only the two remaining columns.
city_frame = pd.DataFrame(cities, columns=['City', 'population'], index=cities['country'])
city_frame

Unnamed: 0,City,population
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


In [21]:
# Alternatively, turn an existing column into the index; the result is stored
# under a new name, city_df1.
city_df1 = city_df.set_index('country')
city_df1
# If the optional parameter "inplace" is set to True, the DataFrame is changed in place, i.e. no new object is created.

Unnamed: 0_level_0,City,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


### Accessing rows via index values
Rows can be accessed using the locators loc and iloc

In [22]:
# 'Germany' occurs several times in the index, so .loc returns all matching rows.
print(city_frame.loc['Germany'])

            City  population
Germany   Berlin     3562166
Germany  Hamburg     1760433
Germany   Munich     1493900


In [23]:
# A list of labels returns the rows of each label, in the order of the list.
print(city_frame.loc[['Germany', 'France']])

            City  population
Germany   Berlin     3562166
Germany  Hamburg     1760433
Germany   Munich     1493900
France     Paris     2273305


I can also select DataFrame rows based on conditions

In [24]:
# Boolean Series: True for every row whose population exceeds two million.
condition = city_frame['population'].gt(2_000_000)
condition

England     True
Germany     True
Spain       True
Italy       True
France      True
Austria    False
Romania    False
Germany    False
Hungary    False
Poland     False
Spain      False
Germany    False
Italy      False
Name: population, dtype: bool

In [25]:
# .loc with a boolean Series keeps exactly the rows marked True.
print(city_frame.loc[condition])

           City  population
England  London     8615246
Germany  Berlin     3562166
Spain    Madrid     3165235
Italy      Rome     2874038
France    Paris     2273305


Several conditions can be combined with the logical operators & (and) and | (or).

In [26]:
# First mask: population above two million.
condition1 = city_frame['population'].gt(2_000_000)
# Second mask: the city name contains the letter 'r'.
condition2 = city_frame.City.str.contains('r')
# `&` combines both masks element by element.
print(city_frame.loc[condition1 & condition2])

           City  population
Germany  Berlin     3562166
Spain    Madrid     3165235
France    Paris     2273305


### Changing and adding rows of a DataFrame

In [28]:
print(city_frame)
milan = ['Milan', 1399860]
# iloc[-1] addresses the last existing row, so this assignment overwrites
# that row with the new values; it does not append a row.
city_frame.iloc[-1] = milan
city_frame


              City  population
England     London     8615246
Germany     Berlin     3562166
Spain       Madrid     3165235
Italy         Rome     2874038
France       Paris     2273305
Austria     Vienna     1805681
Romania  Bucharest     1803425
Germany    Hamburg     1760433
Hungary   Budapest     1754000
Poland      Warsaw     1740119
Spain    Barcelona     1602386
Germany     Munich     1493900
Italy        Milan     1350680


Unnamed: 0,City,population
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


In [29]:
# Assigning through .loc with a label that is not yet in the index
# ('Switzerland') adds a new row.
city_frame.loc['Switzerland'] = ['Zurich', 415215]
city_frame

Unnamed: 0,City,population
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


### Accessing rows by position

In [32]:
# Position -1 is the last row; a single row comes back as a Series.
print(city_frame.iloc[-1])

City          Zurich
population    415215
Name: Switzerland, dtype: object


In [33]:
# A list of positions may repeat entries and use any order; the rows are returned as listed.
print(city_frame.iloc[[3, 6, 3, 0]])

              City  population
Italy         Rome     2874038
Romania  Bucharest     1803425
Italy         Rome     2874038
England     London     8615246


### Sum and Cumulative sum
#### Sum

In [38]:
import random
import numpy as np

# Seeded generator, so that "Restart & Run All" reproduces the same figures.
rng = np.random.default_rng(seed=42)

years = range(2014, 2019)
cities = ["Zürich", "Freiburg", "München", "Konstanz", "Saarbrücken"]

# New DataFrame with one column per city and one row per year. Each column
# holds uniform random values between 700 and 1000, rounded to two decimals.
# The column length follows `years` instead of a hard-coded 5.
shops = pd.DataFrame(
    {
        city: (rng.uniform(0.7, 1, size=len(years)) * 1000).round(2)
        for city in cities
    },
    index=years,
)
shops

Unnamed: 0,Zürich,Freiburg,München,Konstanz,Saarbrücken
2014,936.39,868.53,738.7,940.31,958.34
2015,725.07,963.73,912.95,862.21,768.29
2016,891.61,775.21,783.97,850.54,982.71
2017,784.27,954.23,901.16,893.2,768.7
2018,858.83,809.93,947.42,743.89,914.84


In [40]:
# Without arguments, sum() adds up each column: one total per city.
shops.sum()

Zürich         4196.17
Freiburg       4371.63
München        4284.20
Konstanz       4290.15
Saarbrücken    4392.88
dtype: float64

In [41]:
# axis=1 adds up the values of each row instead: one total per year.
shops.sum(axis=1)

2014    4442.27
2015    4232.25
2016    4284.04
2017    4301.56
2018    4274.91
dtype: float64

In [46]:
# Positional column selection: all rows, first, third and last column.
# NOTE: this rebinds `s`, which held a Series in earlier cells.
s = shops.iloc[:, [0, 2, -1]]
print(s)
s.sum()

      Zürich  München  Saarbrücken
2014  936.39   738.70       958.34
2015  725.07   912.95       768.29
2016  891.61   783.97       982.71
2017  784.27   901.16       768.70
2018  858.83   947.42       914.84


Zürich         4196.17
München        4284.20
Saarbrücken    4392.88
dtype: float64

In [48]:
# Alternatively, select the same three columns by name before summing.
shops[['Zürich', 'München', 'Saarbrücken']].sum()

Zürich         4196.17
München        4284.20
Saarbrücken    4392.88
dtype: float64

#### Cumulative sum

In [49]:
# Running total down each column (year after year, per city).
x = shops.cumsum()
x

Unnamed: 0,Zürich,Freiburg,München,Konstanz,Saarbrücken
2014,936.39,868.53,738.7,940.31,958.34
2015,1661.46,1832.26,1651.65,1802.52,1726.63
2016,2553.07,2607.47,2435.62,2653.06,2709.34
2017,3337.34,3561.7,3336.78,3546.26,3478.04
2018,4196.17,4371.63,4284.2,4290.15,4392.88


Using the keyword parameter axis with the value 1, I can build the cumulative sum over the rows:

In [52]:
# Running total along each row (city after city, per year).
y = shops.cumsum(axis=1)
y

Unnamed: 0,Zürich,Freiburg,München,Konstanz,Saarbrücken
2014,936.39,1804.92,2543.62,3483.93,4442.27
2015,725.07,1688.8,2601.75,3463.96,4232.25
2016,891.61,1666.82,2450.79,3301.33,4284.04
2017,784.27,1738.5,2639.66,3532.86,4301.56
2018,858.83,1668.76,2616.18,3360.07,4274.91


### Assigning new values to columns

In [56]:
# Dictionary-style access returns the column as a Series.
city_frame['population']

England        8615246
Germany        3562166
Spain          3165235
Italy          2874038
France         2273305
Austria        1805681
Romania        1803425
Germany        1760433
Hungary        1754000
Poland         1740119
Spain          1602386
Germany        1493900
Italy          1399860
Switzerland     415215
Name: population, dtype: int64

In [1]:
# Column-oriented city data: three parallel lists of equal length (13 entries each).
cities = dict(
    City=[
        "London", "Berlin", "Madrid", "Rome", "Paris",
        "Vienna", "Bucharest", "Hamburg", "Budapest", "Warsaw",
        "Barcelona", "Munich", "Milan",
    ],
    population=[
        8615246, 3562166, 3165235, 2874038, 2273305,
        1805681, 1803425, 1760433, 1754000, 1740119,
        1602386, 1493900, 1350680,
    ],
    country=[
        "England", "Germany", "Spain", "Italy", "France",
        "Austria", "Romania", "Germany", "Hungary", "Poland",
        "Spain", "Germany", "Italy",
    ],
)

In [2]:
import pandas as pd
# 'cum_population' is not a key of `cities`, so that column starts out empty;
# the city names serve as the row labels.
c_frame = pd.DataFrame(cities, columns=['country', 'population', 'cum_population'], index=cities['City'])
c_frame

Unnamed: 0,country,population,cum_population
London,England,8615246,
Berlin,Germany,3562166,
Madrid,Spain,3165235,
Rome,Italy,2874038,
Paris,France,2273305,
Vienna,Austria,1805681,
Bucharest,Romania,1803425,
Hamburg,Germany,1760433,
Budapest,Hungary,1754000,
Warsaw,Poland,1740119,


In [3]:
# Fill the empty column with the running total of the population column.
c_frame['cum_population'] = c_frame['population'].cumsum()
c_frame

Unnamed: 0,country,population,cum_population
London,England,8615246,8615246
Berlin,Germany,3562166,12177412
Madrid,Spain,3165235,15342647
Rome,Italy,2874038,18216685
Paris,France,2273305,20489990
Vienna,Austria,1805681,22295671
Bucharest,Romania,1803425,24099096
Hamburg,Germany,1760433,25859529
Budapest,Hungary,1754000,27613529
Warsaw,Poland,1740119,29353648


### Accessing the columns of a DataFrame
#### In a dictionary like way

In [4]:
# Column access with the column name as key.
c_frame['population']

London       8615246
Berlin       3562166
Madrid       3165235
Rome         2874038
Paris        2273305
Vienna       1805681
Bucharest    1803425
Hamburg      1760433
Budapest     1754000
Warsaw       1740119
Barcelona    1602386
Munich       1493900
Milan        1350680
Name: population, dtype: int64

#### As an attribute

In [5]:
# The same column, accessed as an attribute of the DataFrame.
c_frame.population

London       8615246
Berlin       3562166
Madrid       3165235
Rome         2874038
Paris        2273305
Vienna       1805681
Bucharest    1803425
Hamburg      1760433
Budapest     1754000
Warsaw       1740119
Barcelona    1602386
Munich       1493900
Milan        1350680
Name: population, dtype: int64

### Assigning new values to a column

In [6]:
# "area" is not a key of `cities`, so the column is created without values.
city_frame = pd.DataFrame(cities, columns=["country", "area", "population"], index=cities["City"])
city_frame

Unnamed: 0,country,area,population
London,England,,8615246
Berlin,Germany,,3562166
Madrid,Spain,,3165235
Rome,Italy,,2874038
Paris,France,,2273305
Vienna,Austria,,1805681
Bucharest,Romania,,1803425
Hamburg,Germany,,1760433
Budapest,Hungary,,1754000
Warsaw,Poland,,1740119


In [7]:
# A scalar is written into every row of the column.
# NOTE(review): attribute-style assignment relies on the "area" column already
# existing (it is listed in the constructor's `columns`); city_frame["area"] = ...
# is the form that also works for new columns - confirm against the pandas docs.
city_frame.area =1572
city_frame

Unnamed: 0,country,area,population
London,England,1572,8615246
Berlin,Germany,1572,3562166
Madrid,Spain,1572,3165235
Rome,Italy,1572,2874038
Paris,France,1572,2273305
Vienna,Austria,1572,1805681
Bucharest,Romania,1572,1803425
Hamburg,Germany,1572,1760433
Budapest,Hungary,1572,1754000
Warsaw,Poland,1572,1740119


In [8]:
# Area of each city in square km, listed in the same order as the rows of city_frame.
area = [1572, 891.85, 605.77, 1285, 105.4, 414.6,
        228, 755, 525.2, 517, 101.9, 310.4, 181.8] 

# A list is assigned to the column position by position.
city_frame.area = area
city_frame

Unnamed: 0,country,area,population
London,England,1572.0,8615246
Berlin,Germany,891.85,3562166
Madrid,Spain,605.77,3165235
Rome,Italy,1285.0,2874038
Paris,France,105.4,2273305
Vienna,Austria,414.6,1805681
Bucharest,Romania,228.0,1803425
Hamburg,Germany,755.0,1760433
Budapest,Hungary,525.2,1754000
Warsaw,Poland,517.0,1740119


### Sorting DataFrames

In [9]:
# Largest area first; the sorted result is assigned back to the same name.
city_frame = city_frame.sort_values(by='area', ascending=False)
city_frame

Unnamed: 0,country,area,population
London,England,1572.0,8615246
Rome,Italy,1285.0,2874038
Berlin,Germany,891.85,3562166
Hamburg,Germany,755.0,1760433
Madrid,Spain,605.77,3165235
Budapest,Hungary,525.2,1754000
Warsaw,Poland,517.0,1740119
Vienna,Austria,414.6,1805681
Munich,Germany,310.4,1493900
Bucharest,Romania,228.0,1803425


In [13]:
frame = pd.DataFrame(cities, columns=["country", "area", "population"], index=cities["City"])

# Assigning a Series matches on the index labels: only London, Hamburg and
# Milan receive a value, every other row stays empty.
some_areas = pd.Series([1572, 755, 181.8], index=['London', 'Hamburg', 'Milan'])
frame['area'] = some_areas
frame

Unnamed: 0,country,area,population
London,England,1572.0,8615246
Berlin,Germany,,3562166
Madrid,Spain,,3165235
Rome,Italy,,2874038
Paris,France,,2273305
Vienna,Austria,,1805681
Bucharest,Romania,,1803425
Hamburg,Germany,755.0,1760433
Budapest,Hungary,,1754000
Warsaw,Poland,,1740119


### Inserting new columns into existing DataFrames

In [16]:
city_frame = pd.DataFrame(cities, columns=["country", "population"], index=cities["City"])
area = [1572, 891.85, 605.77, 1285, 105.4, 414.6,
        228, 755, 525.2, 517, 101.9, 310.4, 181.8]
# insert() places the new column at position 1, i.e. between "country" and
# "population", and modifies city_frame directly.
city_frame.insert(1, column='area', value=area)
city_frame

Unnamed: 0,country,area,population
London,England,1572.0,8615246
Berlin,Germany,891.85,3562166
Madrid,Spain,605.77,3165235
Rome,Italy,1285.0,2874038
Paris,France,105.4,2273305
Vienna,Austria,414.6,1805681
Bucharest,Romania,228.0,1803425
Hamburg,Germany,755.0,1760433
Budapest,Hungary,525.2,1754000
Warsaw,Poland,517.0,1740119


### Creating a DataFrame by appending rows

In [22]:
import numpy as np
import pandas as pd
from numpy.random import randint

# Fixed seed: randint draws from NumPy's global random state, so seeding it
# makes the table reproducible across kernel restarts.
np.random.seed(0)

df = pd.DataFrame(columns=['lib', 'qty1', 'qty2'])
for i in range(10):
    # Assigning to a label that does not exist yet appends one row: a name
    # followed by two random integers between 0 and 9.
    # (Fine for a small demo; growing a frame row by row is slow for large data.)
    df.loc[i] = [f'name{i}', *randint(10, size=2)]

df

Unnamed: 0,lib,qty1,qty2
0,name0,6,4
1,name1,2,3
2,name2,0,1
3,name3,4,1
4,name4,6,9
5,name5,9,4
6,name6,5,4
7,name7,6,5
8,name8,8,7
9,name9,0,1


### DataFrame from a nested dictionary

In [25]:
# Nested dict: the outer keys (countries) become the columns, the inner keys
# (years, as strings) become the row labels.
growth = {
    "Switzerland": {"2010": 3.0, "2011": 1.8, "2012": 1.1, "2013": 1.9},
    "Germany": {"2010": 4.1, "2011": 3.6, "2012": 0.4, "2013": 0.1},
    "France": {"2010": 2.0, "2011": 2.1, "2012": 0.3, "2013": 0.3},
    "Greece": {"2010": -5.4, "2011": -8.9, "2012": -6.6, "2013": -3.3},
    "Italy": {"2010": 1.7, "2011": 0.6, "2012": -2.3, "2013": -1.9},
}

df = pd.DataFrame(data=growth)
df

Unnamed: 0,Switzerland,Germany,France,Greece,Italy
2010,3.0,4.1,2.0,-5.4,1.7
2011,1.8,3.6,2.1,-8.9,0.6
2012,1.1,0.4,0.3,-6.6,-2.3
2013,1.9,0.1,0.3,-3.3,-1.9


In [28]:
# .T transposes the frame: countries become the rows, years the columns.
df1 = df.T
df1

Unnamed: 0,2010,2011,2012,2013
Switzerland,3.0,1.8,1.1,1.9
Germany,4.1,3.6,0.4,0.1
France,2.0,2.1,0.3,0.3
Greece,-5.4,-8.9,-6.6,-3.3
Italy,1.7,0.6,-2.3,-1.9


In [30]:
# reindex with a list of row labels returns the rows in exactly that order.
df2 = df1.reindex(['Switzerland', 'Italy', 'Germany', 'France', 'Greece'])
df2

Unnamed: 0,2010,2011,2012,2013
Switzerland,3.0,1.8,1.1,1.9
Italy,1.7,0.6,-2.3,-1.9
Germany,4.1,3.6,0.4,0.1
France,2.0,2.1,0.3,0.3
Greece,-5.4,-8.9,-6.6,-3.3


## Accessing and changing values of DataFrames

In [32]:
import pandas as pd

# Parallel tuples: position i of each tuple describes the same person.
first = ('Mike', 'Dorothee', 'Tom', 'Bill', 'Pete', 'Kate')
last = ('Meyer', 'Maier', 'Meyer', 'Mayer', 'Meyr', 'Mair')

job = ('data analyst', 'programmer', 'computer scientist', 
       'data scientist', 'accountant', 'psychiatrist')

language = ('Python', 'Perl', 'Java', 'Java', 'Cobol', 'Brainfuck')

# zip turns the columns into one (last, job, language) record per person;
# the first names are used as row labels.
x = list(zip(last, job, language))
df = pd.DataFrame(x, columns=['last', 'job', 'language'], index=first)
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,programmer,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Java
Pete,Meyr,accountant,Cobol
Kate,Mair,psychiatrist,Brainfuck


### Changing one value in the DataFrame
Pandas provides two accessors, loc and at, to access and change the values of a DataFrame.

In [35]:
# Access the job of Bill with loc: row label first, column label second.
print(df.loc['Bill',  'job'])

# Alternative way to access the same cell with at.
print(df.at['Bill', 'job'])

data scientist
data scientist


In [37]:
# Set the job of Bill with loc, then read it back to confirm the change.
df.loc['Bill', 'job'] = 'data analyst'
print(df.loc['Bill', 'job'])

data analyst


In [38]:
# Set the language of 'Bill' to 'python' (lower case) using at.
df.at['Bill', 'language'] = 'python'
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,programmer,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data analyst,python
Pete,Meyr,accountant,Cobol
Kate,Mair,psychiatrist,Brainfuck


I should use at when I only need to get or set a single value in a DataFrame or Series, and loc when I also want to access a group of rows and columns by label(s).

### Replace

In [44]:
s = pd.Series(data=[1, 2, 3, 4, 5])
# replace() hands back a new Series with 1 swapped for 13 ...
print(s.replace(to_replace=1, value=13))
# ... while `s` itself still holds the original values.
print(s)

0    13
1     2
2     3
3     4
4     5
dtype: int64
0    1
1    2
2    3
3    4
4    5
dtype: int64


In [45]:
s = pd.Series(data=[1, 2, 3, 4, 5])
# With inplace=True the Series `s` itself is modified.
s.replace(to_replace=1, value=13, inplace=True)
s

0    13
1     2
2     3
3     4
4     5
dtype: int64

I can also change multiple values into a single value

In [47]:
s = pd.Series(data=[0, 1, 2, 3, 4])
# Every value contained in the list is replaced by the same new value.
s.replace(to_replace=[0, 1, 2], value=13, inplace=True)
s

0    13
1    13
2    13
3     3
4     4
dtype: int64

In [52]:
import pandas as pd

# Same people as before; the language column now contains misspellings
# ('Pithon', 'Pythen') that the following cells replace.
first = ('Mike', 'Dorothee', 'Tom', 'Bill', 'Pete', 'Kate')
last = ('Meyer', 'Maier', 'Meyer', 'Mayer', 'Meyr', 'Mair')
job = ('data analyst', 'programmer', 'computer scientist',
       'data scientist', 'programmer', 'psychiatrist')
language = ('Python', 'Perl', 'Java', 'Pithon', 'Pythen', 'Brainfuck')

# One (last, job, language) record per person, labelled by first name.
x = list(zip(last, job, language))
df = pd.DataFrame(x, columns=['last', 'job', 'language'], index=first)
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,programmer,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,programmer,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [53]:
# Replace the value 'programmer' wherever it occurs in the frame; inplace=True modifies df itself.
df.replace('programmer', 'computer scientist', inplace=True)
df

Unnamed: 0,last,job,language
Mike,Meyer,data analyst,Python
Dorothee,Maier,computer scientist,Perl
Tom,Meyer,computer scientist,Java
Bill,Mayer,data scientist,Pithon
Pete,Meyr,computer scientist,Pythen
Kate,Mair,psychiatrist,Brainfuck


In [55]:
# Rebuild the frame from the original tuples, this time with the first name
# as an ordinary column and the default integer index.
x = list(zip(first, last, job, language))
df = pd.DataFrame(x, columns=['first', 'last', 'job', 'language'])
df

Unnamed: 0,first,last,job,language
0,Mike,Meyer,data analyst,Python
1,Dorothee,Maier,programmer,Perl
2,Tom,Meyer,computer scientist,Java
3,Bill,Mayer,data scientist,Pithon
4,Pete,Meyr,programmer,Pythen
5,Kate,Mair,psychiatrist,Brainfuck


In [56]:
# Two lists of equal length: the i-th old value is replaced by the i-th new value.
df.replace(['Mike', 'Tom', 'Perl'], ['Michael', 'Thomas', 'Python'], inplace=True)
df

Unnamed: 0,first,last,job,language
0,Michael,Meyer,data analyst,Python
1,Dorothee,Maier,programmer,Python
2,Thomas,Meyer,computer scientist,Java
3,Bill,Mayer,data scientist,Pithon
4,Pete,Meyr,programmer,Pythen
5,Kate,Mair,psychiatrist,Brainfuck


### Using regular expressions

In [57]:
# regex=True treats the entries of to_replace as regular expressions. They are
# paired with `value` by position: matches of M[ea][iy]e?r become 'Mayer',
# matches of P[iy]th[eo]n become 'Python'.
df.replace(to_replace=[r'M[ea][iy]e?r', r'P[iy]th[eo]n'], value=['Mayer', 'Python'], regex=True, inplace=True)
df

Unnamed: 0,first,last,job,language
0,Michael,Mayer,data analyst,Python
1,Dorothee,Mayer,programmer,Python
2,Thomas,Mayer,computer scientist,Java
3,Bill,Mayer,data scientist,Python
4,Pete,Mayer,programmer,Python
5,Kate,Mayer,psychiatrist,Brainfuck


#### to_replace can be a dictionary

In [60]:
# Small frame with one numeric and two text columns for the dictionary examples.
df = pd.DataFrame(
    data={
        'A': [0, 1, 2, 3, 4],
        'B': ['foo', 'bar', 'bloo', 'blee', 'bloo'],
        'C': ['green', 'red', 'blue', 'yellow', 'green'],
    }
)
df

Unnamed: 0,A,B,C
0,0,foo,green
1,1,bar,red
2,2,bloo,blue
3,3,blee,yellow
4,4,bloo,green


In [63]:
# Nested dict: outer key = column, inner dict = old value -> new value.
# Column 'C' is not mentioned and stays as it is.
df.replace(to_replace={'A':{0:42, 3:33}, 'B':{'bloo':'vloo'}}, inplace=True)
df

Unnamed: 0,A,B,C
0,42,foo,green
1,1,bar,red
2,2,vloo,blue
3,33,blee,yellow
4,4,vloo,green


In [78]:
# {'B': 'bar'} restricts the search to column 'B'; 'bar' is replaced by 'bor' there.
# No inplace and no assignment: only the displayed result carries the change.
df.replace({'B':'bar'}, 'bor')

Unnamed: 0,A,B,C
0,42,foo,green
1,1,bor,red
2,2,vloo,blue
3,33,blee,yellow
4,4,vloo,green


### The method parameter of replace (deprecated since pandas 2.1)

In [79]:
# 'NN' (job) and 'LN' (language) act as placeholders for unknown entries.
df = pd.DataFrame(
    dict(
        name=['Ben', 'Kate', 'Agnes', 'Ashleigh', 'Tom'],
        job=['programmer', 'NN', 'NN', 'engineer', 'teacher'],
        language=['Java', 'Python', 'LN', 'LN', 'C'],
    )
)
df

Unnamed: 0,name,job,language
0,Ben,programmer,Java
1,Kate,NN,Python
2,Agnes,NN,LN
3,Ashleigh,engineer,LN
4,Tom,teacher,C


In [81]:
# Forward fill: every 'NN' takes over the nearest value above it in the same column.
# replace(to_replace='NN', value=None, method='ffill') is avoided on purpose:
# an explicitly passed value=None is honoured since pandas 1.4 (the cells
# would become None), and the `method` keyword is deprecated since pandas 2.1.
# Masking the placeholder and calling ffill() is the supported spelling.
df.mask(df == 'NN').ffill()


Unnamed: 0,name,job,language
0,Ben,programmer,Java
1,Kate,programmer,Python
2,Agnes,programmer,LN
3,Ashleigh,engineer,LN
4,Tom,teacher,C


In [83]:
# Several placeholders at once: isin() accepts a list of values, every match
# is masked and then forward-filled from the value above it.
# (Stands in for replace(to_replace=[...], method='ffill'); the `method`
# keyword of replace is deprecated since pandas 2.1.)
df.mask(df.isin(['NN', 'LN'])).ffill()

Unnamed: 0,name,job,language
0,Ben,programmer,Java
1,Kate,programmer,Python
2,Agnes,programmer,Python
3,Ashleigh,engineer,Python
4,Tom,teacher,C


In [84]:
# Backward fill: every placeholder takes over the nearest value below it in
# the same column.
# replace([...], value=None, method='bfill') is avoided on purpose: an explicit
# value=None is honoured since pandas 1.4 (the cells would become None), and
# the `method` keyword is deprecated since pandas 2.1.
df.mask(df.isin(['NN', 'LN'])).bfill()

Unnamed: 0,name,job,language
0,Ben,programmer,Java
1,Kate,engineer,Python
2,Agnes,engineer,C
3,Ashleigh,engineer,C
4,Tom,teacher,C
