# Data structures
### Series

In [1]:
import pandas as pd

s = pd.Series([45, 34, 78, 65, 89])
print(s)

0    45
1    34
2    78
3    65
4    89
dtype: int64


I can directly access the index and the values of my series s

In [4]:
print(s.index)
print(s.values)

RangeIndex(start=0, stop=5, step=1)
[45 34 78 65 89]


Comparison with numpy arrays

In [9]:
import numpy as np

x = np.array([45, 34, 78, 65, 89])
print(x)
print(s.values)

print(type(s.values), type(x)) #The s values and x are of the same class, numpy.ndarry 

[45 34 78 65 89]
[45 34 78 65 89]
<class 'numpy.ndarray'> <class 'numpy.ndarray'>


In pandas, I can use arbitrary indices, this is shown in the cell below.

In [12]:
fruits = ['mangoes', 'apples', 'oranges', 'guavas']
quantities = [20, 10, 45, 32]
quantities1 = [17, 13, 31, 32]

x = pd.Series(quantities, index=fruits)
x1 = pd.Series(quantities1, index=fruits)
print(x + x1) #I'll get a new series with the same indices, the corresponding values will be added
print(f"Some of x = {sum(x)}")

mangoes    37
apples     23
oranges    76
guavas     64
dtype: int64
Some of x = 107


The indices does not have to be the same for series addition, if the index is not the same for both Series, the value of the series will be NaN 

In [15]:
f1 = ['peaches', 'apples', 'oranges', 'lemon']
f2 = ['mangoes', 'apples', 'oranges', 'lemon']
q1 = [32, 10, 5, 6]
q2 = [21, 54, 70, 3]

s1 = pd.Series(q1, index=f1)
s2 = pd.Series(q2, index=f2)
print(s1 + s2)

apples     64.0
lemon       9.0
mangoes     NaN
oranges    75.0
peaches     NaN
dtype: float64


### Indexing
It is possible to access single and multiple values of a Series object.

In [19]:
print(s1['apples'])
print(s2[['mangoes', 'apples', 'oranges']])

10
mangoes    21
apples     54
oranges    70
dtype: int64


Similar to numpy, I can use scalar operations and mathematical functions on Series object.

In [22]:
print((s1 + 3) * 2)
print(np.sin(s2))

peaches    70
apples     26
oranges    16
lemon      18
dtype: int64
mangoes    0.836656
apples    -0.558789
oranges    0.773891
lemon      0.141120
dtype: float64


### Pandas.Series.Apply

In [29]:
s1.apply(np.log) 

peaches    3.465736
apples     2.302585
oranges    1.609438
lemon      1.791759
dtype: float64

In [24]:
s2.apply(lambda x: x if x > 50 else x + 10)

mangoes    31
apples     54
oranges    70
lemon      13
dtype: int64

### Filtering with a boolean array

In [27]:
print(s1)
s1[s1<20]

peaches    32
apples     10
oranges     5
lemon       6
dtype: int64


apples     10
oranges     5
lemon       6
dtype: int64

In [30]:
"apples" in s1

True

### Creating Series objects from dictionaries

In [31]:
cities = {  "London": 8615246,
            "Berlin": 3562166,
            "Madrid": 3165235,
            "Rome": 2874038,
            "Paris": 2273305,
            "Vienna": 1805681,
            "Bucharest": 1803425,
            "Hamburg": 1760433,
            "Budapest": 1754000,
            "Warsaw": 1740119,
            "Barcelona": 1602386,
            "Munich": 1493900,
            "Milan": 1350680     }
y = pd.Series(cities)
print(y)

London       8615246
Berlin       3562166
Madrid       3165235
Rome         2874038
Paris        2273305
Vienna       1805681
Bucharest    1803425
Hamburg      1760433
Budapest     1754000
Warsaw       1740119
Barcelona    1602386
Munich       1493900
Milan        1350680
dtype: int64


### NaN - Missing data

In [32]:
my_cities = ["London", "Paris", "Zurich", "Berlin", "Stuttgart", "Hamburg"]

my_city_series = pd.Series(cities, index=my_cities)
my_city_series #due to the NaN values, the population values of the other cities are turned into float

London       8615246.0
Paris        2273305.0
Zurich             NaN
Berlin       3562166.0
Stuttgart          NaN
Hamburg      1760433.0
dtype: float64

### The methods Isnull() and Notnull()
I can check for the missing values with the methods isnull() and notnull()

In [33]:
print(my_city_series.isnull())

London       False
Paris        False
Zurich        True
Berlin       False
Stuttgart     True
Hamburg      False
dtype: bool


In [34]:
print(my_city_series.notnull())

London        True
Paris         True
Zurich       False
Berlin        True
Stuttgart    False
Hamburg       True
dtype: bool


### Connection between NaN and None
We also get NaN if the dictionary value is None

In [38]:
d = {'a':4, 'b':None, 'c':5, 'd':20}
s = pd.Series(d)
print(s)
pd.isnull(s)

a     4.0
b     NaN
c     5.0
d    20.0
dtype: float64


a    False
b     True
c    False
d    False
dtype: bool

### Filtering out missing data

In [39]:
print(my_city_series.dropna())

London     8615246.0
Paris      2273305.0
Berlin     3562166.0
Hamburg    1760433.0
dtype: float64


### Filling in missing data

In [40]:
print(my_city_series.fillna(0))

London       8615246.0
Paris        2273305.0
Zurich             0.0
Berlin       3562166.0
Stuttgart          0.0
Hamburg      1760433.0
dtype: float64


If I call fillna with a dict, I can provide the appropriate data, i.e. the population of Zurich and Stuttgart:

In [41]:
missing_cities = {"Stuttgart":597939, "Zurich":378884}
print(my_city_series.fillna(missing_cities))

London       8615246.0
Paris        2273305.0
Zurich        378884.0
Berlin       3562166.0
Stuttgart     597939.0
Hamburg      1760433.0
dtype: float64


In [42]:
my_city_series = my_city_series.fillna(missing_cities).astype(int)
print(my_city_series)

London       8615246
Paris        2273305
Zurich        378884
Berlin       3562166
Stuttgart     597939
Hamburg      1760433
dtype: int32


### DataFrame
Series can be arranged and concatenated into a DataFrame

In [44]:
years = range(2014, 2018)
s1 = pd.Series([2409.14, 2941.01, 3496.83, 3119.55], index=years)
s2 = pd.Series([1203.45, 3441.62, 3007.83, 3619.53], index=years)
s3 = pd.Series([3412.12, 3491.16, 3457.19, 1963.10], index=years)

s_df = pd.concat([s1, s2, s3], axis=1)
s_df

Unnamed: 0,0,1,2
2014,2409.14,1203.45,3412.12
2015,2941.01,3441.62,3491.16
2016,3496.83,3007.83,3457.19
2017,3119.55,3619.53,1963.1


In [45]:
cities = ['Kisumu', 'Mombasa', 'Nairobi']
s_df.columns = cities
s_df

Unnamed: 0,Kisumu,Mombasa,Nairobi
2014,2409.14,1203.45,3412.12
2015,2941.01,3441.62,3491.16
2016,3496.83,3007.83,3457.19
2017,3119.55,3619.53,1963.1


In [46]:
print(type(s_df))

<class 'pandas.core.frame.DataFrame'>


### DataFrames from dictionaries

In [54]:
cities = {"City":  ["London", "Berlin", "Madrid", "Rome",
                    "Paris", "Vienna", "Bucharest", "Hamburg",
                    "Budapest", "Warsaw", "Barcelona",
                    "Munich", "Milan"],
          "population": [8615246, 3562166, 3165235, 2874038,
                         2273305, 1805681, 1803425, 1760433,
                         1754000, 1740119, 1602386, 1493900,
                         1350680],
          "country": ["England", "Germany", "Spain", "Italy",
                      "France", "Austria", "Romania",
                      "Germany", "Hungary", "Poland", "Spain",
                      "Germany", "Italy"] }

df = pd.DataFrame(cities)
df

Unnamed: 0,City,population,country
0,London,8615246,England
1,Berlin,3562166,Germany
2,Madrid,3165235,Spain
3,Rome,2874038,Italy
4,Paris,2273305,France
5,Vienna,1805681,Austria
6,Bucharest,1803425,Romania
7,Hamburg,1760433,Germany
8,Budapest,1754000,Hungary
9,Warsaw,1740119,Poland


### Retrieving the column names

In [55]:
df.columns

Index(['City', 'population', 'country'], dtype='object')

### Custome index

In [56]:
ordinals = ["first", "second", "third", "fourth",
            "fifth", "sixth", "seventh", "eigth",
            "ninth", "tenth", "eleventh", "twelvth",
            "thirteenth"]
city_df = pd.DataFrame(cities, index=ordinals)
city_df

Unnamed: 0,City,population,country
first,London,8615246,England
second,Berlin,3562166,Germany
third,Madrid,3165235,Spain
fourth,Rome,2874038,Italy
fifth,Paris,2273305,France
sixth,Vienna,1805681,Austria
seventh,Bucharest,1803425,Romania
eigth,Hamburg,1760433,Germany
ninth,Budapest,1754000,Hungary
tenth,Warsaw,1740119,Poland


### Rearranging the order of the columns

In [59]:
df = pd.DataFrame(cities, columns=['City', 'country', 'population'])
df

Unnamed: 0,City,country,population
0,London,England,8615246
1,Berlin,Germany,3562166
2,Madrid,Spain,3165235
3,Rome,Italy,2874038
4,Paris,France,2273305
5,Vienna,Austria,1805681
6,Bucharest,Romania,1803425
7,Hamburg,Germany,1760433
8,Budapest,Hungary,1754000
9,Warsaw,Poland,1740119


In [68]:
df.reindex(columns=['country', 'City', 'population'])

Unnamed: 0,country,City,population
0,England,London,8615246
1,Germany,Berlin,3562166
2,Spain,Madrid,3165235
3,Italy,Rome,2874038
4,France,Paris,2273305
5,Austria,Vienna,1805681
6,Romania,Bucharest,1803425
7,Germany,Hamburg,1760433
8,Hungary,Budapest,1754000
9,Poland,Warsaw,1740119


Use the df.rename() function to rename the various columns

### Existing columns as the index of the DataFrame

In [65]:
city_frame = pd.DataFrame(cities, columns=['City', 'population'], index=cities['country'])
city_frame

Unnamed: 0,City,population
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


In [66]:
#Alternatively
city_df1 = city_df.set_index('country')
city_df1
#If I set the optional parameter "inplace" to True, the DataFrame will be changed in place, i.e. no new object will be created:

Unnamed: 0_level_0,City,population
country,Unnamed: 1_level_1,Unnamed: 2_level_1
England,London,8615246
Germany,Berlin,3562166
Spain,Madrid,3165235
Italy,Rome,2874038
France,Paris,2273305
Austria,Vienna,1805681
Romania,Bucharest,1803425
Germany,Hamburg,1760433
Hungary,Budapest,1754000
Poland,Warsaw,1740119


### Accessing rows via index values
Rows can be accessed using the locators loc and iloc

In [69]:
print(city_frame.loc['Germany'])

            City  population
Germany   Berlin     3562166
Germany  Hamburg     1760433
Germany   Munich     1493900


In [71]:
print(city_frame.loc[['Germany', 'France']])

            City  population
Germany   Berlin     3562166
Germany  Hamburg     1760433
Germany   Munich     1493900
France     Paris     2273305


I can also select DataFrame rows based on conditions

In [72]:
condition = city_frame.population > 2000000
condition

England     True
Germany     True
Spain       True
Italy       True
France      True
Austria    False
Romania    False
Germany    False
Hungary    False
Poland     False
Spain      False
Germany    False
Italy      False
Name: population, dtype: bool

In [73]:
print(city_frame.loc[condition])

           City  population
England  London     8615246
Germany  Berlin     3562166
Spain    Madrid     3165235
Italy      Rome     2874038
France    Paris     2273305
