In [4]:
import numpy as np
import pandas as pd

pd.__version__

'0.21.1'

## Data Structures in Pandas

* Series: one-dimensional array of indexed data, has both index and values.
* DataFrame: can be viewed either as a generalization of a NumPy array or as a specialization of a Python dict.
* Index

### Series vs. np array

The essential difference is the index: a NumPy array has an implicitly defined integer index, while a `Series` has an explicitly defined index attached to its values.

You can also think of a Pandas `Series` as a specialization of a Python dict. The type information carried by a `Series` makes it much more efficient than a Python dict.

In [5]:
# Build a Series from a plain list; no index is passed, so pandas supplies one.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [7]:
# Show the index, then a single lookup, then a slice -- one item per line.
print(data.index, data[1], data[1:3], sep='\n')

RangeIndex(start=0, stop=4, step=1)
0.5
1    0.50
2    0.75
dtype: float64


In [9]:
# The index labels are given explicitly here and do not have to be integers.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=list('abcd'))
print(data, data['a'], sep='\n')

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.25


In [10]:
# Confusing case: an explicit integer index that is not simply 0..n-1.
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=[2, 5, 3, 7])

# Here 3 is matched against the index labels, not used as an array position:
# the label 3 belongs to the third value, so this displays 0.75 rather than 1.0.
data[3]

0.75

In [11]:
# Build a Series directly from a dict: the dict keys become the index labels.
population_dict = {
    'Califonia': 123,
    'Texas': 234,
    'New York': 345,
    'Florida': 456,
    'Illinois': 567,
}
pop = pd.Series(data=population_dict)
pop

Califonia    123
Florida      456
Illinois     567
New York     345
Texas        234
dtype: int64

In [13]:
# Surprising slicing: a slice by label keeps the stop label ('Illinois') as well.
pop.loc['Califonia':'Illinois']

Califonia    123
Florida      456
Illinois     567
dtype: int64

### DataFrame as a generalized NumPy array

You can think of a DataFrame as a sequence of aligned Series objects, which means they share the same index.

In [16]:
area_dict = {
    'Califonia': 321,
    'Texas': 432,
    'New York': 543,
    'Florida': 654,
    'Illinois': 765,
}
area = pd.Series(data=area_dict)

# Both Series are keyed by state name, so their values are aligned row by row.
states = pd.DataFrame(data={'pop': pop, 'area': area})
states

Unnamed: 0,area,pop
Califonia,321,123
Florida,654,456
Illinois,765,567
New York,543,345
Texas,432,234


### DataFrame as a specialized dict

A DataFrame maps a column name to a series of column data.

In [17]:
# Dict-style lookup: indexing by a column name returns that column as a Series.
states['area']

Califonia    321
Florida      654
Illinois     765
New York     543
Texas        432
Name: area, dtype: int64

### Constructing DataFrame objects

In [19]:
# Wrap a single Series in a one-column DataFrame with the given column name.
pd.DataFrame(
    data=pop,
    columns=['population'],
)

Unnamed: 0,population
Califonia,123
Florida,456
Illinois,567
New York,345
Texas,234


In [21]:
# From a list of dicts: a key that is absent from a record is filled with NaN.
pd.DataFrame(data=[
    {'a': 1, 'b': 2},
    {'b': 3, 'c': 4},
])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [22]:
# From a dict of Series: each dict key names one column.
pd.DataFrame(data={
    'population': pop,
    'area': area,
})

Unnamed: 0,area,population
Califonia,321,123
Florida,654,456
Illinois,765,567
New York,543,345
Texas,432,234


In [23]:
# From a 2-D NumPy array, naming both the rows and the columns explicitly.
# The values come from an unseeded random generator, so they change on each run.
pd.DataFrame(
    data=np.random.rand(3, 2),
    index=['a', 'b', 'c'],
    columns=['foo', 'bar'],
)

Unnamed: 0,foo,bar
a,0.538901,0.425459
b,0.867735,0.276374
c,0.003995,0.504655


# Combining Datasets: Concat and Append

* concat
* join
* merge


In [24]:
def make_df(cols, ind):
    """Build a small labelled DataFrame for the concatenation examples.

    Every cell holds its column label followed by its row label, so the
    column 'A' at row 0 contains the string 'A0'.

    Parameters
    ----------
    cols : iterable
        Column labels; a string such as 'ABC' yields one column per character.
    ind : iterable
        Row labels. It is iterated once per column and also passed as the
        index, so it must be re-iterable (a list or range, not a generator).

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``cols``, indexed by ``ind``.
    """
    cells = {}
    for col in cols:
        prefix = str(col)
        cells[col] = [prefix + str(row) for row in ind]
    return pd.DataFrame(cells, index=ind)

make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [25]:
df1 = make_df('AB', [1, 2])
df2 = make_df('AB', [3, 4])
print(df1, df2, sep='\n')

# With no axis argument, concat works along axis=0 and stacks the rows.
print(pd.concat([df1, df2]))

    A   B
1  A1  B1
2  A2  B2
    A   B
3  A3  B3
4  A4  B4
    A   B
1  A1  B1
2  A2  B2
3  A3  B3
4  A4  B4


In [32]:
# Duplicate indices: give y the same row labels as x, then concatenate.
x = make_df('AB', [0, 1])
y = make_df('AB', [2, 3])
y.index = x.index  # y's rows are now labelled 0 and 1, just like x's
print(x, y, sep='\n')
print(pd.concat([x, y]))

    A   B
0  A0  B0
1  A1  B1
    A   B
0  A2  B2
1  A3  B3
    A   B
0  A0  B0
1  A1  B1
0  A2  B2
1  A3  B3


In [30]:
# verify_integrity=True makes concat raise when index labels repeat; catch the
# error and show it. (Alternatively, use ignore_index to ignore the duplicate
# indices.)
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as err:
    print('ValueError: ', err)

ValueError:  Indexes have overlapping values: [0, 1]


In [33]:
# Adding multi-index keys: each input frame is labelled with its key, so the
# result has a two-level index (key, original row label).
print(pd.concat([x, y], keys=['x', 'y']))

      A   B
x 0  A0  B0
  1  A1  B1
y 0  A2  B2
  1  A3  B3


In [34]:
df5 = make_df('ABC', [1, 2])
df6 = make_df('BCD', [3, 4])
print(df5, df6, sep='\n')

# The default join is 'outer': the result carries every column from either
# frame, with NaN wherever a frame has no such column.
print(pd.concat([df5, df6]))

    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4
     A   B   C    D
1   A1  B1  C1  NaN
2   A2  B2  C2  NaN
3  NaN  B3  C3   D3
4  NaN  B4  C4   D4


In [35]:
# Inner join: keep only the columns that both frames have in common (B and C).
pd.concat([df5, df6], join='inner')

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


In [36]:
# Keep exactly the columns of df5 (A, B, C) in the concatenated result.
# The `join_axes` argument of pd.concat was deprecated in pandas 0.25 and
# removed in 1.0, so concatenate first and then reindex the columns; this
# gives the same frame and also works on the older releases.
pd.concat([df5, df6]).reindex(columns=df5.columns)

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2
3,,B3,C3
4,,B4,C4


## Merge and join

* pd.merge function

Note the **Relational Algebra**.