# Operating on Data in Pandas

### Ufuncs: Index Preservation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Seeded generator so every run of the notebook draws the same numbers
rng = np.random.RandomState(seed=42)
# Four random integers in [0, 10) wrapped in a Series with the default index
ser = pd.Series(rng.randint(low=0, high=10, size=4))
ser

0    6
1    3
2    7
3    4
dtype: int64

In [3]:
# 3x4 frame of random integers in [0, 10) with labelled columns A-D
df = pd.DataFrame(rng.randint(0, 10, size=(3, 4)), columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [4]:
# A NumPy ufunc applied to a Series returns a Series that keeps the same index
np.exp(ser)

0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64

In [5]:
# The same holds for a DataFrame: row and column labels are kept in the result
np.sin(df * np.pi / 4)

Unnamed: 0,A,B,C,D
0,-1.0,0.7071068,1.0,-1.0
1,-0.707107,1.224647e-16,0.707107,-0.7071068
2,-0.707107,1.0,-0.707107,1.224647e-16


In [6]:
# df itself is unchanged: the ufunc above returned a new DataFrame
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


In [7]:
# Element-wise power: every entry is raised to itself (e.g. 6 ** 6 = 46656)
df ** df

Unnamed: 0,A,B,C,D
0,46656,387420489,4,46656
1,823543,256,27,823543
2,823543,4,3125,256


In [8]:
# Again, the operation above did not modify df
df

Unnamed: 0,A,B,C,D
0,6,9,2,6
1,7,4,3,7
2,7,2,5,4


### Ufuncs: Index Alignment

In [10]:
# Two Series keyed by state name; their indices only partly overlap
# (Alaska appears only in `area`, New York only in `population`).

area = pd.Series(
    {'Alaska': 1723337, 'Texas': 695662, 'California': 423967},
    name='area',
)
population = pd.Series(
    {'California': 38332521, 'Texas': 26448193, 'New York': 19651127},
    name='population',
)

In [18]:
# When you perform ufunc operations on two sets of data,
# if the indices don't ALL match pandas will automatically insert 'NaN' for the
# missing values

density = population / area
population

California    38332521
Texas         26448193
New York      19651127
Name: population, dtype: int64

In [20]:
# Alignment in action: the two Series share only the labels 1 and 2, so the
# sum is NaN at labels 0 and 3 (each of those is missing from one operand).

A = pd.Series(data=[2, 4, 6], index=[0, 1, 2])
B = pd.Series(data=[1, 3, 5], index=[1, 2, 3])
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [33]:
# The method form (add, divide, ...) does the same job as the operator.
# Its 'fill_value' keyword substitutes the given value for a label that is
# missing from one operand before the operation is applied; the default,
# None, leaves those positions as NaN.

# Default behaviour: same result as A + B
print(A.add(B))
# Explicit fill_value=None is identical to the default
print(A.add(B, fill_value=None))
# Missing entries are treated as 0: label 0 -> 2 + 0, label 3 -> 0 + 5
print(A.add(B, fill_value=0))
# Missing entries are treated as 5: label 0 -> 2 + 5, label 3 -> 5 + 5
print(A.add(B, fill_value=5))
# Label 0 divides 2 by the filled-in 0, giving inf; label 3 is 0 / 5 = 0
print(A.divide(B, fill_value=0))

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64
0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64
0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64
0     7.0
1     5.0
2     9.0
3    10.0
dtype: float64
0    inf
1    4.0
2    2.0
3    0.0
dtype: float64


### Index Alignment in DataFrame

In [35]:
# Operations on DataFrames align labels the same way as on Series.
# Note: this rebinds A (a Series in the cells above) to a 2x2 DataFrame.

A = pd.DataFrame(rng.randint(0, 20, size=(2, 2)), columns=['A', 'B'])
A

Unnamed: 0,A,B
0,0,11
1,11,16


In [36]:
# 3x3 frame whose columns are deliberately in a different order (B, A, C)
# and which has one more row and one more column than A
B = pd.DataFrame(rng.randint(0, 10, size=(3, 3)), columns=['B', 'A', 'C'])
B

Unnamed: 0,B,A,C
0,9,2,6
1,3,8,2
2,4,2,6


In [44]:
# Rows and columns are both aligned by label, regardless of their order in
# each frame; positions present in only one frame (row 2, column C) become NaN
A + B

Unnamed: 0,A,B,C
0,2.0,20.0,
1,19.0,19.0,
2,,,


In [45]:
# A is unchanged by the addition above
A

Unnamed: 0,A,B
0,0,11
1,11,16


In [54]:
# Two ways to get the mean of all values in A.
# stack() flattens the frame into a single Series, so mean() runs once over every value.
# NOTE(review): whether this form is actually faster is not measured here - confirm with %timeit.
print(A.stack().mean())
# mean() gives one mean per column, and the second mean() averages those column means.
# The two results agree here because both columns hold the same number of values.
print(A.mean().mean())

9.5
9.5


In [55]:
# Overall mean of A (9.5), used as the stand-in for entries missing from one frame
fill = A.stack().mean()
# Entries present in only one frame are replaced by `fill` before adding,
# e.g. row 2 / column A is 9.5 + 2 = 11.5
A.add(B, fill_value=fill)

Unnamed: 0,A,B,C
0,2.0,20.0,15.5
1,19.0,19.0,11.5
2,11.5,13.5,15.5


In [60]:
'''
LIST OF OPERATORS AND EQUIVALENT PANDAS METHODS

Python Operator     Pandas Method(s)
+                   add()             
-                   sub(), subtract()       
*                   mul(), multiply()       
/                   truediv(), div(), divide() 
//                  floordiv()                 
%                   mod()                       
**                  pow()                     
''';

### Operations between DataFrames and Series

In [69]:
# Operations between a DataFrame and a Series follow the same broadcasting
# rules as NumPy arrays. Start with the NumPy version: a 2-D array and its
# first row (the DataFrame equivalent is built from this array further down).
# Note: A is rebound here to a plain ndarray.

A = rng.randint(0, 10, size=(3, 4))
A

array([[9, 3, 5, 1],
       [9, 1, 9, 3],
       [7, 6, 8, 7]])

In [71]:
# First row of the array
A[0]

array([9, 3, 5, 1])

In [72]:
# Broadcasting: the first row is subtracted from every row of the array
A - A[0]

array([[ 0,  0,  0,  0],
       [ 0, -2,  4,  2],
       [-2,  3,  3,  6]])

In [73]:
# Wrap the array in a DataFrame with labelled columns (this rebinds df)
df = pd.DataFrame(A, columns=['Q', 'R', 'S', 'T'])

In [77]:
# Show that a single row taken from a DataFrame is a Series whose index is the
# frame's column labels, and that subtracting it works row-wise like A - A[0].
# The extra '\n' arguments only add blank lines between the printed sections.
print(type(df), '\n')
print(df, '\n\n')
# iloc[0] selects the first row by position
print(type(df.iloc[0]), '\n')
print(df.iloc[0], '\n\n')
# The row is matched against the column labels and subtracted from every row
print(df - df.iloc[0])

<class 'pandas.core.frame.DataFrame'> 

   Q  R  S  T
0  9  3  5  1
1  9  1  9  3
2  7  6  8  7 


<class 'pandas.core.series.Series'> 

Q    9
R    3
S    5
T    1
Name: 0, dtype: int64 


   Q  R  S  T
0  0  0  0  0
1  0 -2  4  2
2 -2  3  3  6


In [89]:
# axis=0 matches the Series index against the row labels, so column Q is
# subtracted from every column
df.subtract(df['Q'], axis=0)

Unnamed: 0,Q,R,S,T
0,0,-6,-4,-8
1,0,-8,0,-6
2,0,-1,1,0


In [90]:
# Without axis=0 the Series index (0, 1, 2) is matched against the column
# labels (Q, R, S, T); nothing lines up, so every entry of the result is NaN
df.subtract(df['Q'])

Unnamed: 0,Q,R,S,T,0,1,2
0,,,,,,,
1,,,,,,,
2,,,,,,,


In [101]:
# axis=1 matches the row's index (Q, R, S, T) against the column labels:
# row 0 is subtracted from every row, same result as df - df.iloc[0]
df.subtract(df.loc[0], axis=1)

Unnamed: 0,Q,R,S,T
0,0,0,0,0
1,0,-2,4,2
2,-2,3,3,6


In [103]:
# df[1] looks up a *column* labelled 1; the columns are Q, R, S and T, so the
# lookup raises KeyError (rows are selected with df.loc[1] / df.iloc[1]).
# The error is the point of this cell, so catch and print it instead of
# letting it abort a top-to-bottom run of the notebook.
try:
    df.subtract(df[1])
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 1

In [105]:
# Indices align automatically, even when the Series covers only some columns.
# Take every second column (Q and S) of the first row.

halfrow = df.iloc[0, ::2]
halfrow

Q    9
S    5
Name: 0, dtype: int64

In [106]:
# Columns Q and S are matched by label; R and T are missing from halfrow,
# so those columns become NaN
df - halfrow

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,0.0,,4.0,
2,-2.0,,3.0,


In [107]:
# df is unchanged by the operations above
df

Unnamed: 0,Q,R,S,T
0,9,3,5,1
1,9,1,9,3
2,7,6,8,7
