# Python for Data Analysis - Workbook 2 (Pandas Basics)

### Preliminaries

In [2]:
import numpy as np
import pandas as pd

## Basic Data Structures

### Series

In [4]:
# A Series is a one-dimensional, array-like object that holds a sequence of values
# together with an associated array of labels called its index.
# No index is passed here; the output shows the default integer labels 0..3.

obj = pd.Series([5,-2,13,4])
obj

0     5
1    -2
2    13
3     4
dtype: int64

In [5]:
# Pass the `index` argument to supply custom labels; values and labels are
# paired by position (5 -> 'a', -2 -> 'c', 13 -> 'd', 4 -> 'b').

obj = pd.Series([5,-2,13,4], index = ['a', 'c', 'd', 'b'])
obj

a     5
c    -2
d    13
b     4
dtype: int64

In [10]:
# A Series can also be created directly from a Python dict.
# NOTE: the expression below is not the last one in the cell, so its result is
# not shown in the cell output.

capitals = {'New York': 'Albany', 'Arizona': 'Phoenix', 'Montana': 'Helena'}
pd.Series(capitals)

# Passing `index` alongside the dict selects and orders the entries by those labels:
# the dict keys become the index labels and the dict values become the data.
# A label with no matching dict key ('Texas' here) gets a missing value (NaN).

pd.Series(capitals, index = ['Montana', 'Arizona', 'New York', 'Texas'])

Montana      Helena
Arizona     Phoenix
New York     Albany
Texas           NaN
dtype: object

In [15]:
# Checking for missing data.
# 'Texas' is not a key of `capitals`, so its entry in `capitals2` is missing (NaN).

capitals = {'New York': 'Albany', 'Arizona': 'Phoenix', 'Montana': 'Helena'}
capitals2 = pd.Series(capitals, index = ['Montana', 'Arizona', 'New York', 'Texas'])

# isnull() returns a boolean Series that is True exactly where a value is missing.
# Related spellings:
#   pd.isnull(capitals2)    -> same result as capitals2.isnull()
#   capitals2.notnull()     -> the element-wise inverse
#   pd.notnull(capitals2)   -> same result as capitals2.notnull()
capitals2.isnull()



Montana     False
Arizona     False
New York    False
Texas        True
dtype: bool

In [30]:
# Both the Series itself and its index carry a `name` attribute that can be set
# by plain assignment. In the printed output the index name appears as a header
# above the labels, and the Series name appears in the trailing "Name:" line.

a = {'Red': 29, 'Blue': 42, 'Green': 24}
obj = pd.Series(a)
obj.name = 'Numbers'
obj.index.name = 'Colors'
print(obj)
print(obj.index)

Colors
Red      29
Blue     42
Green    24
Name: Numbers, dtype: int64
Index(['Red', 'Blue', 'Green'], dtype='object', name='Colors')


In [34]:
# The index can be replaced in place by assigning a new sequence of labels to
# `obj.index`; the values keep their positions and only the labels change.

obj = pd.Series([29, 42, 24], index = ['Red', 'Blue', 'Green'])
obj.index = ['Purple', 'Yellow', 'Orange']
obj

Purple    29
Yellow    42
Orange    24
dtype: int64

### DataFrame

In [35]:
# A DataFrame is a rectangular table of data: an ordered collection of columns, each of which can hold a different value type
# Higher-dimensional data can be represented through hierarchical indexing

In [39]:
# One way to create a DataFrame is from a dict of equal-length lists or NumPy arrays:
# each key becomes a column name and each list supplies that column's values.

data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}

pd.DataFrame(data)

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [48]:
# head() returns the first five rows by default, or the first n rows when a number is passed in.
# NOTE: `data` is rebound here from the plain dict of the previous cell to a DataFrame.

data = pd.DataFrame({
            'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'], 
            'year': [2000, 2001, 2002, 2001, 2002, 2003],
            'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
        })

data.head()
# data.head(3)  # first three rows only

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9


In [49]:
# Passing `columns` arranges the columns in exactly the order listed

pd.DataFrame(data, columns = ['year', 'state', 'pop'])

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [50]:
# A name in `columns` that does not exist in the data ('debt' here) becomes a column filled with missing values (NaN)

pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'])

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,
1,2001,Ohio,1.7,
2,2002,Ohio,3.6,
3,2001,Nevada,2.4,
4,2002,Nevada,2.9,
5,2003,Nevada,3.2,


In [54]:
# A column can be retrieved as a Series with dict-like notation or attribute-like notation.
# Attribute access only works for names that are valid Python identifiers and do not
# collide with DataFrame attributes (e.g. `data.pop` is the DataFrame.pop method, not the 'pop' column).

data['year']
# data.year  # attribute-style alternative

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [58]:
# Rows are retrieved by index label with the `loc` attribute; a single label
# returns that row as a Series whose index is the column names.

data.loc[0]

state    Ohio
year     2000
pop       1.5
Name: 0, dtype: object

In [74]:
# Columns can be modified by assignment.
# Assigning a scalar sets every row of the column to that value.

frame2 = pd.DataFrame(data, columns = ['year', 'state', 'pop', 'debt'])

frame2['debt'] = 10
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,10
1,2001,Ohio,1.7,10
2,2002,Ohio,3.6,10
3,2001,Nevada,2.4,10
4,2002,Nevada,2.9,10
5,2003,Nevada,3.2,10


In [75]:
# ...or an array that supplies one value per row (6 values for the 6 rows here).
# NOTE: np.random is not seeded, so the values differ on every run.

frame2['debt'] = np.random.randint(0,20,6)
frame2

Unnamed: 0,year,state,pop,debt
0,2000,Ohio,1.5,3
1,2001,Ohio,1.7,1
2,2002,Ohio,3.6,3
3,2001,Nevada,2.4,6
4,2002,Nevada,2.9,3
5,2003,Nevada,3.2,15


In [76]:
# A column can be deleted with the `del` statement (a Python keyword, not a method)

del frame2['debt']
frame2

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [77]:
# Assigning to frame2['pop'] replaces the whole column on frame2 itself.
# In the output the dtype changes from float64 to int64, because the float values are replaced by the integer 0.
# NOTE(review): whether a column *selected* with frame2['pop'] is a view or a copy depends on the pandas
# version and its Copy-on-Write setting; do not rely on mutating the selected Series to update the frame.

print(frame2['pop'])
print('\n')
frame2['pop'] = 0
frame2['pop']

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64




0    0
1    0
2    0
3    0
4    0
5    0
Name: pop, dtype: int64

In [81]:
# Another way to create a DataFrame is to pass a nested dict:
# outer keys become the columns and inner keys become the row labels.
# `.T` transposes the frame (swaps rows and columns), just like a NumPy array.

frame3 = pd.DataFrame({'Nevada': {'2002': 100, '2003': 150}, 'Ohio': {'2002': 500, '2003': 450}})
print(frame3)
frame3.T

      Nevada  Ohio
2002     100   500
2003     150   450


Unnamed: 0,2002,2003
Nevada,100,150
Ohio,500,450


#### Index Objects

In [4]:
# An Index object holds the axis labels and other metadata such as the axis name(s)

obj = pd.Series(range(3), index=['a', 'b', 'c'])
index  = obj.index
index

Index(['a', 'b', 'c'], dtype='object')

In [16]:
# Index objects are immutable and support set-like operations, although unlike
# a Python set they may contain duplicate labels.

firstIndex = pd.Series(range(3), index = ['a', 'b', 'c']).index
secondIndex = pd.Series(range(3), index = ['c', 'd', 'e']).index

# Only the last expression of a cell is displayed, so of the four calls below
# only the isin() result shows up in the output.
firstIndex.difference(secondIndex)
firstIndex.union(secondIndex)
firstIndex.intersection(secondIndex)
firstIndex.isin(['a'])

array([ True, False, False])

In [10]:
# Join two indices end to end using append(); the result holds the labels of the first followed by those of the second

firstIndex = pd.Series(range(3), index = ['a', 'b', 'c']).index
secondIndex = pd.Series(range(3), index = ['d', 'e', 'f']).index

joined = firstIndex.append(secondIndex)
print(joined)

Index(['a', 'b', 'c', 'd', 'e', 'f'], dtype='object')


In [20]:
# Check whether the index labels are in non-decreasing order (each label >= the previous one).
# The property is read from `obj.index`: the Series-level `is_monotonic` inspected the
# *values* rather than the index, and it no longer exists in pandas 2.x.

obj = pd.Series(range(3), index=['a', 'b', 'c'])
print(obj.index.is_monotonic_increasing)

# Check whether every index label is unique (again on the index, not on the values)
print(obj.index.is_unique)

True
True


In [31]:
# Collect the distinct index labels into a Python set.
# The label 'c' appears twice in the index but only once in the result.
# Sets are unordered, so the printed order of the labels is arbitrary.

obj = pd.Series(range(4), index=['a', 'b', 'c', 'c'])

indexSet = set(obj.index.unique())
print(indexSet)

{'c', 'b', 'a'}


### Reindexing

In [47]:
# reindex() returns a new object whose data is rearranged to match the new index;
# labels absent from the original ('d' here) are filled with NaN.

vals = np.random.randn(3)
idx = list('abc')
obj = pd.Series(data=vals, index=idx)

newIdx = [*idx, 'd']
newObj = obj.reindex(index=newIdx)
newObj

a   -0.175440
b   -0.942898
c   -1.163431
d         NaN
dtype: float64

In [70]:
# After reindexing, the gaps can be filled: ffill() carries the last valid value
# forward, bfill() pulls the next valid value backward.

vals = np.random.randn(3)
idx = range(0, 6, 2)
obj = pd.Series(data=vals, index=idx)

newIdx = range(6)
# Alternative: obj.reindex(newIdx).bfill()  (a number can be added to limit how far forward/back to fill)
newObj = obj.reindex(index=newIdx).ffill()
print(newObj)
print('\n')

# fill_value substitutes a constant for the missing entries instead of NaN
newObj2 = obj.reindex(newIdx, fill_value='x')
print(newObj2)

0    0.055842
1    0.055842
2   -0.653863
3   -0.653863
4   -2.673041
5   -2.673041
dtype: float64


0    0.0558415
1            x
2    -0.653863
3            x
4     -2.67304
5            x
dtype: object


In [64]:
# For a DataFrame, reindex can alter the rows, the columns, or both.

frame = pd.DataFrame(np.arange(9).reshape(3, 3), index = ['a', 'b', 'c'], columns = ['Texas', 'California', 'New York'])

# A bare sequence is interpreted as the new row labels.
# Labels missing from `frame` ('f', 'g') become all-NaN rows, and the printed result shows the values as floats.

frame2 = frame.reindex(['a', 'b', 'f', 'g'])
print(frame2)
print('\n')

# Pass the `columns` keyword to reindex the columns instead; of these three only 'California' exists in `frame`
states = ['Arizona', 'California', 'Montana']
frame3 = frame.reindex(columns=states)
print(frame3)

   Texas  California  New York
a    0.0         1.0       2.0
b    3.0         4.0       5.0
f    NaN         NaN       NaN
g    NaN         NaN       NaN


   Arizona  California  Montana
a      NaN           1      NaN
b      NaN           4      NaN
c      NaN           7      NaN


### Dropping entries

In [73]:
# drop() removes the entries whose index labels are listed and gives back the remaining ones

vals = np.random.randn(5)
idx = list('abcde')
obj = pd.Series(data=vals, index=idx)
print(obj)

obj.drop(labels=['a', 'b'])

a    3.477182
b   -0.652200
c    0.094297
d   -0.858969
e   -0.574794
dtype: float64


c    0.094297
d   -0.858969
e   -0.574794
dtype: float64

In [77]:
# For a DataFrame, `axis` chooses what to drop: rows (axis = 0) or columns (axis = 1)

data = pd.DataFrame(np.arange(16).reshape(4,4), 
                    index = ['Tennessee', 'New York', 'Minnesota', 'Utah'], 
                    columns = ['one', 'two', 'three', 'four'])
print(data)
data.drop(['three'], axis = 1)  # result discarded: not assigned and not the cell's last expression
data.drop(['Utah'], axis = 0)   # last expression, so this is the frame that gets displayed

# drop() returns a copy and leaves `data` unmodified; pass 'inplace = True' to modify the original instead

           one  two  three  four
Tennessee    0    1      2     3
New York     4    5      6     7
Minnesota    8    9     10    11
Utah        12   13     14    15


Unnamed: 0,one,two,three,four
Tennessee,0,1,2,3
New York,4,5,6,7
Minnesota,8,9,10,11


### Indexing, Selection, and Filtering

In [82]:
# Indexing a Series with [] works much like indexing a NumPy array,
# except that the index labels can be used as keys.

vals = np.arange(4)
idx = list('abcd')
obj = pd.Series(data=vals, index=idx)
print(obj)
obj['b']

a    0
b    1
c    2
d    3
dtype: int64


1

In [84]:
# An integer slice selects by position with the end excluded: positions 1 and 2, i.e. labels 'b' and 'c'

obj[1:3]

b    1
c    2
dtype: int64

In [86]:
# A list of index labels selects those entries, returned in the order the labels are listed

obj[['b', 'a', 'd']]

b    1
a    0
d    3
dtype: int64

In [90]:
# Slicing with labels also works, but the endpoint is INCLUDED ('c' is part of the result), unlike regular Python slicing

obj['b':'c']

b    1
c    2
dtype: int64

In [88]:
# Filter by value: the comparison builds a boolean mask, and indexing with it keeps the entries where it is True

obj[obj > 1]

c    2
d    3
dtype: int64

In [92]:
# Assigning through a label modifies the Series in place

obj['b'] = 5
obj

a    0
b    5
c    2
d    3
dtype: int64

In [93]:
# Display obj again: the value assigned to 'b' in the previous cell is still there
obj

a    0
b    5
c    2
d    3
dtype: int64

In [95]:
# Select one column by passing its name, or several columns by passing a list of names

data = pd.DataFrame(np.arange(16).reshape(4,4), 
                    index = ['Tennessee', 'New York', 'Minnesota', 'Utah'], 
                    columns = ['one', 'two', 'three', 'four'])
print(data)

# data['one']  # single name
data[['one', 'two']]

           one  two  three  four
Tennessee    0    1      2     3
New York     4    5      6     7
Minnesota    8    9     10    11
Utah        12   13     14    15


Unnamed: 0,one,two
Tennessee,0,1
New York,4,5
Minnesota,8,9
Utah,12,13


In [98]:
# Slicing inside [] selects rows. A label slice includes its endpoint ('New York' is part of the result).

data[:'New York']
# data[:2]  # positional slice that selects the same two rows

Unnamed: 0,one,two,three,four
Tennessee,0,1,2,3
New York,4,5,6,7


In [99]:
# Comparing a DataFrame with a scalar is done element-wise and yields a frame of booleans

data > 5

Unnamed: 0,one,two,three,four
Tennessee,False,False,False,False
New York,False,False,True,True
Minnesota,True,True,True,True
Utah,True,True,True,True


#### Selection with loc and iloc

In [104]:
# .loc selects by label: row label(s) first, then column label(s). Remember to use brackets, not parentheses.

print(data)
data.loc['Tennessee', ['two', 'three']]

           one  two  three  four
Tennessee    0    1      2     3
New York     4    5      6     7
Minnesota    8    9     10    11
Utah        12   13     14    15


two      1
three    2
Name: Tennessee, dtype: int64

In [215]:
# Note that if index labels are not unique, then .loc selects every matching row.
# Here reindex() with 'Tennessee' listed twice builds a frame with a duplicated row label.

new = data.reindex(['Tennessee', 'Tennessee', 'Minnesota', 'Utah'])
new.loc['Tennessee']


Unnamed: 0,one,two,three,four
Tennessee,0,1,2,3
Tennessee,0,1,2,3


In [217]:
# Read the `is_unique` property (an attribute, not a method call) of an index to check whether it has duplicate labels

print(new.columns.is_unique)
print(new.index.is_unique)

True
False


In [106]:
# Use iloc to select by integer position instead of by label

print(data)
data.iloc[-1, [3, 0, 1]] # selects the last row (Utah), columns "four", "one", and "two" in that order

           one  two  three  four
Tennessee    0    1      2     3
New York     4    5      6     7
Minnesota    8    9     10    11
Utah        12   13     14    15


four    15
one     12
two     13
Name: Utah, dtype: int64

In [110]:
# Both indexers accept slices: a .loc label slice includes its endpoint ('Minnesota' is returned),
# while an .iloc integer slice excludes it (rows 0-1, columns from position 1 onward).

print(data.loc[:"Minnesota", 'two'])
data.iloc[:2,1:]

Tennessee    1
New York     5
Minnesota    9
Name: two, dtype: int64


Unnamed: 0,two,three,four
Tennessee,1,2,3
New York,5,6,7


In [115]:
# Selections can be chained with boolean filtering: take the first three columns,
# then keep the rows where column 'three' is greater than 5

data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
New York,4,5,6
Minnesota,8,9,10
Utah,12,13,14


In [118]:
# .at (by label) and .iat (by integer position) select a single scalar value

print(data.at['Utah', 'one'])
print(data.iat[0,-1])

12
3


#### Arithmetic and Data Alignment

In [121]:
# When adding objects whose indexes differ, the result's index is the union of the two indexes
# (an automatic outer join on the labels). Labels found in only one operand ('d', 'f', 'g') produce NaN.

s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index = ['a', 'c', 'd', 'e'])
s2 = pd.Series([4.1, 21.2, 4, -1.5, 6.2], index = ['a', 'c', 'e', 'f', 'g'])

s1 + s2

a    11.4
c    18.7
d     NaN
e     5.5
f     NaN
g     NaN
dtype: float64

In [132]:
# Adding two DataFrames returns a new frame whose index and columns are the unions of those of the operands.
# Only labels present in both frames get a value ('Texas' here); frames with no labels in common give all NaN.

df1 = pd.DataFrame(np.arange(9).reshape(3,3), 
                   columns = list('bcd'), 
                   index = ['Ohio', 'Texas', 'Alabama'])
df2 = pd.DataFrame(np.arange(12).reshape(4,3), 
                   columns = list('bcd'), 
                   index = ['Colorado', 'Washington', 'Vermont', 'Texas'])

df1 + df2

Unnamed: 0,b,c,d
Alabama,,,
Colorado,,,
Ohio,,,
Texas,12.0,14.0,16.0
Vermont,,,
Washington,,,


In [147]:
# Arithmetic methods accept a fill value for entries that are missing from one of the operands

df1 = pd.DataFrame(np.arange(9).reshape(3,3), 
                   columns = list('bcd'), 
                   index = ['Ohio', 'Texas', 'Alabama'])
df2 = pd.DataFrame(np.arange(12).reshape(4,3), 
                   columns = list('bde'), 
                   index = ['Colorado', 'Washington', 'Vermont', 'Texas'])

print(df1 + df2)

# The fill happens BEFORE the computation: an entry missing from one frame is treated as 0 and then added.
# Entries missing from both frames (e.g. row 'Colorado', column 'c') still come out as NaN.
df1.add(df2, fill_value = 0)

               b   c     d   e
Alabama      NaN NaN   NaN NaN
Colorado     NaN NaN   NaN NaN
Ohio         NaN NaN   NaN NaN
Texas       12.0 NaN  15.0 NaN
Vermont      NaN NaN   NaN NaN
Washington   NaN NaN   NaN NaN


Unnamed: 0,b,c,d,e
Alabama,6.0,7.0,8.0,
Colorado,0.0,,1.0,2.0
Ohio,0.0,1.0,2.0,
Texas,12.0,4.0,15.0,11.0
Vermont,6.0,,7.0,8.0
Washington,3.0,,4.0,5.0


In [148]:
# Relatedly, reindex() accepts a fill_value that is used instead of NaN for newly introduced labels (column 'e' here)

df1.reindex(columns = df2.columns, fill_value = 0)

Unnamed: 0,b,d,e
Ohio,0,2,0
Texas,3,5,0
Alabama,6,8,0


In [156]:
# Arithmetic methods. Each has a counterpart prefixed with 'r' that swaps the operands:
# df1.sub(df2) computes df1 - df2, while df1.rsub(df2) computes df2 - df1.

print(df1.sub(df2, fill_value = 0))
df1.rsub(df2, fill_value = 0)

# Other arithmetic methods: add, sub, mul, div, floordiv, pow

              b    c    d     e
Alabama     6.0  7.0  8.0   NaN
Colorado    0.0  NaN -1.0  -2.0
Ohio        0.0  1.0  2.0   NaN
Texas      -6.0  4.0 -5.0 -11.0
Vermont    -6.0  NaN -7.0  -8.0
Washington -3.0  NaN -4.0  -5.0


Unnamed: 0,b,c,d,e
Alabama,-6.0,-7.0,-8.0,
Colorado,0.0,,1.0,2.0
Ohio,0.0,-1.0,-2.0,
Texas,6.0,-4.0,5.0,11.0
Vermont,6.0,,7.0,8.0
Washington,3.0,,4.0,5.0


In [160]:
### Operations between DataFrame and Series

df1 = pd.DataFrame(np.arange(9).reshape(3,3), 
                   columns = list('bcd'), 
                   index = ['Ohio', 'Texas', 'Alabama'])

# Take the first row ('Ohio') as a Series indexed by the column labels
series = df1.iloc[0]

# The Series index is matched against the frame's columns and the subtraction is repeated for every row
df1 - series

Unnamed: 0,b,c,d
Ohio,0,0,0
Texas,3,3,3
Alabama,6,6,6


In [163]:
# The operator form matches the Series against the columns and broadcasts down the rows.
# To match on the row index instead (broadcasting across the columns), use an arithmetic method with axis = 'index'.

series2 = df1['d']
df1.sub(series2, axis = 'index')

Unnamed: 0,b,c,d
Ohio,-2,-1,0
Texas,-2,-1,0
Alabama,-2,-1,0


#### Function application and mapping

In [169]:
# NumPy ufuncs (element-wise array functions, e.g. np.abs) also work on pandas objects; the labels are kept in the result
# NOTE: np.random is not seeded, so the values differ on every run.

df = pd.DataFrame(np.random.randn(4,3), columns=list('bde'), index=['Utah', 'Gold', 'Sword', 'Juno'])
print(df)

np.abs(df)

              b         d         e
Utah   0.124046  0.315360  0.515655
Gold   0.364159  0.750515  0.189821
Sword -0.658553  0.279463  1.283074
Juno   1.062119  1.317335  0.595583


Unnamed: 0,b,d,e
Utah,0.124046,0.31536,0.515655
Gold,0.364159,0.750515,0.189821
Sword,0.658553,0.279463,1.283074
Juno,1.062119,1.317335,0.595583


In [174]:
# apply() calls a function that takes a 1D array (a Series) once per column or once per row

# Default (axis=0): f receives each COLUMN, giving one result per column (b, d, e)
f = lambda x: x.max() - x.min()
print(df.apply(f))

# axis=1: f receives each ROW, giving one result per row (Utah, Gold, Sword, Juno)
df.apply(f, axis=1)

b    1.720672
d    1.037872
e    1.093253
dtype: float64


Utah     0.391609
Gold     0.560694
Sword    1.941627
Juno     0.721752
dtype: float64

In [177]:
# Series.map() applies a function to every element of a Series

def form(x):
    """Format a number as a string with two decimal places."""
    return '%.2f' % x

df['e'].map(form)

Utah     0.52
Gold     0.19
Sword    1.28
Juno     0.60
Name: e, dtype: object

In [176]:
# applymap() applies the function element-wise to every value in the frame
# NOTE(review): newer pandas releases deprecate DataFrame.applymap in favour of DataFrame.map;
# confirm against the pandas version this notebook targets.

df.applymap(form)

Unnamed: 0,b,d,e
Utah,0.12,0.32,0.52
Gold,0.36,0.75,0.19
Sword,-0.66,0.28,1.28
Juno,1.06,1.32,0.6


#### Sorting and Ranking

In [181]:
# sort_index() sorts lexicographically by the index labels (row index or column index)

# Series
ser = pd.Series(range(4), index=list('cbad'))
print(ser)
ser.sort_index()

c    0
b    1
a    2
d    3
dtype: int64


a    2
b    1
c    0
d    3
dtype: int64

In [186]:
# Works for a DataFrame as well: the default sorts by the row labels, axis = 1 sorts by the column labels

df = pd.DataFrame(np.random.randn(4,3), columns=list('bed'), index=['Utah', 'Gold', 'Sword', 'Juno'])
print(df)
print(df.sort_index())
print(df.sort_index(axis = 1))

              b         e         d
Utah  -1.270145 -0.581668  0.584007
Gold   0.589276  1.513388 -0.050338
Sword  0.332505 -0.257840  1.680649
Juno   0.420302  0.532282  0.464536
              b         e         d
Gold   0.589276  1.513388 -0.050338
Juno   0.420302  0.532282  0.464536
Sword  0.332505 -0.257840  1.680649
Utah  -1.270145 -0.581668  0.584007
              b         d         e
Utah  -1.270145  0.584007 -0.581668
Gold   0.589276 -0.050338  1.513388
Sword  0.332505  1.680649 -0.257840
Juno   0.420302  0.464536  0.532282


In [190]:
# To sort a Series by its values, use sort_values(). Missing values are placed at the end by default,
# for both sort directions (see the two sorted outputs).
# `ascending` sets the direction: without it the smallest value comes first; ascending = False puts the largest first.

obj = pd.Series([4, np.nan, 7, np.nan, -3, 0])
print(obj)
print(obj.sort_values())
print(obj.sort_values(ascending = False))

0    4.0
1    NaN
2    7.0
3    NaN
4   -3.0
5    0.0
dtype: float64
4   -3.0
5    0.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64
2    7.0
0    4.0
5    0.0
4   -3.0
1    NaN
3    NaN
dtype: float64


In [198]:
# For a DataFrame, pass a column name via `by` to sort the rows by the values in that column

df = pd.DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})
print(df)
df.sort_values(by = 'b')

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1


Unnamed: 0,b,a
2,-3,0
3,2,1
0,4,0
1,7,1


In [199]:
# Can also sort by multiple columns, in order of priority

# Sort by column 'a' first, then break ties within equal 'a' values using column 'b'
df.sort_values(by = ['a', 'b'])

Unnamed: 0,b,a
2,-3,0
0,4,0
3,2,1
1,7,1


##### Ranking

In [202]:
# rank() assigns ranks from 1 to n; by default the smallest value gets rank 1.
# Ties share the mean of the ranks they span (the two 7s occupy ranks 6 and 7, so each gets 6.5).

obj = pd.Series([7, 5, 3, -1, 7, 2, 3])
print(obj)
print(obj.rank())

# Descending: the largest value gets rank 1

print(obj.rank(ascending = False))


0    7
1    5
2    3
3   -1
4    7
5    2
6    3
dtype: int64
0    6.5
1    5.0
2    3.5
3    1.0
4    6.5
5    2.0
6    3.5
dtype: float64
0    1.5
1    3.0
2    4.5
3    7.0
4    1.5
5    6.0
6    4.5
dtype: float64


In [208]:
# A DataFrame can compute ranks down each column (axis = 0) or across each row (axis = 'columns')

df = pd.DataFrame({'b':[4,7,-3,2], 'a':[0,1,0,1]})
print(df)

# Rank within each row; method = 'max' gives tied values the highest rank of their group

print(df.rank(axis = 'columns', method = 'max'))

# Rank within each column; method = 'min' gives tied values the lowest rank of their group

print(df.rank(axis = 0, method = 'min'))


   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
     b    a
0  2.0  1.0
1  2.0  1.0
2  1.0  2.0
3  2.0  1.0
     b    a
0  3.0  1.0
1  4.0  3.0
2  1.0  1.0
3  2.0  3.0


### Summarizing and Computing Descriptive Statistics

In [219]:
# Small frame with missing values, used by the descriptive-statistics cells that follow
df = pd.DataFrame(
    {
        'x': [1.4, 7.1, np.nan, 0.75],
        'y': [np.nan, -4.5, np.nan, -1.3],
    },
    index = list('abcd'),
)
print(df)

      x    y
a  1.40  NaN
b  7.10 -4.5
c   NaN  NaN
d  0.75 -1.3


In [222]:
# sum() adds down each column by default; axis=1 adds across each row.
# NaN values are skipped, so the all-NaN row 'c' sums to 0.

print(df.sum())
print(df.sum(axis=1))

x    9.25
y   -5.80
dtype: float64
a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64


In [224]:
# NA values are excluded unless skipna is False, in which case any NaN in a row makes that row's result NaN (row 'a').
# Row 'c' is NaN either way because it has no non-missing values.

print(df.mean(axis=1))
print(df.mean(axis=1, skipna= False))

a    1.400
b    1.300
c      NaN
d   -0.275
dtype: float64
a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64


In [227]:
# describe() gives a quick statistical snapshot of each column: count, mean, std, min, quartiles and max

df.describe()

Unnamed: 0,x,y
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,1.075,-3.7
50%,1.4,-2.9
75%,4.25,-2.1
max,7.1,-1.3


In [243]:
# Load ./ethstats.csv for the pairwise correlation / covariance examples in the following cells.
from datetime import datetime

# Keep the last 100 rows, discard the 'valueWei' column, then drop the rows that contain missing values.
df = (
    pd.read_csv('./ethstats.csv')
      .tail(100)
      .drop(columns='valueWei')
      .dropna()
)

# set_index() returns a new frame that is only displayed here; `df` itself keeps 'Date' as a regular column.
df.set_index('Date')

Unnamed: 0_level_0,valueEth,gasUsed,avgGasPriceWei,avgGasPriceEth,numTxs,ethPrice,stateSizeBytes,stateSizeGB,totalAddresses,pctChange,erc20Txs,contractsVerified
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2020-03-11,587.177134,5.366733e+10,1.288431e+10,1.288431e-08,713065.0,194.86,4.160288e+12,4160.287848,89660329.0,0.001133,457920.0,61.0
2020-03-12,5112.962877,6.027016e+10,7.790324e+10,7.790324e-08,760781.0,109.78,4.165465e+12,4165.464881,89727271.0,0.000747,450402.0,19.0
2020-03-13,4904.498970,5.923705e+10,8.476182e+10,8.476182e-08,847947.0,135.44,4.169430e+12,4169.430233,89832970.0,0.001178,488233.0,44.0
2020-03-14,585.942871,5.420911e+10,1.666265e+10,1.666265e-08,681830.0,122.58,4.176079e+12,4176.079362,90050382.0,0.002420,452146.0,61.0
2020-03-15,523.336292,5.423278e+10,1.436498e+10,1.436498e-08,657653.0,123.53,4.182524e+12,4182.523606,90267393.0,0.002410,407092.0,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2020-06-13,1591.207928,6.104725e+10,2.937791e+10,2.937791e-08,833267.0,238.21,4.763512e+12,4763.512361,101100826.0,0.001255,471643.0,64.0
2020-06-14,1340.126565,6.176029e+10,2.423072e+10,2.423072e-08,785017.0,231.61,4.768182e+12,4768.182467,101212331.0,0.001103,463063.0,31.0
2020-06-15,2532.148961,6.106965e+10,4.325099e+10,4.325099e-08,912281.0,230.98,4.774629e+12,4774.629453,101324048.0,0.001104,514653.0,79.0
2020-06-16,2306.791403,6.094106e+10,4.053314e+10,4.053314e-08,904928.0,235.35,4.783004e+12,4783.003540,101433750.0,0.001083,514563.0,96.0


In [255]:
# Series.corr() computes the correlation between two Series
# NOTE(review): the recorded result (0.0860...) equals the erc20Txs entry of the corrwith() output
# further down, not the gasUsed entry (0.790549); the cell was probably edited after it was last
# run, so re-run it to refresh the output.

df['gasUsed'].corr(df['ethPrice'])

0.08606276666826727

In [258]:
# Covariance works the same way: Series.cov() takes the other Series as its argument

df['erc20Txs'].cov(df['ethPrice'])

105585968198.54456

In [259]:
# corrwith() computes the correlation of each column of the frame with the given Series
# (ethPrice correlated with itself is 1.0)

df.corrwith(df.ethPrice)

valueEth             0.338146
gasUsed              0.790549
avgGasPriceWei       0.242400
avgGasPriceEth       0.242400
numTxs               0.746230
ethPrice             1.000000
stateSizeBytes       0.931633
stateSizeGB          0.931633
totalAddresses       0.933201
pctChange           -0.277589
erc20Txs             0.086063
contractsVerified    0.152045
dtype: float64