In [2]:
import pandas as pd

In [3]:
#dataframe methods
#apply

dataframe = pd.DataFrame(data = {'age': [5,10,15], 'weight': [60,95,100]}, columns = ["age","weight"])

In [4]:
dataframe

Unnamed: 0,age,weight
0,5,60
1,10,95
2,15,100


## Basic Mapping 

In [43]:
## apply function is a dataframe method. It applies a function to each column
f = lambda x: x.max()
dataframe.apply(f)

0     5
1    10
2    15
dtype: int64

In [6]:
## apply map is an element wise function, it applys a function to each element in the dataframe
t = lambda x: x - 5
dataframe.applymap(t)

Unnamed: 0,age,weight
0,0,55
1,5,90
2,10,95


## Sorting

In [7]:
#here I am reindexing
dataframe_reindexed = dataframe.reindex([2,1,0])

In [8]:
dataframe_reindexed

Unnamed: 0,age,weight
2,15,100
1,10,95
0,5,60


In [9]:
# this method will sort by index
#by default sort index is ascending
dataframe_resorted = dataframe_reindexed.sort_index()

In [10]:
dataframe_resorted

Unnamed: 0,age,weight
0,5,60
1,10,95
2,15,100


In [11]:
#you can use sort by columns, it accepts multiple columns
#NaN are sorted to the end by default
dataframe_resorted.sort_values(by = ['age'], ascending = False)

Unnamed: 0,age,weight
2,15,100
1,10,95
0,5,60


## Descriptive statistics

In [12]:
## these methods extract a single value from a row or column
## excludes missing data

In [13]:
# can apply to the entire dataframe of the 
dataframe_resorted.sum()

age        30
weight    255
dtype: int64

In [28]:
#axis parameter lets you sum by column (0) or row (1)
dataframe_resorted.sum(axis = 1)

0     65
1    105
2    115
dtype: int64

In [14]:
# this summary stat is pretty much always good to do for each dataframe
dataframe_resorted.describe()

Unnamed: 0,age,weight
count,3.0,3.0
mean,10.0,85.0
std,5.0,21.794495
min,5.0,60.0
25%,7.5,77.5
50%,10.0,95.0
75%,12.5,97.5
max,15.0,100.0


## Correlation VS Covariance
* Covariance is the sum of the product of each of the distances from its mean. Intuition here being, when you have a positive covariance each sample, X tends to be above the mean when Y is also above the mean.
* Good tutorial: https://www.youtube.com/watch?v=ualmyZiPs9w
* Correlation is just normalized covariance. Correlation will be between -1 an 1.


In [15]:
dataframe_resorted['age'].cov(dataframe_resorted['weight'])

100.0

In [16]:
dataframe_resorted['age'].unique()

array([ 5, 10, 15])

In [17]:
dataframe_resorted['age'].isnull()

0    False
1    False
2    False
Name: age, dtype: bool

In [18]:
#you can use this method as a condition for filtering 
dataframe_resorted[dataframe_resorted['age'].isnull()]

Unnamed: 0,age,weight


In [22]:
#uniqueness
dataframe_resorted['age'].unique

<bound method Series.unique of 0     5
1    10
2    15
Name: age, dtype: int64>

In [30]:
#really easy way to get a list of counts by unique value, similarly there is another method called groupby we will go over later.
#parameter sort lets you sort the results in desc order
dataframe_resorted['age'].value_counts(sort=True)

15    1
10    1
5     1
Name: age, dtype: int64

# Filtering

In [31]:
# this is similar to filtering conditions mentioned previously, but this is great for cleaner data and you can pass the mask into mutiple objs.
mask = dataframe_resorted['age'].isin([15])
dataframe_resorted[mask]

Unnamed: 0,age,weight
2,15,100


In [39]:
#a great way to combine mapping and filtering
# evaluates true or false on each element in a specific column, then returns the entire rows where evaluated to be true
criterion = lambda x: x['age'] < 15
not_in = dataframe_resorted[dataframe_resorted.apply(criterion, axis=1)]
not_in

Unnamed: 0,age,weight
0,5,60
1,10,95


In [40]:
# what the inner apply returns
dataframe_resorted.apply(criterion, axis=1)

0     True
1     True
2    False
dtype: bool