# Chapter 6

# pandas in Depth: Data Manipulation
- in the previous chapter we saw how to acquire data from files and databases; now we need to manipulate that data
    - 3 phases of data manipulation:
        1. data prep
        2. data transformation
        3. data aggregation
        
## Data Preparation
- Different procedures for data prep:
    1. Loading <--Covered by previous chapter (phew)
    2. Assembling
    3. Merging
    4. Concatenating
    5. Combining
    6. Reshaping (pivoting)
    7. Removing

## Merging

In [1]:
import numpy as np
import pandas as pd
frame1 = pd.DataFrame({'id':['ball','pencil','pen','mug','ashtray'],
                      'price':[12.33,11.44,33.21,13.23,33.62]})
frame1

Unnamed: 0,id,price
0,ball,12.33
1,pencil,11.44
2,pen,33.21
3,mug,13.23
4,ashtray,33.62


In [2]:
frame2 = pd.DataFrame({'id':['pencil','pencil','ball','pen'],
                      'color':['white','red','red','black']})
frame2

Unnamed: 0,id,color
0,pencil,white
1,pencil,red
2,ball,red
3,pen,black


In [3]:
pd.merge(frame1,frame2) #returns df with all rows that have ID in common

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [4]:
#can also define explicitly criteria for merging
pd.merge(frame1,frame2,on='id')

Unnamed: 0,id,price,color
0,ball,12.33,red
1,pencil,11.44,white
2,pencil,11.44,red
3,pen,33.21,black


In [5]:
frame1 = pd.DataFrame({'id':['ball','pencil','pen','mug','ashtray'],
                      'color':['white','red','red','black','green'],
                      'brand':['OMG','ABC','ABC','POD','POD']})
frame1

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD


In [6]:
frame2 = pd.DataFrame({'id':['pencil','pencil','ball','pen'],
                      'brand':['OMG','POD','ABC','POD']})
frame2

Unnamed: 0,id,brand
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [7]:
pd.merge(frame1,frame2) #no results :( by default merge joins on ALL cols the 2 dfs share (id and brand) and no row matches on both

Unnamed: 0,id,color,brand


In [8]:
pd.merge(frame1,frame2,on='id') #specify a key

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD


In [9]:
pd.merge(frame1,frame2,on='brand')

Unnamed: 0,id_x,color,brand,id_y
0,ball,white,OMG,pencil
1,pencil,red,ABC,ball
2,pen,red,ABC,ball
3,mug,black,POD,pencil
4,mug,black,POD,pen
5,ashtray,green,POD,pencil
6,ashtray,green,POD,pen


In [10]:
#results vary considerably depending on the criteria of merging. Often the opposite prob
#arises: the key cols of the 2 dfs do not have the same name. to remedy this
# we have left_on right_on
frame2.columns = ['sid','brand']
frame2

Unnamed: 0,sid,brand
0,pencil,OMG
1,pencil,POD
2,ball,ABC
3,pen,POD


In [11]:
pd.merge(frame1,frame2, left_on='id',right_on='sid')

Unnamed: 0,id,color,brand_x,sid,brand_y
0,ball,white,OMG,ball,ABC
1,pencil,red,ABC,pencil,OMG
2,pencil,red,ABC,pencil,POD
3,pen,red,ABC,pen,POD


In [12]:
#by default, merge() performs an inner join (a result of intersection)
#can also perform a union or outer join using option how='outer'
frame2.columns =['id','brand']
pd.merge(frame1,frame2,on='id')


Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD


In [13]:
pd.merge(frame1,frame2,on='id',how='outer')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD
4,mug,black,POD,
5,ashtray,green,POD,


In [14]:
pd.merge(frame1,frame2,on='id',how='outer')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD
4,mug,black,POD,
5,ashtray,green,POD,


In [15]:
pd.merge(frame1,frame2,on='id',how='right')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD


In [16]:
pd.merge(frame1,frame2,on='id',how='left')

Unnamed: 0,id,color,brand_x,brand_y
0,ball,white,OMG,ABC
1,pencil,red,ABC,OMG
2,pencil,red,ABC,POD
3,pen,red,ABC,POD
4,mug,black,POD,
5,ashtray,green,POD,


In [17]:
pd.merge(frame1,frame2,on=['id','brand'],how='outer') #can also merge on multiple keys

Unnamed: 0,id,color,brand
0,ball,white,OMG
1,pencil,red,ABC
2,pen,red,ABC
3,mug,black,POD
4,ashtray,green,POD
5,pencil,,OMG
6,pencil,,POD
7,ball,,ABC
8,pen,,POD


# Merging on an Index

In [18]:
pd.merge(frame1,frame2,right_index=True,left_index=True) #can use index as merging keys

Unnamed: 0,id_x,color,brand_x,id_y,brand_y
0,ball,white,OMG,pencil,OMG
1,pencil,red,ABC,pencil,POD
2,pen,red,ABC,ball,ABC
3,mug,black,POD,pen,POD


In [19]:
#gotta use right/left_index option to decide which, or both
#df objects also have join function, combines many df objects with no col overlap
#frame1.join(frame2)
# get error code :
#columns overlap but no suffix specified: Index(['id', 'brand'], dtype='object')
#cause cols have the same name

In [20]:
#we can rename the cols in frame2 to fix the error
#NOTE(review): frame2's cols are ['id','brand'] at this point, so this assignment labels
#the id values 'brand2' and the brand values 'id2' (visible in the output below);
#['id2','brand2'] is probably what was intended - confirm before relying on the labels
frame2.columns = ['brand2','id2']
frame1.join(frame2)

Unnamed: 0,id,color,brand,brand2,id2
0,ball,white,OMG,pencil,OMG
1,pencil,red,ABC,pencil,POD
2,pen,red,ABC,ball,ABC
3,mug,black,POD,pen,POD
4,ashtray,green,POD,,


## Concatenating

In [21]:
#numpy provides concatenate() function to do this with arrays
array1 = np.arange(9).reshape((3,3))
array1

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [22]:
array2 = np.arange(9).reshape((3,3))+6
array2

array([[ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [23]:
np.concatenate([array1,array2],axis=1)

array([[ 0,  1,  2,  6,  7,  8],
       [ 3,  4,  5,  9, 10, 11],
       [ 6,  7,  8, 12, 13, 14]])

In [24]:
np.concatenate([array1,array2],axis=0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [25]:
#with pandas we have series and dfs so it also has its own native function concat()
ser1 = pd.Series(np.random.rand(4),index=[1,2,3,4])
ser1

1    0.857854
2    0.741259
3    0.038077
4    0.157025
dtype: float64

In [26]:
ser2 = pd.Series(np.random.rand(4),index=[5,6,7,8])
ser2

5    0.815134
6    0.175583
7    0.529707
8    0.298635
dtype: float64

In [27]:
pd.concat([ser1,ser2]) #by default this is axis=0

1    0.857854
2    0.741259
3    0.038077
4    0.157025
5    0.815134
6    0.175583
7    0.529707
8    0.298635
dtype: float64

In [28]:
pd.concat([ser1,ser2],axis=1)
#the problem is that this kind of operation doesn't show the concatenated parts

Unnamed: 0,0,1
1,0.857854,
2,0.741259,
3,0.038077,
4,0.157025,
5,,0.815134
6,,0.175583
7,,0.529707
8,,0.298635


In [29]:
#we can fix this using a hierarchical index on the axis of concatenation
pd.concat([ser1,ser2],keys=[1,2]) #gotta use the keys= option

1  1    0.857854
   2    0.741259
   3    0.038077
   4    0.157025
2  5    0.815134
   6    0.175583
   7    0.529707
   8    0.298635
dtype: float64

In [30]:
pd.concat([ser1,ser2], axis=1,keys=[1,2]) #keys become the headers

Unnamed: 0,1,2
1,0.857854,
2,0.741259,
3,0.038077,
4,0.157025,
5,,0.815134
6,,0.175583
7,,0.529707
8,,0.298635


In [31]:
#the same can be done for a df
frame1 = pd.DataFrame(np.random.rand(9).reshape(3,3),index=[1,2,3], columns=['A','B','C'])
frame1

Unnamed: 0,A,B,C
1,0.648273,0.041567,0.198767
2,0.383792,0.760389,0.397656
3,0.060801,0.140583,0.214622


In [32]:
frame2 = pd.DataFrame(np.random.rand(9).reshape(3,3),index=[4,5,6], columns=['A','B','C'])
frame2

Unnamed: 0,A,B,C
4,0.914895,0.011531,0.46504
5,0.49593,0.129858,0.329917
6,0.859721,0.486336,0.714107


In [33]:
pd.concat([frame1,frame2])

Unnamed: 0,A,B,C
1,0.648273,0.041567,0.198767
2,0.383792,0.760389,0.397656
3,0.060801,0.140583,0.214622
4,0.914895,0.011531,0.46504
5,0.49593,0.129858,0.329917
6,0.859721,0.486336,0.714107


In [34]:
pd.concat([frame1,frame2],axis=1)

Unnamed: 0,A,B,C,A.1,B.1,C.1
1,0.648273,0.041567,0.198767,,,
2,0.383792,0.760389,0.397656,,,
3,0.060801,0.140583,0.214622,,,
4,,,,0.914895,0.011531,0.46504
5,,,,0.49593,0.129858,0.329917
6,,,,0.859721,0.486336,0.714107


## Combining

In [35]:
ser1 = pd.Series(np.random.rand(5), index=[1,2,3,4,5])
ser2 = pd.Series(np.random.rand(4),index = [2,4,5,6])

In [36]:
ser1.combine_first(ser2) #does the operation with data alignment

1    0.348634
2    0.210657
3    0.185672
4    0.398143
5    0.941877
6    0.558514
dtype: float64

In [37]:
ser2.combine_first(ser1)

1    0.348634
2    0.886201
3    0.185672
4    0.926231
5    0.064702
6    0.558514
dtype: float64

In [38]:
#can even do partial overlaps
ser1[:3].combine_first(ser2[:3])

1    0.348634
2    0.210657
3    0.185672
4    0.926231
5    0.064702
dtype: float64

# Pivoting
 ## Pivoting with Hierarchical indexing
 - stacking: rotates or pivots the df converting cols to rows
 - unstacking: converts rows to cols

In [39]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                     index = ['white','black','orange'],
                     columns = ['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
orange,6,7,8


In [40]:
ser5 = frame1.stack()
ser5 #pivoting of cols in rows, thus producing a series

white   ball      0
        pen       1
        pencil    2
black   ball      3
        pen       4
        pencil    5
orange  ball      6
        pen       7
        pencil    8
dtype: int64

In [41]:
ser5.unstack() #reassemble df into pivoted table by use of unstack

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
orange,6,7,8


In [42]:
#can also do unstack on a diff lvl, specify the number of lvls or name
ser5.unstack(0)

Unnamed: 0,white,black,orange
ball,0,3,6
pen,1,4,7
pencil,2,5,8


# Pivoting from "Long" to "Wide" Format
 - the most common way of storing datasets is in character-separated formats such as CSV; this happens especially with instrument readings, calculation results iterated over time, or the simple manual input of a series of values
 - a peculiar characteristic of this type of dataset is that entries in various cols are often duplicated in subsequent lines; since it always remains in tabular format you can call it __long__ or __stacked__ format

In [43]:
longframe = pd.DataFrame({'color':['white','white','white','red'\
                                   ,'red','red','black','black','black'],
                          'item':['ball','pen','mug','ball','pen',\
                                'mug','ball'\
                                ,'pen','mug'],
                         'value':np.random.rand(9)})
longframe

Unnamed: 0,color,item,value
0,white,ball,0.38676
1,white,pen,0.261374
2,white,mug,0.385627
3,red,ball,0.977787
4,red,pen,0.718571
5,red,mug,0.505554
6,black,ball,0.381807
7,black,pen,0.26309
8,black,mug,0.899456


In [44]:
#this method of data recording has some disadvantages like duplication and poor readability
#can convert to wide format by choosing a column (or set of cols) as primary key; the
#values contained in it must then be unique
#index/columns are passed by keyword: pandas 2.0 made pivot()'s arguments keyword-only
wideframe = longframe.pivot(index='color', columns='item')
wideframe

Unnamed: 0_level_0,value,value,value
item,ball,mug,pen
color,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
black,0.381807,0.899456,0.26309
red,0.977787,0.505554,0.718571
white,0.38676,0.385627,0.261374


## Removing

In [45]:
frame1 = pd.DataFrame(np.arange(9).reshape(3,3),
                     index = ['white','black','red'],
                     columns = ['ball','pen','pencil'])
frame1

Unnamed: 0,ball,pen,pencil
white,0,1,2
black,3,4,5
red,6,7,8


In [46]:
del frame1['ball']
frame1 #removes unwanted column

Unnamed: 0,pen,pencil
white,1,2
black,4,5
red,7,8


In [47]:
frame1.drop('white') #can also use this but must redefine to keep changes

Unnamed: 0,pen,pencil
black,4,5
red,7,8


# Data Transformation
- prepare for the second stage of data manipulation the _data transformation_, it's important to learn how to transform the values
    - next examples involve duplicate or invalid values w/ possible removal or replacement
    - also will have handling and processing numerical values of the data and strings

## Removing Duplicates

In [48]:
dframe = pd.DataFrame({'color':['white','white','red','red','white'],
                      'value':[2,1,3,3,2]})
dframe

Unnamed: 0,color,value
0,white,2
1,white,1
2,red,3
3,red,3
4,white,2


In [49]:
dframe.duplicated() #returns booleans

0    False
1    False
2    False
3     True
4     True
dtype: bool

In [50]:
dframe[dframe.duplicated()] #can select within using row indexing

Unnamed: 0,color,value
3,red,3
4,white,2


## Mapping
- mapping is the creation of a list of matches between 2 different values, w/ the ability to bind a value to a particular label or string

In [51]:
map = {
    'label1':'value1',
    'label2':'value2',
}
map

{'label1': 'value1', 'label2': 'value2'}

In [52]:
#remember to restore a builtin if you accidentally shadow it, as we did above.
#go through the builtins module: __builtins__ is a CPython implementation detail that is
#a plain dict (no .map attribute) in every module other than __main__
import builtins
map = builtins.map
map

map

In [53]:
#or you can delete the global:
map = {
    'label1':'value1',
    'label2':'value2',
}
map

{'label1': 'value1', 'label2': 'value2'}

In [54]:
#as so:
del map

In [55]:
map

map

In [56]:
frame = pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],
                     'color':['white','rosso','verde','black','yellow'],
                     'price':[5.60,4,20,1,30]})
frame

Unnamed: 0,item,color,price
0,ball,white,5.6
1,mug,rosso,4.0
2,pen,verde,20.0
3,pencil,black,1.0
4,ashtray,yellow,30.0


In [57]:
#can replace incorrect values with new values; it is necessary to define a mapping
#whose keys are the old values and whose values are the replacements
newcolors = {
    'rosso':'red',
    'verde':'green'
}

In [58]:
frame.replace(newcolors)

Unnamed: 0,item,color,price
0,ball,white,5.6
1,mug,red,4.0
2,pen,green,20.0
3,pencil,black,1.0
4,ashtray,yellow,30.0


In [59]:
ser = pd.Series([1,3,np.nan,4,6,np.nan,3])
ser

0    1.0
1    3.0
2    NaN
3    4.0
4    6.0
5    NaN
6    3.0
dtype: float64

In [60]:
ser.replace(np.nan,0) #can also do a manual edit

0    1.0
1    3.0
2    0.0
3    4.0
4    6.0
5    0.0
6    3.0
dtype: float64

## Adding Values via Mapping

In [61]:
frame = pd.DataFrame({'item':['ball','mug','pen','pencil','ashtray'],
                     'color':['white','red','green','black','yellow']})
frame

Unnamed: 0,item,color
0,ball,white
1,mug,red
2,pen,green
3,pencil,black
4,ashtray,yellow


In [62]:
# suppose we want to add a col of prices, must predefine the dict to pass
prices = {
    'ball': 5.56,
    'mug': 4.20,
    'bottle':1.30,
    'scissors':3.41,
    'pen':1.20,
    'pencil':0.95,
    'ashtray':2.75
}

In [63]:
#map() will accept a function or a dict containing the mapping; here we apply the
#mapping of prices to the item col and store the result as a new price col in the df
frame['price'] = frame['item'].map(prices)
frame
#can't map to items in prices that don't exist

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.2
3,pencil,black,0.95
4,ashtray,yellow,2.75


## Rename the Indexes of the Axes
- done the same way as mapping values: pass a dict of old label -> new label

In [64]:
frame

Unnamed: 0,item,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.2
3,pencil,black,0.95
4,ashtray,yellow,2.75


In [65]:
reindex = {
    0:'first',
    1:'second',
    2:'third',
    3:'fourth',
    4:'fifth'
}

In [66]:
frame.rename(reindex)

Unnamed: 0,item,color,price
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.2
fourth,pencil,black,0.95
fifth,ashtray,yellow,2.75


In [67]:
#renames the indexes, if you wanna rename cols use columns option
#can do the same style of mapping
recolumn = {
    'item':'object',
    'price':'value'
}

In [68]:
frame.rename(index=reindex,columns=recolumn)

Unnamed: 0,object,color,value
first,ball,white,5.56
second,mug,red,4.2
third,pen,green,1.2
fourth,pencil,black,0.95
fifth,ashtray,yellow,2.75


In [69]:
frame.rename(index={1:'first'}, columns = {'item':'object'}) #single value replaced

Unnamed: 0,object,color,price
0,ball,white,5.56
first,mug,red,4.2
2,pen,green,1.2
3,pencil,black,0.95
4,ashtray,yellow,2.75


In [70]:
frame.rename(columns={'item':'object'}, inplace = True)
frame

Unnamed: 0,object,color,price
0,ball,white,5.56
1,mug,red,4.2
2,pen,green,1.2
3,pencil,black,0.95
4,ashtray,yellow,2.75


## Discretization and Binning
- can be used generally to handle large quantities of data generated in sequence. We must transform the data in order to be able to do this type of analysis
- essentially finding a different way to cut up the entire data set with 'bins'

In [71]:
results = [12,34,67,55,28,90,99,12,3,56,74,44,87,23,49,89,87]

In [72]:
#we can see the range is inbetween 1-100
bins = [0,25,50,75,100]

In [73]:
#then we can use cut() and apply it to the array of results also passing the bins
cat = pd.cut(results, bins)
cat

[(0, 25], (25, 50], (50, 75], (50, 75], (25, 50], ..., (75, 100], (0, 25], (25, 50], (75, 100], (75, 100]]
Length: 17
Categories (4, interval[int64]): [(0, 25] < (25, 50] < (50, 75] < (75, 100]]

In [74]:
#this is CATEGORICAL data: it holds a categories index naming each bin, plus (internally)
#an array of integer codes, one per element of results, saying which bin that element
#falls in
cat.categories

IntervalIndex([(0, 25], (25, 50], (50, 75], (75, 100]]
              closed='right',
              dtype='interval[int64]')

In [75]:
cat.codes 

array([0, 1, 2, 2, 1, 3, 3, 0, 0, 2, 2, 1, 3, 0, 1, 3, 3], dtype=int8)

In [76]:
#counts within the bins how many elements are inside
#(top-level pd.value_counts() is deprecated since pandas 2.1, so go through a Series)
pd.Series(cat).value_counts()
#also notation is mathematically consistent

(75, 100]    5
(50, 75]     4
(25, 50]     4
(0, 25]      4
dtype: int64

In [77]:
bin_names = ['unlikely','less likely','likely','highly likely']
pd.cut(results, bins, labels = bin_names)

[unlikely, less likely, likely, likely, less likely, ..., highly likely, unlikely, less likely, highly likely, highly likely]
Length: 17
Categories (4, object): [unlikely < less likely < likely < highly likely]

In [78]:
# # can also pass an integer into cut()
pd.cut(results,5)

[(2.904, 22.2], (22.2, 41.4], (60.6, 79.8], (41.4, 60.6], (22.2, 41.4], ..., (79.8, 99.0], (22.2, 41.4], (41.4, 60.6], (79.8, 99.0], (79.8, 99.0]]
Length: 17
Categories (5, interval[float64]): [(2.904, 22.2] < (22.2, 41.4] < (41.4, 60.6] < (60.6, 79.8] < (79.8, 99.0]]

In [79]:
# also offers qcut() which divides into quintiles
#depending on the distribution of the data we may have uneven numbers of elements
#for each bin
quints = pd.qcut(results,5)
quints

[(2.999, 24.0], (24.0, 46.0], (62.6, 87.0], (46.0, 62.6], (24.0, 46.0], ..., (62.6, 87.0], (2.999, 24.0], (46.0, 62.6], (87.0, 99.0], (62.6, 87.0]]
Length: 17
Categories (5, interval[float64]): [(2.999, 24.0] < (24.0, 46.0] < (46.0, 62.6] < (62.6, 87.0] < (87.0, 99.0]]

In [80]:
#top-level pd.value_counts() is deprecated since pandas 2.1, so go through a Series
pd.Series(quints).value_counts()

(62.6, 87.0]     4
(2.999, 24.0]    4
(87.0, 99.0]     3
(46.0, 62.6]     3
(24.0, 46.0]     3
dtype: int64

## Detecting and Filtering Outliers

In [81]:
randframe = pd.DataFrame(np.random.randn(1000,3))
randframe

Unnamed: 0,0,1,2
0,-0.517140,0.481105,-1.268702
1,1.637805,0.158059,0.093738
2,0.791594,-0.543409,1.063474
3,0.621229,0.599872,-0.448709
4,0.740268,0.399816,-1.144678
5,-1.099362,0.173093,-0.374620
6,1.794700,2.523520,0.089219
7,-1.725432,-0.065830,1.283164
8,-0.057344,-0.427275,0.592105
9,-0.838041,0.757798,0.986780


In [82]:
randframe.describe() #shows stats like R

Unnamed: 0,0,1,2
count,1000.0,1000.0,1000.0
mean,-0.003195,-0.021156,0.012865
std,0.999244,0.981258,0.972583
min,-2.667525,-3.13403,-2.809642
25%,-0.656319,-0.631038,-0.668819
50%,-0.003267,-0.005868,0.028714
75%,0.686394,0.674724,0.678708
max,3.085942,2.94421,3.249786


In [83]:
randframe.std() #may have to consider outliers that are 3 times the size of the stand dev

0    0.999244
1    0.981258
2    0.972583
dtype: float64

In [84]:
#now we can filter using some clever coding
#axis must be passed by keyword: pandas 2.0 made any()'s arguments keyword-only
randframe[(np.abs(randframe) > (3*randframe.std())).any(axis=1)]
#any(axis=1) keeps a row when at least one of its columns exceeds the threshold

Unnamed: 0,0,1,2
188,3.085942,0.205779,-1.698249
420,0.85354,1.186888,3.249786
468,-1.297848,-3.13403,0.011613
618,-1.280407,1.171836,3.015949
646,0.885498,2.94421,0.842937


## Permutation
- permutation (random reordering) of a series or of the rows of a df is easy to do using the numpy.random.permutation function

In [85]:
nframe = pd.DataFrame(np.arange(25).reshape(5,5))
nframe

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
1,5,6,7,8,9
2,10,11,12,13,14
3,15,16,17,18,19
4,20,21,22,23,24


In [86]:
new_order = np.random.permutation(5) #now create an array from 0 to 4 arranged in random
                                     #with permuation function, this will be the new order
new_order                            #in which to set the values in the df

array([3, 0, 4, 2, 1])

In [87]:
#now we can apply it to all lines using the take() function
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
0,0,1,2,3,4
4,20,21,22,23,24
2,10,11,12,13,14
1,5,6,7,8,9


In [88]:
#can even submit a portion
new_order = [3,4,2]
nframe.take(new_order)

Unnamed: 0,0,1,2,3,4
3,15,16,17,18,19
4,20,21,22,23,24
2,10,11,12,13,14


## Random Sampling

In [89]:
sample = np.random.randint(0, len(nframe), size=3)
sample

array([0, 2, 2])

In [90]:
nframe.take(sample) #can get the same sample even more often so be careful

Unnamed: 0,0,1,2,3,4
0,0,1,2,3,4
2,10,11,12,13,14
2,10,11,12,13,14


# String Manipulation

## Built-in Methods for String Manipulation
- in many cases you have composite strings in which you would like to separate the various parts and then assign them to the correct variable

In [91]:
text = '16 Bolton Ave, Boston'

In [92]:
text.split(',') #allows to separate by indicated character

['16 Bolton Ave', ' Boston']

In [93]:
#we have a string and a space character, so we can get rid of it as so:
tokens = [s.strip() for s in text.split(',')]
tokens

['16 Bolton Ave', 'Boston']

In [94]:
#this becomes an array of strings, can also do this method to make 2 assignments:
address ,city = [s.strip() for s in text.split(',')]
address

'16 Bolton Ave'

In [95]:
city

'Boston'

In [96]:
# sometimes you need to concatenate various strings between them to form a more 
#extended text, easiest way is to use + operator
address +','+ city

'16 Bolton Ave,Boston'

In [97]:
# can be useful when you have a small number of strings to concat. if you want multiple
#use join() function assigned to a separator character
strings = ['A+','A','A-','B','BB','BBB','C+']

In [98]:
';'.join(strings)

'A+;A;A-;B;BB;BBB;C+'

In [99]:
#sometimes you also need to search for a substring, so....
'Boston' in text

True

In [100]:
#also can use:
text.index('Boston')

15

In [101]:
text.find('Boston')

15

In [102]:
#in both cases it returns a number corresponding characters in the text where you found
#the substring
#text.index('New York') #returns an error if cannot be found

In [103]:
text.find('New York') #same as not finding it

-1

In [104]:
text.count('e')

1

In [105]:
text.count('Ave')

1

In [106]:
#can also replace
text.replace('Ave', 'Street')

'16 Bolton Street, Boston'

In [107]:
text.replace('St','Avenue')

'16 Bolton Ave, Boston'

In [108]:
#see? cool right?
text.replace('1',"") #can also replace with an empty string, i.e. delete the match

'6 Bolton Ave, Boston'

# Regular Expressions
- regular expressions provide a flexible way to search and match string patterns within text.
- a single expression, usually called a __regex__, is a string formed according to the regular expression language; Python's built-in module _re_ is responsible for the operation of regexes
    - the _re_ module provides a set of functions that can be divided into 3 categories:
        1. Pattern Matching
        2. Substitution
        3. Splitting

In [109]:
import re
# regex for whitespace is \s+
text = "This is    an\t odd \n text!"
text

'This is    an\t odd \n text!'

In [110]:
print(text) #<- escape sequences such as \t and \n are rendered when printed (this is not regex)

This is    an	 odd 
 text!


In [111]:
#split() accepts a regex pattern as the separation criterion; the pattern is a raw string
#so that \s reaches the regex engine instead of being parsed as a string escape
re.split(r'\s+',text)

['This', 'is', 'an', 'odd', 'text!']

- __let's take a closer look behind the scenes__:
     - when you call re.split() the regex is first compiled, then the split() function is called on the text argument
     - you can compile the regex function with re.compile(), thus retaining a reusable object regex and so gaining in terms of CPU cycles
         - this is especially true in operations of iterative search of a substring in a set or an array of strings

In [112]:
#compile once and reuse; raw string so that \s is not treated as a (invalid) string escape
regex = re.compile(r'\s+')

In [113]:
#if you make a regex with the compile function you can apply split() directly to it as so:
regex.split(text)

['This', 'is', 'an', 'odd', 'text!']

In [114]:
#to find every substring in the text that matches a regex pattern you can use the
#findall() function. returns a list of all substrings in a text that meet the requirements
#of the regex for ex:
text = 'This is my address: 15 Bolton Avenue, Boston'

In [115]:
re.findall(r'[Aa]\w+', text)
#here we wanted all the substrings starting with "A" or "a"; inside [...] a comma is a
#literal character, so the class is written [Aa] (the old [A,a] also matched ",")

['address', 'Avenue']

In [116]:
#search() returns a match object for the first hit only (or None); the object exposes
#the start and end positions of the hit within the string
#raw string + [Aa]: avoids the invalid \w string escape and the stray comma in the class
search = re.search(r'[Aa]\w+',text)
search #a match object, not the matched string itself

<_sre.SRE_Match object; span=(11, 18), match='address'>

In [117]:
search.start() #doesn't contain the value of the substring

11

In [118]:
search.end() #but gives start and end of string

18

In [119]:
text[search.start():search.end()]

'address'

In [120]:
#match only matches at the beginning of a string
# won't return results if no initial match
#raw string + [Aa]: avoids the invalid \w string escape and the stray comma in the class
re.match(r'[Aa]\w+',text)

In [121]:
# if match does have a response it's similar to the search object
#raw string so that \w is not treated as a (invalid) string escape
re.match(r'T\w+',text)

<_sre.SRE_Match object; span=(0, 4), match='This'>

In [122]:
#raw string so that \w is not treated as a (invalid) string escape
match = re.match(r'T\w+',text)

In [123]:
text[match.start():match.end()]

'This'

## Data Aggregation
 - last stage of data manipulation is data aggregation, which involves a transformation that produces a single value from an array....examples include: sum(), count(), mean()
    - these operations do a calculation on the data, but a better way is to also include the categorization of a set
        -this can be carried out via grouping (groupby)
        - usually the 2 phases of grouping and application of a function are done in the same step

## GroupBy
- refers to a _split-apply-combine_ process:
    1. _splitting_: division into groups of datasets
    2. _applying_: application of a function on each group
    3. _combining_: combination of all the results obtained by different groups
        -splitting usually on some criteria, whereas in the jargon of SQL is passed as keys
        - applying usually a calculation of some sort
        -combining is self explanatory
### A Practical Example:

In [124]:
frame = pd.DataFrame({'color':['white','red','green','red','green'],
                     'object':['pen','pencil','pencil','ashtray','pen'],
                     'price1':[5.55,4.20,1.30,0.56,2.27],
                     'price2':[4.75,4.12,1.60,0.75,3.25]})
frame

Unnamed: 0,color,object,price1,price2
0,white,pen,5.55,4.75
1,red,pencil,4.2,4.12
2,green,pencil,1.3,1.6
3,red,ashtray,0.56,0.75
4,green,pen,2.27,3.25


In [125]:
#suppose we want to find the avg of the price1 col using group labels listed in the
#color col. we have several methods to do this:
group = frame['price1'].groupby(frame['color'])
group

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x112b074a8>

In [126]:
group.groups #the attribute group in groupby object 

{'green': Int64Index([2, 4], dtype='int64'),
 'red': Int64Index([1, 3], dtype='int64'),
 'white': Int64Index([0], dtype='int64')}

In [127]:
group.mean()

color
green    1.785
red      2.380
white    5.550
Name: price1, dtype: float64

In [128]:
group.sum()

color
green    3.57
red      4.76
white    5.55
Name: price1, dtype: float64

# Hierarchical Grouping
- we did grouping data according to values of a col as a key choice, same can be extended to mult cols i.e. making grouping keys hierarchical

In [129]:
ggroup = frame['price1'].groupby([frame['color'], frame['object']])
ggroup

<pandas.core.groupby.groupby.SeriesGroupBy object at 0x11332c898>

In [130]:
# so far we've done grouping using a single col but can be done across cols or to an 
#entire df, don't need to reuse groupby but can combine into a single passing all of
#the grouping and calculation to be done without defining any intermediate variable
frame[['price1','price2']].groupby(frame['color']).mean()

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,1.785,2.425
red,2.38,2.435
white,5.55,4.75


In [131]:
#numeric_only=True skips the string 'object' col, which cannot be averaged
#(pandas >= 2.0 raises on it instead of silently dropping it)
frame.groupby(frame['color']).mean(numeric_only=True)

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,1.785,2.425
red,2.38,2.435
white,5.55,4.75


## Group Iteration

In [132]:
for name, group in frame.groupby('color'):
    print(name)
    print(group)

green
   color  object  price1  price2
2  green  pencil    1.30    1.60
4  green     pen    2.27    3.25
red
  color   object  price1  price2
1   red   pencil    4.20    4.12
3   red  ashtray    0.56    0.75
white
   color object  price1  price2
0  white    pen    5.55    4.75


## Chain of Transformations

In [133]:
# we have seen for each grouping we return a df or a series where it retains cols
#as indexes
result1 = frame['price1'].groupby(frame['color']).mean()
type(result1)

pandas.core.series.Series

In [134]:
#numeric_only=True skips the string 'object' col, which cannot be averaged
#(pandas >= 2.0 raises on it instead of silently dropping it)
result2 = frame.groupby(frame['color']).mean(numeric_only=True)
type(result2)

pandas.core.frame.DataFrame

In [135]:
frame['price1'].groupby(frame['color']).mean()

color
green    1.785
red      2.380
white    5.550
Name: price1, dtype: float64

In [136]:
#numeric_only=True skips the string 'object' col, which cannot be averaged
#(pandas >= 2.0 raises on it instead of silently dropping it)
(frame.groupby(frame['color']).mean(numeric_only=True))['price1']

color
green    1.785
red      2.380
white    5.550
Name: price1, dtype: float64

In [137]:
#can even add prefixes that describes the type of business combo to col name,
#which is very useful in keeping track of source data especially if you apply a process
#of transformation chain (a series or df is generated from each other)
#numeric_only=True skips the string 'object' col, which cannot be averaged
#(pandas >= 2.0 raises on it instead of silently dropping it)
means = frame.groupby('color').mean(numeric_only=True).add_prefix('mean_')
means

Unnamed: 0_level_0,mean_price1,mean_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,1.785,2.425
red,2.38,2.435
white,5.55,4.75


## Functions on Groups

In [138]:
group = frame.groupby('color')

In [139]:
group['price1'].quantile(0.6)

color
green    1.882
red      2.744
white    5.550
Name: price1, dtype: float64

In [140]:
#can also define own agg functions, define separately and pass to agg()
def vange(series):
    """Return the spread of ``series``: its largest value minus its smallest."""
    highest = series.max()
    lowest = series.min()
    return highest - lowest

In [141]:
group['price1'].agg(vange)

color
green    0.97
red      3.64
white    0.00
Name: price1, dtype: float64

In [142]:
#allows to use aggregation on several cols of the df at once
#restricted to the numeric cols: vange subtracts, which fails on the string 'object' col
#(pandas >= 2.0 raises on it instead of silently dropping it)
group[['price1','price2']].agg(vange)

Unnamed: 0_level_0,price1,price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,0.97,1.65
red,3.64,3.37
white,0.0,0.0


In [143]:
#can also use more than 1 at a time
group['price1'].agg(['mean','std',vange])

Unnamed: 0_level_0,mean,std,vange
color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
green,1.785,0.685894,0.97
red,2.38,2.573869,3.64
white,5.55,,0.0


## Advanced Data Aggregation

In [144]:
frame = pd.DataFrame({'color':['white','red','green','red','green'],
                     'price1':[5.56,4.20,1.30,.56,2.75],
                     'price2':[4.57,4.12,1.60,0.75,3.15]})
frame

Unnamed: 0,color,price1,price2
0,white,5.56,4.57
1,red,4.2,4.12
2,green,1.3,1.6
3,red,0.56,0.75
4,green,2.75,3.15


In [145]:
sums = frame.groupby('color').sum().add_prefix('tot_')
sums

Unnamed: 0_level_0,tot_price1,tot_price2
color,Unnamed: 1_level_1,Unnamed: 2_level_1
green,4.05,4.75
red,4.76,4.87
white,5.56,4.57


In [147]:
pd.merge(frame,sums, left_on='color',right_index=True)

Unnamed: 0,color,price1,price2,tot_price1,tot_price2
0,white,5.56,4.57,5.56,4.57
1,red,4.2,4.12,4.76,4.87
3,red,0.56,0.75,4.76,4.87
2,green,1.3,1.6,4.05,4.75
4,green,2.75,3.15,4.05,4.75


In [148]:
frame.groupby('color').transform(np.sum).add_prefix('tol_') #another wayto repeat the previous operation

Unnamed: 0,tol_price1,tol_price2
0,5.56,4.57
1,4.76,4.87
2,4.05,4.75
3,4.76,4.87
4,4.05,4.75


- the transform method is a more specialized function that has very specific requirements: the function passed as an argument must produce a single scalar value (aggregation) to be broadcasted
- the more general GroupBy method is apply(), which implements the split-apply-combine scheme in its entirety
    - the function divides the object into parts in order to be manipulated, invokes the passed function on each piece and then tries to chain together the various parts

In [149]:
frame = pd.DataFrame({'color':['white','black','white','white','black','black'],
                     'status':['up','up','down','down','down','up'],
                     'value1':[12.33,14.55,22.34,27.84,23.40,18.33],
                     'value2':[11.23,31.80,29.99,31.18,18.35,22.44]})
frame

Unnamed: 0,color,status,value1,value2
0,white,up,12.33,11.23
1,black,up,14.55,31.8
2,white,down,22.34,29.99
3,white,down,27.84,31.18
4,black,down,23.4,18.35
5,black,up,18.33,22.44


In [151]:
frame.groupby(['color','status']).apply(lambda x: x.max())

Unnamed: 0_level_0,Unnamed: 1_level_0,color,status,value1,value2
color,status,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
black,down,black,down,23.4,18.35
black,up,black,up,18.33,31.8
white,down,white,down,27.84,31.18
white,up,white,up,12.33,11.23


In [152]:
frame.rename(index=reindex,columns=recolumn)

Unnamed: 0,color,status,value1,value2
first,white,up,12.33,11.23
second,black,up,14.55,31.8
third,white,down,22.34,29.99
fourth,white,down,27.84,31.18
fifth,black,down,23.4,18.35
5,black,up,18.33,22.44


In [154]:
temp = pd.date_range('1/1/2015',periods=10,freq='H')
temp

DatetimeIndex(['2015-01-01 00:00:00', '2015-01-01 01:00:00',
               '2015-01-01 02:00:00', '2015-01-01 03:00:00',
               '2015-01-01 04:00:00', '2015-01-01 05:00:00',
               '2015-01-01 06:00:00', '2015-01-01 07:00:00',
               '2015-01-01 08:00:00', '2015-01-01 09:00:00'],
              dtype='datetime64[ns]', freq='H')

In [155]:
timeseries = pd.Series(np.random.rand(10), index = temp)
timeseries

2015-01-01 00:00:00    0.324328
2015-01-01 01:00:00    0.881532
2015-01-01 02:00:00    0.569794
2015-01-01 03:00:00    0.248059
2015-01-01 04:00:00    0.805113
2015-01-01 05:00:00    0.253285
2015-01-01 06:00:00    0.078590
2015-01-01 07:00:00    0.272294
2015-01-01 08:00:00    0.200224
2015-01-01 09:00:00    0.772628
Freq: H, dtype: float64

In [156]:
timetable = pd.DataFrame({'date':temp, 'value1':np.random.rand(10),
                         'value2':np.random.rand(10)})
timetable

Unnamed: 0,date,value1,value2
0,2015-01-01 00:00:00,0.671301,0.048494
1,2015-01-01 01:00:00,0.819164,0.772978
2,2015-01-01 02:00:00,0.135264,0.116857
3,2015-01-01 03:00:00,0.109106,0.009462
4,2015-01-01 04:00:00,0.218845,0.257272
5,2015-01-01 05:00:00,0.552789,0.10799
6,2015-01-01 06:00:00,0.823517,0.056335
7,2015-01-01 07:00:00,0.548921,0.505311
8,2015-01-01 08:00:00,0.553706,0.298707
9,2015-01-01 09:00:00,0.923745,0.934513


In [159]:
#then we add to the df preceding a col that represents a set of text values that you use as key values
timetable['cat'] = ['up','down','left','left','up','up','down','right','right','up']
timetable
#this example has duplicate key values

Unnamed: 0,date,value1,value2,cat
0,2015-01-01 00:00:00,0.671301,0.048494,up
1,2015-01-01 01:00:00,0.819164,0.772978,down
2,2015-01-01 02:00:00,0.135264,0.116857,left
3,2015-01-01 03:00:00,0.109106,0.009462,left
4,2015-01-01 04:00:00,0.218845,0.257272,up
5,2015-01-01 05:00:00,0.552789,0.10799,up
6,2015-01-01 06:00:00,0.823517,0.056335,down
7,2015-01-01 07:00:00,0.548921,0.505311,right
8,2015-01-01 08:00:00,0.553706,0.298707,right
9,2015-01-01 09:00:00,0.923745,0.934513,up
