## Series

In [61]:
import numpy as np

In [62]:
import pandas as pd

In [63]:
# Four views of the same small data set, used below to build Series.
labels = list('abc')             # index labels
my_data = [10, 20, 30]           # values as a plain Python list
arr = np.array(my_data)          # the same values as a NumPy array
d = dict(zip(labels, my_data))   # label -> value mapping

In [64]:
# With no index supplied, the data (10, 20, 30) gets a default
# integer index (0, 1, 2).
pd.Series(data=my_data)

0    10
1    20
2    30
dtype: int64

In [65]:
# Arguments are (data, index): the data gets the index we specify
# (labels = a, b, c) instead of the default one.
pd.Series(data=my_data, index=labels)

a    10
b    20
c    30
dtype: int64

In [66]:
# Arguments are (data, index): a NumPy array can be used as the data
# just like a list.
pd.Series(data=arr, index=labels)

a    10
b    20
c    30
dtype: int32

In [67]:
# A DICTIONARY can be passed as the data; its keys become the index.
pd.Series(data=d)

a    10
b    20
c    30
dtype: int64

##### A pandas Series can hold a variety of data types, not just numbers (NumPy arrays can too; what a Series adds is a labelled index)

In [68]:
# Arguments are (data, index): the country names are the index labels
# and the numbers are the data.
ser1 = pd.Series(data=[1, 2, 3, 4], index=['USA', 'Germany', 'USSR', 'Japan'])
ser1

USA        1
Germany    2
USSR       3
Japan      4
dtype: int64

In [69]:
# Second Series for the alignment demo: same shape as ser1, but labelled
# 'Italy' where ser1 has 'USSR'.
ser2 = pd.Series(data=[1, 2, 5, 4], index=['USA', 'Germany', 'Italy', 'Japan'])
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

In [70]:
# Get data out like you would from a dictionary: look it up by index label.
ser1['USA']

1

In [71]:
# A Series of strings; no index is given, so it gets the default one.
ser3 = pd.Series(data=labels)
ser3

0    a
1    b
2    c
dtype: object

In [72]:
# ser3 has the default integer index, so 0 is the label of its first entry.
ser3[0]

'a'

In [73]:
ser1 + ser2
# Addition matches values up by index label. Where a label has no match in
# the other Series (USSR, Italy), you get a null (NaN); the result shown
# below is float64 rather than int64.

Germany    4.0
Italy      NaN
Japan      8.0
USA        2.0
USSR       NaN
dtype: float64

In [74]:
# Display ser2 again: the addition above did not change it.
ser2

USA        1
Germany    2
Italy      5
Japan      4
dtype: int64

## DataFrames

In [75]:
from numpy.random import randn


In [76]:
np.random.seed(101)  # fixes the seed so the "random" numbers are consistent from run to run
# Only here so the walkthrough's numbers are repeatable; you wouldn't use this normally.

In [77]:
# Arguments are (data, index/rows, columns): like a Series, but with an
# additional columns argument. A DataFrame is basically several Series
# (the columns) that share the same index (the rows).
df = pd.DataFrame(data=randn(5, 4), index=list('ABCDE'), columns=list('WXYZ'))
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [78]:
# How to get data out? Bracket notation with a column name
# gets the W column.
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [79]:
type(df['W'])  # a single DataFrame column is a Series

pandas.core.series.Series

In [80]:
type(df)  # the whole table, by contrast, is a DataFrame

pandas.core.frame.DataFrame

In [81]:
df[['W', 'X']]  # to GET multiple COLUMNS, pass a LIST of names (hence the extra brackets)

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


In [82]:
# To CREATE A NEW COLUMN, assign to a column name that does not exist yet.
# Here 'new' is the element-wise sum of columns W and Y.
df['new'] = df['W'] + df['Y']
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [55]:
# To REMOVE a COLUMN: drop() returns a new DataFrame without it.
# The axis must be specified as 1 because the default, axis=0, looks the
# label up in the index (rows). It is passed by keyword: pandas 2.0+ no
# longer accepts axis as a second positional argument.
# (df.drop(columns='new') is an equivalent spelling.)
df.drop('new', axis=1)

Unnamed: 0,W,X,Y,Z
A,-0.993263,0.1968,-1.136645,0.000366
B,1.025984,-0.156598,-0.031579,0.649826
C,2.154846,-0.610259,-0.755325,-0.346419
D,0.147027,-0.479448,0.558769,1.02481
E,-0.925874,1.862864,-1.133817,0.610478


In [56]:
# NOTE: the drop doesn't actually MODIFY the dataframe;
# df still has the 'new' column.
df

Unnamed: 0,W,X,Y,Z,new
A,-0.993263,0.1968,-1.136645,0.000366,-2.129908
B,1.025984,-0.156598,-0.031579,0.649826,0.994405
C,2.154846,-0.610259,-0.755325,-0.346419,1.399521
D,0.147027,-0.479448,0.558769,1.02481,0.705796
E,-0.925874,1.862864,-1.133817,0.610478,-2.059691


In [83]:
# IF you want to actually REMOVE the column from the DATAFRAME itself,
# you also need to specify inplace=True.
# NOTE(review): this mutates df, so the cell is presumably not re-runnable —
# a second run would look for a 'new' column that is already gone. Confirm.
df.drop('new',axis=1,inplace=True)

In [84]:
# The 'new' column has now been removed from df itself.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [85]:
df.drop('E')  # can also drop an entire row by its index label (axis defaults to 0, the index)

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [87]:
# Same drop, but in place: row E is removed from df itself, so every
# cell after this one sees only rows A-D.
df.drop('E',inplace=True)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [91]:
df.loc['C']  # to GET a ROW by its index label
# (recall, to get COLUMNS, use df[['W', 'X']])

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [90]:
df.iloc[2]  # can also select a row by integer position using iloc (position 2 is row C here)

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [93]:
df.loc['B','Y']  # gets the value at ROW B, COLUMN Y (row first, then column, just like numpy)

-0.84807698340363147

In [94]:
# .loc[ROWS, COLUMNS]: pass a list for each to select a subset of both.
df.loc[['A','B'],['W','Y']]

Unnamed: 0,W,Y
A,2.70685,0.907969
B,0.651118,-0.848077


#### Conditional Selection

In [115]:
# Current contents of df, for reference.
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [116]:
df > 0  # element-wise comparison: gives a DataFrame of True/False values

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True


In [97]:
df[df>0]  # keeps the values where TRUE and gives NaN (null) where FALSE

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057


In [99]:
df['W']>0  # comparison on one column: a boolean Series with one True/False per row

A     True
B     True
C    False
D     True
Name: W, dtype: bool

In [100]:
df[df['W']>0]
# NOTE: this does not give NaNs/nulls, because indexing with a boolean
# Series excludes whole rows that do not meet the criterion.

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057


In [104]:
df[df['W']>0][['X','Y']]
# Since the result of df[df['W']>0] is itself a dataframe,
# you can then select columns or other data from it further.

Unnamed: 0,X,Y
A,0.628133,0.907969
B,-0.319318,-0.848077
D,-0.758872,-0.933237


In [113]:
# MULTIPLE CONDITIONS (df shown again for reference)
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [111]:
df[(df['W']>1) & (df['Y']>0)]
# Must use "&" instead of "and" because we are combining 2 boolean series
# element-wise; note each comparison is wrapped in its own parentheses.

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826


In [112]:
df[(df['W']>1) | (df['Y']>0)]
# Pipe "|" is the element-wise "or": keeps rows where either condition holds.

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
C,-2.018168,0.740122,0.528813,-0.589001


In [118]:
df.reset_index()
# RESET the INDEX to numerical; the old index labels are kept as a column
# named "index". (Will not occur in place unless specified.)

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057


In [123]:
# Replacement labels: one per row currently in df.
newind = ['CA', 'NY', 'WY', 'OR']
newind

['CA', 'NY', 'WY', 'OR']

In [127]:
df['States'] = newind  # first, add newind to df as a new column
df.set_index('States')  # then SET the States column as the INDEX (result is displayed, not assigned back to df)

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057


#### Multi Index and Index Hierarchy

In [135]:
# Constructing a multi-level (hierarchical) index for a DataFrame.
outside = 'G1 G1 G1 G2 G2 G2'.split()   # outer level labels
inside = '1 2 3 1 2 3'.split()          # inner level labels (strings, not ints)
# zip pairs the 2 lists element-wise into a list of (outer, inner) tuples.
index_pairs = list(zip(outside, inside))
# from_tuples turns the tuple pairs into a MultiIndex. The intermediate list
# has its own name so hier_index is only ever bound to a MultiIndex.
hier_index = pd.MultiIndex.from_tuples(index_pairs)
print(outside)
print(inside)
hier_index

['G1', 'G1', 'G1', 'G2', 'G2', 'G2']
['1', '2', '3', '1', '2', '3']


MultiIndex(levels=[['G1', 'G2'], ['1', '2', '3']],
           labels=[[0, 0, 0, 1, 1, 1], [0, 1, 2, 0, 1, 2]])

In [139]:
# A multi-index dataframe; arguments are
# (data, index (row labels), column labels).
df = pd.DataFrame(data=randn(6, 2), index=hier_index, columns=['A', 'B'])
df

Unnamed: 0,Unnamed: 1,A,B
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


In [141]:
# Chain .loc calls to walk down the levels: outer label 'G1', then inner label '1'.
df.loc['G1'].loc['1']

A    1.025984
B   -0.156598
Name: 1, dtype: float64

In [143]:
df.index.names = ['Groups','Num']  # giving the two index levels names
df

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
Groups,Num,Unnamed: 2_level_1,Unnamed: 3_level_1
G1,1,1.025984,-0.156598
G1,2,-0.031579,0.649826
G1,3,2.154846,-0.610259
G2,1,-0.755325,-0.346419
G2,2,0.147027,-0.479448
G2,3,0.558769,1.02481


In [144]:
df.loc['G2'].loc['2','B']  # value at outer index G2, inner index 2, column B

-0.47944803904109595

In [145]:
df.loc['G1'].loc['3','A']  # value at outer index G1, inner index 3, column A

2.1548464432594718

In [149]:
df.xs('1', level = 'Num')
# Get a CROSS-SECTION: every row whose 'Num' level is '1',
# i.e. index 1 from both G1 and G2.


Unnamed: 0_level_0,A,B
Groups,Unnamed: 1_level_1,Unnamed: 2_level_1
G1,1.025984,-0.156598
G2,-0.755325,-0.346419
