## Series

In [2]:
import numpy as np
import pandas as pd

In [3]:
# sample inputs reused by the Series examples below
labels = ["a", "b", "c"]            # text labels to use as an index
my_list = [10, 20, 30]              # plain Python list
arr = np.array(my_list)             # the same values as a NumPy array
d = {"a": 10, "b": 20, "c": 100}    # dictionary of label -> value pairs

In [5]:
# build a Series from a list, replacing the default numerical index with text labels
pd.Series(data=my_list, index=labels)

a    10
b    20
c    30
dtype: int64

In [6]:
# a NumPy array works the same way; here the index argument is named explicitly
pd.Series(data=arr, index=labels)

a    10
b    20
c    30
dtype: int64

In [7]:
# a dictionary also works: its keys become the index and its values the data
pd.Series(data=d)

a     10
b     20
c    100
dtype: int64

In [8]:
# a Series can also hold non-numerical data, such as this list of strings
pd.Series(data=labels)

0    a
1    b
2    c
dtype: object

In [9]:
# a Series accepts essentially any Python object - here, built-in functions - without complaint
pd.Series([sum,len,print])

0      <built-in function sum>
1      <built-in function len>
2    <built-in function print>
dtype: object

In [10]:
# Series with country names as the index labels
ser1 = pd.Series(data=[1, 2, 3, 4], index=["USA", "CHINA", "FRANCE", "GERMANY"])

In [11]:
# display ser1
ser1

USA        1
CHINA      2
FRANCE     3
GERMANY    4
dtype: int64

In [12]:
# second Series: shares the USA and CHINA labels with ser1, the other two labels differ
ser2 = pd.Series(data=[1, 2, 3, 4], index=["USA", "CHINA", "ITALY", "NORWAY"])
ser2

USA       1
CHINA     2
ITALY     3
NORWAY    4
dtype: int64

In [13]:
# look up a value by its index label, much like a dictionary key
ser1['USA']

1

In [14]:
# addition aligns on index labels: values are summed where a label is in both Series
# and NaN is produced where it is in only one (the result is float64 so it can hold NaN)
ser1 + ser2

CHINA      4.0
FRANCE     NaN
GERMANY    NaN
ITALY      NaN
NORWAY     NaN
USA        2.0
dtype: float64

## DataFrames

In [16]:
from numpy.random import randn

In [17]:
# seed NumPy's global random generator so the random values below are reproducible
np.random.seed(101)

In [18]:
# basic DataFrame built from three pieces:
#   data    - a random 5 x 4 matrix
#   index   - the row labels
#   columns - the column labels
df = pd.DataFrame(data=randn(5, 4), index=list("ABCDE"), columns=list("WXYZ"))

In [19]:
# display the freshly built DataFrame
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [20]:
# bracket notation with a column name returns that column as a Series
df['W']

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [22]:
# type() confirms that a single column is a Series
type(df['W'])

pandas.core.series.Series

In [23]:
# type() confirms that df itself is a DataFrame
type(df)

pandas.core.frame.DataFrame

In [27]:
# pass a list of column names (note the double brackets) to select several columns at once
df[['W','Z']]

Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [29]:
# assigning to a column name that does not exist yet creates a new column;
# here it holds the element-wise sum of columns W and X
df['new'] = df['W'] + df['X']

In [30]:
# display df, which now includes the 'new' column
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [31]:
# remove a column: axis=1 means the label refers to a column
df.drop(labels='Y', axis=1)

Unnamed: 0,W,X,Z,new
A,2.70685,0.628133,0.503826,3.334983
B,0.651118,-0.319318,0.605965,0.3318
C,-2.018168,0.740122,-0.589001,-1.278046
D,0.188695,-0.758872,0.955057,-0.570177
E,0.190794,1.978757,0.683509,2.169552


In [32]:
# remove a row: axis=0 means the label refers to the index
df.drop(labels='C', axis=0)

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [33]:
# neither drop above modified the base DataFrame: column Y and row C are still present
df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.334983
B,0.651118,-0.319318,-0.848077,0.605965,0.3318
C,-2.018168,0.740122,0.528813,-0.589001,-1.278046
D,0.188695,-0.758872,-0.933237,0.955057,-0.570177
E,0.190794,1.978757,2.605967,0.683509,2.169552


In [34]:
# drop() leaves the base DataFrame untouched unless inplace=True is passed;
# this is where the change is "committed"
df.drop(labels='new', axis=1, inplace=True)

In [35]:
# display df again: the 'new' column is now gone
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [38]:
# ROW SELECTION
# loc selects a row by its index label
df.loc['C']

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [37]:
# just like a column, a single row comes back as a Series
type(df.loc['C'])

pandas.core.series.Series

In [39]:
# iloc selects by integer position instead of by label; position 2 is row 'C'
df.iloc[2]

W   -2.018168
X    0.740122
Y    0.528813
Z   -0.589001
Name: C, dtype: float64

In [41]:
# select a single cell by passing a row label and a column label to loc
df.loc['B','X']

-0.31931804459303326

In [43]:
# slice by position to get a subset of the DataFrame as a new DataFrame;
# the end of each slice is excluded, so this is row positions 1-2 and column positions 1-3
df.iloc[1:3,1:4]

Unnamed: 0,X,Y,Z
B,-0.319318,-0.848077,0.605965
C,0.740122,0.528813,-0.589001


In [48]:
# the same subset selected with lists of row labels and column labels
df.loc[['B','C'],['X','Y','Z']]

Unnamed: 0,X,Y,Z
B,-0.319318,-0.848077,0.605965
C,0.740122,0.528813,-0.589001


In [49]:
# display df as a reference for the conditional selections that follow
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [50]:
# comparing a DataFrame to a value gives a DataFrame of booleans, one per cell
df > 0

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [51]:
# the boolean DataFrame can be stored in a variable
booldf = df > 0

# and then used as a selection criterion: cells where it is False show up as missing values
df[booldf]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [52]:
# the same selection written in a single line
df[df > 0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [53]:
# comparing a single column gives a boolean Series with one value per row
df['W'] > 0

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [55]:
# a boolean Series filters rows: only the rows where W > 0 are kept
# SQL equivalent: select * from df where W > 0
df[df['W']>0]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [56]:
# only the rows where Z < 0 are kept
# SQL equivalent: select * from df where Z < 0
df[df['Z']<0]

Unnamed: 0,W,X,Y,Z
C,-2.018168,0.740122,0.528813,-0.589001


In [58]:
# store the filtered rows (Z > 0) as a "temp" table, then select a column from it
resultdf = df[df['Z']>0]
resultdf['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [60]:
# the same result in one step, by stacking the column selection on the row filter
df[df['Z']>0]['X']

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

In [62]:
# likewise, you can apply the selection criterion and then stack a multi-column select on it
df[df['Z']>0][['X','Y','Z']]

Unnamed: 0,X,Y,Z
A,0.628133,0.907969,0.503826
B,-0.319318,-0.848077,0.605965
D,-0.758872,-0.933237,0.955057
E,1.978757,2.605967,0.683509


In [63]:
# display df as a reference for the multi-condition filters below
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [65]:
# several conditions can be combined with & (and); each one needs its own parentheses
# SQL equivalent: select W, X from df where Y > 0 and Z > 0
df.loc[(df['Z'] > 0) & (df['Y'] > 0), ['W', 'X']]

Unnamed: 0,W,X
A,2.70685,0.628133
E,0.190794,1.978757


In [66]:
# conditions can also be combined with | (or)
# SQL equivalent: select W, X from df where Y > 0 or Z > 0
df.loc[(df['Z'] > 0) | (df['Y'] > 0), ['W', 'X']]

Unnamed: 0,W,X
A,2.70685,0.628133
B,0.651118,-0.319318
C,-2.018168,0.740122
D,0.188695,-0.758872
E,0.190794,1.978757


In [67]:
# to reset the index, use the reset_index() method
df.reset_index()
# this moves the original index into a column and puts the default numerical index in its place
# to do this permanently, you'd need to add inplace=True

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [69]:
# build a list of values for a new column; split() avoids typing all the quotes and commas
newind = 'CA CO VA NY WV'.split()
newind

['CA', 'CO', 'VA', 'NY', 'WV']

In [70]:
# add the list as a new column named 'States' and display the result
df['States'] = newind
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,CO
C,-2.018168,0.740122,0.528813,-0.589001,VA
D,0.188695,-0.758872,-0.933237,0.955057,NY
E,0.190794,1.978757,2.605967,0.683509,WV


In [72]:
# remove the States column again, modifying df in place
df.drop(labels='States', axis=1, inplace=True)

In [73]:
# display df: the States column is gone
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [74]:
# the column can also be added in one line, without the intermediate variable
df['States'] = 'CA CO VA NY WV'.split()
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,CO
C,-2.018168,0.740122,0.528813,-0.589001,VA
D,0.188695,-0.758872,-0.933237,0.955057,NY
E,0.190794,1.978757,2.605967,0.683509,WV


In [75]:
# use the added States column as the index
df.set_index(keys='States')

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
CO,0.651118,-0.319318,-0.848077,0.605965
VA,-2.018168,0.740122,0.528813,-0.589001
NY,0.188695,-0.758872,-0.933237,0.955057
WV,0.190794,1.978757,2.605967,0.683509


In [76]:
# df itself is unchanged: set_index() still needs inplace=True to make the change permanent
df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,CO
C,-2.018168,0.740122,0.528813,-0.589001,VA
D,0.188695,-0.758872,-0.933237,0.955057,NY
E,0.190794,1.978757,2.605967,0.683509,WV
