## Series Data Structure

In [15]:
# pd.Series(data, index)
# index -> labels must be hashable and the same length as data; they do not
#          have to be unique. When omitted, pandas uses a RangeIndex 0..n-1.

import pandas as pd

# Request the dtype explicitly: an empty Series created without one emits a
# deprecation warning on pandas 1.x and becomes object dtype on pandas 2.x,
# so float64 is only guaranteed when it is asked for.
s = pd.Series(dtype='float64')

print(s)

Series([], dtype: float64)


### Creating Series from ndarray

In [3]:
import numpy as np

# A NumPy array of single-character strings supplies the values; no index is
# given, so the labels default to the positions 0..3.
data = np.array(list('abcd'))

s = pd.Series(data=data)

print(s)

0    a
1    b
2    c
3    d
dtype: object


### Create Series from dict

In [16]:
import pandas as pd

# Dict keys become the index labels and dict values become the data.
data = dict(a=0., b=1., c=2.)

s = pd.Series(data)

print(s)

a    0.0
b    1.0
c    2.0
dtype: float64


### Data accessing using Index

In [2]:
# Five integers labelled 'a'..'e' rather than the default 0..4 positions.
s = pd.Series(data=list(range(1, 6)), index=list('abcde'))

print(s)

a    1
b    2
c    3
d    4
e    5
dtype: int64


In [3]:
# Look up a single value by its index label.
print(s['a'])

1


In [4]:
# Retrieve multiple elements: passing a list of labels returns a Series
# holding just those entries, in the order requested.

print(s[['a', 'b', 'e']])

a    1
b    2
e    5
dtype: int64


In [18]:
# 'f' is not one of the index labels, so the lookup raises KeyError.
# Catch it and print the message: the error is still demonstrated, but it no
# longer halts a Restart & Run All of the notebook.
try:
    print(s['f'])
except KeyError as err:
    print('KeyError:', err)

KeyError: 'f'

### Reindexing

In [20]:
N = 20

# Fix the global NumPy seed so the random columns (y, C, D) are identical on
# every Restart & Run All; without it each run prints a different frame.
np.random.seed(42)

# A: N consecutive calendar days, x: 0..N-1 as floats, y: uniform draws in
# [0, 1), C: a random label per row, D: normal draws with mean 100 and std 10.
df = pd.DataFrame({
    'A': pd.date_range(start='2018-08-13', periods=N),
    'x': np.linspace(0, stop=N-1, num=N),
    'y': np.random.rand(N),
    'C': np.random.choice(['Low', 'Medium', 'High'], N).tolist(),
    'D': np.random.normal(100, 10, size=N),
})

print(df)

            A     x         y       C           D
0  2018-08-13   0.0  0.297448  Medium   81.740434
1  2018-08-14   1.0  0.759365     Low  102.523057
2  2018-08-15   2.0  0.329398  Medium  105.657720
3  2018-08-16   3.0  0.312992     Low  115.064860
4  2018-08-17   4.0  0.981180     Low   99.574278
5  2018-08-18   5.0  0.726843  Medium  107.051931
6  2018-08-19   6.0  0.853815  Medium  100.786615
7  2018-08-20   7.0  0.345892  Medium   98.275424
8  2018-08-21   8.0  0.305442     Low  111.489658
9  2018-08-22   9.0  0.547652     Low  102.570092
10 2018-08-23  10.0  0.339556  Medium   98.651667
11 2018-08-24  11.0  0.725534  Medium  102.805008
12 2018-08-25  12.0  0.762901    High  103.138324
13 2018-08-26  13.0  0.363608  Medium   92.055454
14 2018-08-27  14.0  0.226582  Medium  117.050160
15 2018-08-28  15.0  0.483699    High   93.218420
16 2018-08-29  16.0  0.720074  Medium  105.006429
17 2018-08-30  17.0  0.245495    High   88.087332
18 2018-08-31  18.0  0.626002     Low   85.527901


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) for df.
df.describe()

Unnamed: 0,x,y,D
count,20.0,20.0,20.0
mean,9.5,0.552822,99.320158
std,5.91608,0.302162,10.452179
min,0.0,0.035458,81.389851
25%,4.75,0.343012,93.162831
50%,9.5,0.589448,99.085391
75%,14.25,0.809032,108.101733
max,19.0,0.920862,114.271997


In [18]:
# Keep only rows 0, 2 and 5 and columns A, C, B. Column 'B' does not exist in
# df, so reindex creates it filled with NaN.
df_reindexed = df.reindex(
    index=[0, 2, 5],
    columns=list('ACB'),
)

print(df_reindexed)

           A       C   B
0 2018-08-13  Medium NaN
2 2018-08-15  Medium NaN
5 2018-08-18    High NaN


In [13]:
# Reindex to align with other Objects

# Two frames with the same column labels but different lengths
# (10 rows vs 7 rows), each filled with standard-normal draws.
df_1 = pd.DataFrame(np.random.randn(10, 3), columns='col1 col2 col3'.split())
df_2 = pd.DataFrame(np.random.randn(7, 3), columns='col1 col2 col3'.split())

# A single print with a newline separator emits the same three chunks:
# df_1, a 50-star divider, then df_2.
print(df_1, '*' * 50, df_2, sep='\n')

       col1      col2      col3
0  0.510873 -0.821334  0.168419
1 -1.144135 -0.843752  0.663920
2  0.507895  2.292869 -2.410000
3  0.281911 -1.720111 -1.287705
4  0.889520  0.377303  0.735085
5  0.520214  1.185056 -1.143105
6 -1.977482 -1.402801 -0.324095
7 -0.932744  2.276988  1.564680
8 -1.103110  1.338715  0.347766
9 -1.532697  0.914440  0.509173
**************************************************
       col1      col2      col3
0  0.563894 -0.649711 -2.090522
1  0.742647 -0.278773 -0.712817
2 -1.375288  0.102734  0.262869
3  0.875239 -1.780812 -0.076995
4 -0.715191  0.246055  0.178436
5 -0.437486  0.745659 -0.307926
6 -0.596179 -1.897721 -1.786715


In [14]:
# Summary statistics for the 10-row frame, taken before it is reindexed.
df_1.describe()

Unnamed: 0,col1,col2,col3
count,10.0,10.0,10.0
mean,-0.397975,0.359737,-0.117586
std,1.039869,1.477085,1.182167
min,-1.977482,-1.720111,-2.41
25%,-1.133879,-0.838147,-0.938353
50%,-0.325417,0.645872,0.258093
75%,0.510129,1.3003,0.625234
max,0.88952,2.292869,1.56468


In [15]:
# Conform df_1 to df_2's row and column labels. df_2 only has rows 0-6, so
# rows 7-9 of df_1 are dropped.
# NOTE: this rebinds df_1, so the original 10-row frame is gone afterwards.
df_1 = df_1.reindex_like(df_2)

print(df_1)

# Column names must match between the two frames; a label present in df_2 but
# missing from df_1 would come back as a NaN-filled column.

       col1      col2      col3
0  0.510873 -0.821334  0.168419
1 -1.144135 -0.843752  0.663920
2  0.507895  2.292869 -2.410000
3  0.281911 -1.720111 -1.287705
4  0.889520  0.377303  0.735085
5  0.520214  1.185056 -1.143105
6 -1.977482 -1.402801 -0.324095


## Numerical Indexing

### .loc[] & .iloc[]

In [7]:
import pandas as pd

# Integer labels deliberately out of step with row positions
# (label 3 sits at position 0, label 0 at position 7).
df = pd.DataFrame(
    {'col_1': list(range(10, 90, 10))},
    index=[3, 4, 5, 6, 7, 8, 9, 0],
)

df

Unnamed: 0,col_1
3,10
4,20
5,30
6,40
7,50
8,60
9,70
0,80


In [9]:
# .loc is label-based: returns the row whose index label is 3.
df.loc[3]

col_1    10
Name: 3, dtype: int64

In [10]:
# .iloc is position-based: returns the fourth row (position 3), whose label is 6.
df.iloc[3]

col_1    40
Name: 6, dtype: int64

In [12]:
# Label slice: every row from the start through label 4, end label included.
df.loc[:4]

Unnamed: 0,col_1
3,10
4,20


In [13]:
# Positional slice: the first four rows (positions 0-3), end position excluded.
df.iloc[:4]

Unnamed: 0,col_1
3,10
4,20
5,30
6,40


## Missing Data

### When and why is data missing?

### How to check missing values?

In [27]:
import pandas as pd
import numpy as np

# Start with five labelled rows of standard-normal values.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
)

print(df)

print('*' * 50)

# Reindexing onto 'a'..'h' introduces labels b, d and g; they have no data,
# so they appear as all-NaN rows.
df = df.reindex(list('abcdefgh'))

print(df)

        one       two     three
a  0.615638  1.429376  0.752544
c  0.194564  0.983944 -0.792175
e -0.425801 -1.780161  0.505067
f  0.404718  2.004841  1.407920
h  0.062916 -0.773470 -0.100087
**************************************************
        one       two     three
a  0.615638  1.429376  0.752544
b       NaN       NaN       NaN
c  0.194564  0.983944 -0.792175
d       NaN       NaN       NaN
e -0.425801 -1.780161  0.505067
f  0.404718  2.004841  1.407920
g       NaN       NaN       NaN
h  0.062916 -0.773470 -0.100087


In [28]:
# Check for missing values: isnull() returns a boolean Series that is True
# wherever the entry is NaN (rows b, d and g here).

print(df['one'].isnull())

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [29]:
# notnull() is the inverse mask: True wherever a value is present.
print(df['one'].notnull())

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


### Calculations with missing values

In [30]:
# When summing the data, NaN entries are skipped, so they contribute nothing
# to the total (the same as treating them as zero).

print(df['one'].sum())

0.8520348261281738


In [31]:
# If every value is NA, sum() has nothing to add up and returns 0
# (pandas releases before 0.22 returned NaN instead).

df_na = pd.DataFrame(index=list(range(6)), columns=['one', 'two'])

# Frame, divider and column total, each on its own line of output.
print(df_na, '*' * 50, df_na['one'].sum(), sep='\n')

   one  two
0  NaN  NaN
1  NaN  NaN
2  NaN  NaN
3  NaN  NaN
4  NaN  NaN
5  NaN  NaN
**************************************************
0


### Cleaning missing data

In [19]:
# Replacing nan with a Scalar value

import pandas as pd
import numpy as np

df = pd.DataFrame(
    np.random.randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
)

print(df)

print('*' * 50)

# Label 'b' is new, so it becomes an all-NaN row; label 'e' is not requested
# and is therefore dropped.
df = df.reindex(list('abc'))

print(df)

print('*' * 50)

# fillna returns a new frame with every NaN replaced by 3; df is not modified.
print(df.fillna(3))

        one       two     three
a  0.703899  0.859852 -0.069021
c  0.990556  1.175574 -0.512586
e -0.093659 -1.733010 -0.469784
**************************************************
        one       two     three
a  0.703899  0.859852 -0.069021
b       NaN       NaN       NaN
c  0.990556  1.175574 -0.512586
**************************************************
        one       two     three
a  0.703899  0.859852 -0.069021
b  3.000000  3.000000  3.000000
c  0.990556  1.175574 -0.512586


### Drop missing values

In [36]:
import pandas as pd
import numpy as np

# Five complete rows of random values to begin with.
df = pd.DataFrame(
    np.random.randn(5, 3),
    index='a c e f h'.split(),
    columns='one two three'.split(),
)

print(df)

print('*' * 50)

# Stretch the index to 'a'..'h'; the added labels (b, d, g) have no data and
# show up as NaN rows, giving dropna() something to remove.
df = df.reindex('a b c d e f g h'.split())

print(df)

        one       two     three
a  0.439865  0.534521  0.939903
c  0.184990 -0.334263 -0.833076
e  0.940437 -0.395219 -0.018264
f  1.020597  0.872912 -1.341979
h  0.255568  0.455560  0.197888
**************************************************
        one       two     three
a  0.439865  0.534521  0.939903
b       NaN       NaN       NaN
c  0.184990 -0.334263 -0.833076
d       NaN       NaN       NaN
e  0.940437 -0.395219 -0.018264
f  1.020597  0.872912 -1.341979
g       NaN       NaN       NaN
h  0.255568  0.455560  0.197888


In [37]:
# dropna() works on rows by default: every row containing a NaN is removed.
print(df.dropna()) # by default drops row wise

        one       two     three
a  0.439865  0.534521  0.939903
c  0.184990 -0.334263 -0.833076
e  0.940437 -0.395219 -0.018264
f  1.020597  0.872912 -1.341979
h  0.255568  0.455560  0.197888


In [38]:
# axis=1 drops columns instead. Every column has NaN in rows b, d and g, so
# no column survives and the result is an empty frame.
print(df.dropna(axis=1)) # column wise

Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


## Data

Numbers

Categorical

## A Small Case Study

http://analytics-magazine.org/missing-values/

<b>Major takeaways:</b>
1. Missing values in the data create uncertainty for the analyst and the information consumer because decisions need to be made without having the full picture.
2. Missing values can also reduce the number of usable records for the analysis, or force analysts to eliminate variables from the analysis.
3. Consequently, if an observation has a missing value in any of the required variables, the whole observation (data record) needs to be omitted from the analysis.
4. Other options would be to exclude the affected variable from the analysis as a whole or to insert imputation values for the missing data points.
