In [1]:
# from Learning Pandas
# https://www.packtpub.com/packtlib/book/Application-Development/9781783985128/1/ch01lvl1sec11/Primary%20pandas%20objects

# import numpy and pandas, and DataFrame / Series
import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# Set some pandas options
pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

# And some items for matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
#pd.options.display.mpl_style = 'default' #issues with matplotlib
plt.style.use('default')

In [2]:
s = Series([1, 2, 3, 4])
s

0    1
1    2
2    3
3    4
dtype: int64

In [3]:
s[[1, 3]]

1    2
3    4
dtype: int64

In [4]:
s = Series([1, 2, 3, 4], 
           index = ['a', 'b', 'c', 'd'])
s

a    1
b    2
c    3
d    4
dtype: int64

In [5]:
s[['a', 'd']]

a    1
d    4
dtype: int64

In [6]:
s[[1, 2]]

b    2
c    3
dtype: int64

In [7]:
s.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [8]:
dates = pd.date_range('2014-07-01', '2014-07-06')
dates

DatetimeIndex(['2014-07-01', '2014-07-02', '2014-07-03', '2014-07-04',
               '2014-07-05', '2014-07-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
temps1 = Series([80, 82, 85, 90, 83, 87], 
                 index = dates)
temps1

2014-07-01    80
2014-07-02    82
2014-07-03    85
2014-07-04    90
2014-07-05    83
2014-07-06    87
Freq: D, dtype: int64

In [10]:
temps1.mean()

84.5

In [11]:
temps2 = Series([70, 75, 69, 83, 79, 77],
                index = dates)

temp_diffs = temps1 - temps2
temp_diffs

2014-07-01    10
2014-07-02     7
2014-07-03    16
2014-07-04     7
2014-07-05     4
2014-07-06    10
Freq: D, dtype: int64

In [12]:
temp_diffs['2014-07-03']

16

In [13]:
# Positional lookup (third item). Use .iloc explicitly: treating an integer key
# as a position via [] on a non-integer index is deprecated in recent pandas.
temp_diffs.iloc[2]

16

**DataFrame**

In [14]:
temps_df = DataFrame(
    {'Missoula': temps1,
     'Philadelphia': temps2})
temps_df

            Missoula  Philadelphia
2014-07-01        80            70
2014-07-02        82            75
2014-07-03        85            69
2014-07-04        90            83
2014-07-05        83            79
2014-07-06        87            77

In [15]:
temps_df['Missoula']

2014-07-01    80
2014-07-02    82
2014-07-03    85
2014-07-04    90
2014-07-05    83
2014-07-06    87
Freq: D, Name: Missoula, dtype: int64

In [16]:
temps_df['Philadelphia']

2014-07-01    70
2014-07-02    75
2014-07-03    69
2014-07-04    83
2014-07-05    79
2014-07-06    77
Freq: D, Name: Philadelphia, dtype: int64

In [17]:
temps_df[['Philadelphia', 'Missoula']]

            Philadelphia  Missoula
2014-07-01            70        80
2014-07-02            75        82
2014-07-03            69        85
2014-07-04            83        90
2014-07-05            79        83
2014-07-06            77        87

In [18]:
temps_df.Missoula

2014-07-01    80
2014-07-02    82
2014-07-03    85
2014-07-04    90
2014-07-05    83
2014-07-06    87
Freq: D, Name: Missoula, dtype: int64

In [19]:
temps_df.Missoula - temps_df.Philadelphia

2014-07-01    10
2014-07-02     7
2014-07-03    16
2014-07-04     7
2014-07-05     4
2014-07-06    10
Freq: D, dtype: int64

In [20]:
temps_df['Difference'] = temp_diffs
temps_df

            Missoula  Philadelphia  Difference
2014-07-01        80            70          10
2014-07-02        82            75           7
2014-07-03        85            69          16
2014-07-04        90            83           7
2014-07-05        83            79           4
2014-07-06        87            77          10

In [21]:
temps_df.columns

Index(['Missoula', 'Philadelphia', 'Difference'], dtype='object')

In [22]:
temps_df.Difference[1:4]

2014-07-02     7
2014-07-03    16
2014-07-04     7
Freq: D, Name: Difference, dtype: int64

In [23]:
temps_df.iloc[1]

Missoula        82
Philadelphia    75
Difference       7
Name: 2014-07-02 00:00:00, dtype: int64

In [24]:
# Labels of the row at position 1 (these are the DataFrame's column names).
# .iloc is used because the .ix indexer is deprecated and removed in pandas 1.0.
temps_df.iloc[1].index

Index(['Missoula', 'Philadelphia', 'Difference'], dtype='object')

In [25]:
temps_df.loc['2014-07-03']

Missoula        85
Philadelphia    69
Difference      16
Name: 2014-07-03 00:00:00, dtype: int64

In [26]:
temps_df.iloc[[1, 3, 5]].Difference

2014-07-02     7
2014-07-04     7
2014-07-06    10
Freq: 2D, Name: Difference, dtype: int64

In [27]:
temps_df.Missoula > 82

2014-07-01    False
2014-07-02    False
2014-07-03     True
2014-07-04     True
2014-07-05     True
2014-07-06     True
Freq: D, Name: Missoula, dtype: bool

In [28]:
temps_df[temps_df.Missoula > 82]

            Missoula  Philadelphia  Difference
2014-07-03        85            69          16
2014-07-04        90            83           7
2014-07-05        83            79           4
2014-07-06        87            77          10

Loading Data from files and the Web

In [29]:
!cat Data/pandas_test1.csv

cat: Data/pandas_test1.csv: No such file or directory


In [30]:
df = pd.read_csv('Data/pandas_test1.csv', skipinitialspace=True)
df

OSError: File b'Data/pandas_test1.csv' does not exist

In [None]:
df.date

In [None]:
df.date[0]

In [None]:
type(df.date[0])

In [None]:
df = pd.read_csv('Data/pandas_test1.csv', 
                 skipinitialspace=True,
                 parse_dates=['date'])
df

In [31]:
type(df.date[0])

NameError: name 'df' is not defined

In [32]:
df.index

NameError: name 'df' is not defined

In [33]:
df = pd.read_csv('Data/pandas_test1.csv', 
                 parse_dates=['date'],
                 skipinitialspace=True,
                 index_col='date')
df

OSError: File b'Data/pandas_test1.csv' does not exist

In [34]:
df.index

NameError: name 'df' is not defined

In [35]:
# Imports for reading data from Yahoo!
# NOTE: pandas.io.data is deprecated (see the warning this cell emits); the
# commented-out import below is the replacement once pandas-datareader is installed.
from pandas.io.data import DataReader
# from pandas_datareader.data import DataReader
from datetime import date
from dateutil.relativedelta import relativedelta

# Request "GOOG" from the "yahoo" source, starting three months before today.
goog = DataReader("GOOG", "yahoo",
                  date.today() +
                  relativedelta(months=-3))

# Show the last rows of the result.
goog.tail()

The pandas.io.data module is moved to a separate package (pandas-datareader) and will be removed from pandas in a future version.
After installing the pandas-datareader package (https://github.com/pydata/pandas-datareader), you can change the import ``from pandas.io import data, wb`` to ``from pandas_datareader import data, wb``.


                  Open        High         Low       Close   Volume  \
Date                                                                  
2016-02-18  710.000000  712.349976  696.030029  697.349976  1859100   
2016-02-19  695.030029  703.080994  694.049988  700.909973  1582300   
2016-02-22  707.450012  713.239990  702.510010  706.460022  1946100   
2016-02-23  701.450012  708.400024  693.580017  695.849976  1999700   
2016-02-24  688.919983  700.000000  680.780029  699.559998  1958600   

             Adj Close  
Date                    
2016-02-18  697.349976  
2016-02-19  700.909973  
2016-02-22  706.460022  
2016-02-23  695.849976  
2016-02-24  699.559998  

In [36]:
# Imports for reading data from the "google" source
# NOTE: pandas.io.data is deprecated; the commented-out import below is the
# replacement once pandas-datareader is installed.
from pandas.io.data import DataReader
# from pandas_datareader.data import DataReader
from datetime import date
from dateutil.relativedelta import relativedelta

# Request "GOOG" from the "google" source, starting three months before today.
googG = DataReader("GOOG", "google",
                  date.today() +
                  relativedelta(months=-3))

# Show the last rows of the result.
googG.tail()

              Open    High     Low   Close   Volume
Date                                               
2016-02-18  710.00  712.35  696.03  697.35  1859130
2016-02-19  695.03  703.08  694.05  700.91  1582260
2016-02-22  707.45  713.24  702.51  706.46  1946067
2016-02-23  701.45  708.40  693.58  695.85  1999699
2016-02-24  688.92  700.00  680.78  699.56  1963573

In [37]:
# Imports for reading data from the "fred" source
# NOTE: pandas.io.data is deprecated; the commented-out import below is the
# replacement once pandas-datareader is installed.
from pandas.io.data import DataReader
# from pandas_datareader.data import DataReader
from datetime import date
from dateutil.relativedelta import relativedelta

# Request the "VIXCLS" series from the "fred" source, starting three months before today.
fred = DataReader("VIXCLS", "fred",
                  date.today() +
                  relativedelta(months=-3))

# Show the last rows of the result.
fred.tail()

            VIXCLS
DATE              
2016-02-17   22.31
2016-02-18   21.64
2016-02-19   20.53
2016-02-22   19.38
2016-02-23   20.98

In [38]:
goog.plot(y='Adj Close')

<matplotlib.axes._subplots.AxesSubplot at 0x10b92dbe0>

## Chapter 3 ##

NumPy for pandas

In [39]:
import numpy as np

In [40]:
def squares(values):
    """Return a new list containing the square of every item in ``values``.

    This is the pure-Python baseline; each element is multiplied by itself
    one at a time, in iteration order.
    """
    return [item * item for item in values]
    
    
to_square = range(100000)

%timeit squares(to_square)

100 loops, best of 3: 16.4 ms per loop


In [41]:
# NumPy vectorization

array_to_square = np.arange(0, 100000)
%timeit array_to_square ** 2

The slowest run took 6.80 times longer than the fastest. This could mean that an intermediate result is being cached 
10000 loops, best of 3: 80.6 µs per loop


In [42]:
a1 = np.array([1, 2, 3, 4, 5])
a1

array([1, 2, 3, 4, 5])

In [43]:
# n-dimensional array
type(a1)

numpy.ndarray

In [44]:
np.size(a1)

5

In [45]:
a2 = np.array([1, 2, 3, 4.0, 5.0])
a2

array([ 1.,  2.,  3.,  4.,  5.])

In [46]:
a2.dtype

dtype('float64')

In [47]:
a3 = np.array([0] * 10)
a3

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [48]:
np.array(range(10))

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [49]:
np.zeros(10)

array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [50]:
np.zeros(10, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [51]:
np.arange(0, 10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [52]:
np.arange(0, 10, 2)

array([0, 2, 4, 6, 8])

In [53]:
np.arange(10, 0, -1)

array([10,  9,  8,  7,  6,  5,  4,  3,  2,  1])

In [54]:
np.linspace(0, 10, 4)

array([  0.        ,   3.33333333,   6.66666667,  10.        ])

In [55]:
a1 = np.arange(0, 10)
a1 * 2

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [56]:
a2 = np.arange(10, 20)
a1 + a2

array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28])

In [57]:
np.array([[1, 2], [3,4]])

array([[1, 2],
       [3, 4]])

In [58]:
m = np.arange(0, 20).reshape(5, 4)
m

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [59]:
np.size(m)

20

In [60]:
np.size(m, 0)

5

In [61]:
np.size(m, 1)

4

In [62]:
m[1, 2]

6

In [63]:
m[1,]

array([4, 5, 6, 7])

In [64]:
m[:, 2]

array([ 2,  6, 10, 14, 18])

In [65]:
a = np.arange(5)
a < 2

array([ True,  True, False, False, False], dtype=bool)

In [66]:
( a < 2 ) | ( a > 3 )

array([ True,  True, False, False,  True], dtype=bool)

In [67]:
def exp(x):
    """Return a truthy result when ``x`` is less than 3 or greater than 3.

    The ``x < 3`` comparison is evaluated first and returned if truthy;
    otherwise the result of ``x > 3`` is returned.
    """
    is_below = x < 3
    return is_below if is_below else x > 3
np.vectorize(exp)(a)

array([ True,  True,  True, False,  True], dtype=bool)

In [68]:
# Boolean selection
r = a < 3
a[r]

array([0, 1, 2])

In [69]:
np.sum(a < 3)

3

In [70]:
a1 = np.arange(0, 5)
a2 = np.arange(5, 0, -1)
a1 < a2

array([ True,  True,  True, False, False], dtype=bool)

In [71]:
a1 = np.arange(9).reshape(3, 3)
a2 = np.arange(9, 0, -1).reshape(3, 3)
a1 < a2

array([[ True,  True,  True],
       [ True,  True, False],
       [False, False, False]], dtype=bool)

In [72]:
a1 = np.arange(1, 10)
a1[3:8]

array([4, 5, 6, 7, 8])

In [73]:
a1[::2]

array([1, 3, 5, 7, 9])

In [74]:
a1[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1])

In [75]:
a1[9:0:-1]

array([9, 8, 7, 6, 5, 4, 3, 2])

In [76]:
a1[5:]

array([6, 7, 8, 9])

In [77]:
a1[:5]

array([1, 2, 3, 4, 5])

In [78]:
m

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [79]:
# all rows, column position 1
m[:,1]

array([ 1,  5,  9, 13, 17])

In [80]:
m[:,1:3]

array([[ 1,  2],
       [ 5,  6],
       [ 9, 10],
       [13, 14],
       [17, 18]])

In [81]:
m[3:5, :]

array([[12, 13, 14, 15],
       [16, 17, 18, 19]])

In [82]:
m[3:5, 1:3]

array([[13, 14],
       [17, 18]])

In [83]:
m[[1,3,4],:]

array([[ 4,  5,  6,  7],
       [12, 13, 14, 15],
       [16, 17, 18, 19]])

In [84]:
a = np.arange(0, 9)
m = a.reshape(3, 3)
m

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [85]:
reshaped = m.reshape(9)
reshaped

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [86]:
raveled = m.ravel()
raveled

array([0, 1, 2, 3, 4, 5, 6, 7, 8])

In [87]:
m

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [88]:
reshaped = m.reshape(np.size(m))
raveled = m.ravel()
reshaped[2] = 1000
raveled[5] = 2000
m

array([[   0,    1, 1000],
       [   3,    4, 2000],
       [   6,    7,    8]])

In [89]:
# flattened makes a copy
m2 = np.arange(0, 9).reshape(3, 3)
flattened = m2.flatten()
flattened[0] = 1000
flattened

array([1000,    1,    2,    3,    4,    5,    6,    7,    8])

In [90]:
m2

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [91]:
flattened.shape

(9,)

In [92]:
flattened.shape = (3, 3)
flattened

array([[1000,    1,    2],
       [   3,    4,    5],
       [   6,    7,    8]])

In [93]:
flattened.transpose()

array([[1000,    3,    6],
       [   1,    4,    7],
       [   2,    5,    8]])

In [94]:
flattened.T

array([[1000,    3,    6],
       [   1,    4,    7],
       [   2,    5,    8]])

In [95]:
m = np.arange(0, 9).reshape(3, 3)
m.resize(1, 9)
m

array([[0, 1, 2, 3, 4, 5, 6, 7, 8]])

In [96]:
a = np.arange(9).reshape(3, 3)
b = (a + 1) * 10
a

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [97]:
b

array([[10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

In [98]:
np.hstack((a, b))

array([[ 0,  1,  2, 10, 20, 30],
       [ 3,  4,  5, 40, 50, 60],
       [ 6,  7,  8, 70, 80, 90]])

In [99]:
np.concatenate((a, b), axis = 1)

array([[ 0,  1,  2, 10, 20, 30],
       [ 3,  4,  5, 40, 50, 60],
       [ 6,  7,  8, 70, 80, 90]])

In [100]:
np.vstack((a, b))

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

In [101]:
np.concatenate((a, b), axis = 0)

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [10, 20, 30],
       [40, 50, 60],
       [70, 80, 90]])

In [102]:
np.dstack((a, b))

array([[[ 0, 10],
        [ 1, 20],
        [ 2, 30]],

       [[ 3, 40],
        [ 4, 50],
        [ 5, 60]],

       [[ 6, 70],
        [ 7, 80],
        [ 8, 90]]])

In [103]:
one_d_a = np.arange(5)
one_d_a

array([0, 1, 2, 3, 4])

In [104]:
one_d_b = (one_d_a * 10)
one_d_b

array([ 0, 10, 20, 30, 40])

In [105]:
np.column_stack((one_d_a, one_d_b))

array([[ 0,  0],
       [ 1, 10],
       [ 2, 20],
       [ 3, 30],
       [ 4, 40]])

In [106]:
# Stack the two 1-D arrays as rows of a 2-D array.
# np.vstack is used because the np.row_stack alias is deprecated in NumPy 2.0.
np.vstack((one_d_a, one_d_b))

array([[ 0,  1,  2,  3,  4],
       [ 0, 10, 20, 30, 40]])

In [107]:
a = np.arange(12).reshape(3, 4)
a

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [108]:
np.hsplit(a, 4)

[array([[0],
        [4],
        [8]]), array([[1],
        [5],
        [9]]), array([[ 2],
        [ 6],
        [10]]), array([[ 3],
        [ 7],
        [11]])]

In [109]:
np.hsplit(a, 2)

[array([[0, 1],
        [4, 5],
        [8, 9]]), array([[ 2,  3],
        [ 6,  7],
        [10, 11]])]

In [110]:
np.hsplit(a, [1, 3])

[array([[0],
        [4],
        [8]]), array([[ 1,  2],
        [ 5,  6],
        [ 9, 10]]), array([[ 3],
        [ 7],
        [11]])]

In [111]:
np.split(a, 2, axis = 1)

[array([[0, 1],
        [4, 5],
        [8, 9]]), array([[ 2,  3],
        [ 6,  7],
        [10, 11]])]

In [112]:
a = np.arange(12).reshape(4, 3)
a

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [113]:
np.vsplit(a, 4)

[array([[0, 1, 2]]),
 array([[3, 4, 5]]),
 array([[6, 7, 8]]),
 array([[ 9, 10, 11]])]

In [114]:
np.vsplit(a, 2)

[array([[0, 1, 2],
        [3, 4, 5]]), array([[ 6,  7,  8],
        [ 9, 10, 11]])]

In [115]:
np.vsplit(a, [1, 3])

[array([[0, 1, 2]]), array([[3, 4, 5],
        [6, 7, 8]]), array([[ 9, 10, 11]])]

In [116]:
np.split(a, 2, axis=0)

[array([[0, 1, 2],
        [3, 4, 5]]), array([[ 6,  7,  8],
        [ 9, 10, 11]])]

In [117]:
c = np.arange(27).reshape(3, 3, 3)
c

array([[[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8]],

       [[ 9, 10, 11],
        [12, 13, 14],
        [15, 16, 17]],

       [[18, 19, 20],
        [21, 22, 23],
        [24, 25, 26]]])

In [118]:
np.dsplit(c, 3)

[array([[[ 0],
         [ 3],
         [ 6]],
 
        [[ 9],
         [12],
         [15]],
 
        [[18],
         [21],
         [24]]]), array([[[ 1],
         [ 4],
         [ 7]],
 
        [[10],
         [13],
         [16]],
 
        [[19],
         [22],
         [25]]]), array([[[ 2],
         [ 5],
         [ 8]],
 
        [[11],
         [14],
         [17]],
 
        [[20],
         [23],
         [26]]])]

In [119]:
# Build a 3x3 matrix holding 10..18 and report its extremes.
m = np.arange(10, 19).reshape(3, 3)
print(m)
# Extremes over the whole matrix.
print("{0} min of the entire matrix".format(m.min()))
print("{0} max of the entire matrix".format(m.max()))
# argmin/argmax are called without an axis, so each yields a single index
# (0 and 8 for this matrix).
print("{0} position of the min value".format(m.argmin()))
print("{0} position of the max value".format(m.argmax()))
# axis = 0 reduces down each column; axis = 1 reduces across each row.
print("{0} mins down each column".format(m.min(axis = 0)))
print("{0} mins across each row".format(m.min(axis = 1)))
print("{0} maxs down each column".format(m.max(axis = 0)))
print("{0} maxs across each row".format(m.max(axis = 1)))

[[10 11 12]
 [13 14 15]
 [16 17 18]]
10 min of the entire matrix
18 max of the entire matrix
0 position of the min value
8 position of the max value
[10 11 12] mins down each column
[10 13 16] mins across each row
[16 17 18] maxs down each column
[12 15 18] maxs across each rox


In [120]:
a = np.arange(10)
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [121]:
a.mean(), a.std(), a.var()

(4.5, 2.8722813232690143, 8.25)

In [122]:
a = np.arange(1, 6)
a

array([1, 2, 3, 4, 5])

In [123]:
a.sum(), a.prod()

(15, 120)

In [124]:
a.cumsum(), a.cumprod()

(array([ 1,  3,  6, 10, 15]), array([  1,   2,   6,  24, 120]))

In [125]:
a = np.arange(10)
(a < 5).any() # any < 5?

True

In [126]:
np.arange(10).reshape(2, 5).size

10

## Chapter 4 ##

pandas Series Object

In [127]:
import numpy as np
import pandas as pd

pd.set_option('display.notebook_repr_html', False)
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

In [128]:
s1 = pd.Series(2)
s1

0    2
dtype: int64

In [129]:
testSeries = pd.Series({"name": "my name", "address":"my address"})
testSeries

address    my address
name          my name
dtype: object

In [130]:
s1[0]

2

In [131]:
testSeries['name']

'my name'

In [132]:
s2 = pd.Series([1, 2, 3, 4, 5])
s2

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [133]:
s2.values

array([1, 2, 3, 4, 5])

In [134]:
s2.index

Int64Index([0, 1, 2, 3, 4], dtype='int64')

In [135]:
s3 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s3

a    1
b    2
c    3
dtype: int64

In [136]:
s3.index

Index(['a', 'b', 'c'], dtype='object')

In [137]:
s4 = pd.Series(2, index=s2.index)
s4

0    2
1    2
2    2
3    2
4    2
dtype: int64

In [138]:
np.random.seed(123456)
pd.Series(np.random.random(5))

0    0.126970
1    0.966718
2    0.260476
3    0.897237
4    0.376750
dtype: float64

In [139]:
pd.Series(np.linspace(0, 90, 10))

0     0
1    10
2    20
3    30
4    40
5    50
6    60
7    70
8    80
9    90
dtype: float64

In [140]:
pd.Series(np.arange(0, 9))

0    0
1    1
2    2
3    3
4    4
5    5
6    6
7    7
8    8
dtype: int64

In [141]:
s6 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 4})
s6

a    1
b    2
c    3
d    4
dtype: int64

Size, shape, uniqueness, and counts

In [142]:
s = pd.Series([0, 1, 1, 2, 3, 4, 5, 6, 7, np.nan])
s

0     0
1     1
2     1
3     2
4     3
5     4
6     5
7     6
8     7
9   NaN
dtype: float64

In [143]:
len(s)

10

In [144]:
s.size

10

In [145]:
s.shape

(10,)

In [146]:
s.count() # does not include Nan

9

In [147]:
s.unique()

array([  0.,   1.,   2.,   3.,   4.,   5.,   6.,   7.,  nan])

In [148]:
s.value_counts() # count of each unique items

1    2
7    1
6    1
5    1
4    1
3    1
2    1
0    1
dtype: int64

Heads, tails, take

In [149]:
s.head()

0    0
1    1
2    1
3    2
4    3
dtype: float64

In [150]:
s.head(3)

0    0
1    1
2    1
dtype: float64

In [151]:
s.tail()

5     4
6     5
7     6
8     7
9   NaN
dtype: float64

In [152]:
s.tail(3)

7     6
8     7
9   NaN
dtype: float64

In [153]:
s.take([0, 3, 9])

0     0
3     2
9   NaN
dtype: float64

In [154]:
s3['a']

1

In [155]:
s3[1]

2

In [156]:
s3[['a', 'c']]

a    1
c    3
dtype: int64

In [157]:
s5 = pd.Series([1, 2, 3], index=[10, 11, 12])
s5

10    1
11    2
12    3
dtype: int64

In [158]:
s5[11]

2

In [159]:
s5.loc[11] # force index-based lookup

2

In [160]:
s5.iloc[1] #force position-based lookup

2

In [161]:
s5.loc[[12, 10]]

12    3
10    1
dtype: int64

In [162]:
s5.iloc[[0, 2]]

10    1
12    3
dtype: int64

In [163]:
# Label-based lookup where label 0 does not exist: the missing label comes back as NaN.
# .reindex is used because .loc with a list that contains a missing label
# raises KeyError in current pandas.
s5.reindex([0, 12])
#s5.iloc[[0, 12]] # throws exception

# NOTE(review): the original note here claimed ".loc is also faster than .iloc" — unverified.

0    NaN
12     3
dtype: float64

In [164]:
s3

a    1
b    2
c    3
dtype: int64

`.ix` is frowned upon (it was deprecated in pandas 0.20 and removed in pandas 1.0). Use `.loc` and `.iloc`, which are clearer and faster.

In [165]:
s3.ix[['a', 'c']]

a    1
c    3
dtype: int64

In [166]:
s3.ix[[1, 2]]

b    2
c    3
dtype: int64

In [167]:
s5.ix[[1, 2, 10, 11]]

1    NaN
2    NaN
10     1
11     2
dtype: float64

In [168]:
s6 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s6

a    1
b    2
c    3
d    4
dtype: int64

In [169]:
s7 = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a'])
s7

d    4
c    3
b    2
a    1
dtype: int64

In [170]:
s6 + s7

a    2
b    4
c    6
d    8
dtype: int64

In [171]:
a1 = np.array([1, 2, 3, 4])
a2 = np.array([4, 3, 2, 1])
a1 + a2

array([5, 5, 5, 5])

In [172]:
s3 * 2

a    2
b    4
c    6
dtype: int64

In [173]:
# not as efficient
t = pd.Series(2, s3.index)
s3 * t

a    2
b    4
c    6
dtype: int64

In [174]:
s8 = pd.Series({'a': 1, 'b': 2, 'c': 3, 'd': 5})
s8

a    1
b    2
c    3
d    5
dtype: int64

In [175]:
s9 = pd.Series({'b': 6, 'c': 7, 'd': 9, 'e': 10})
s9

b     6
c     7
d     9
e    10
dtype: int64

In [176]:
s8 + s9 # if an operator can not be applied, Nan results

a   NaN
b     8
c    10
d    14
e   NaN
dtype: float64

In [177]:
s10 = pd.Series([1.0, 2.0, 3.0], index=['a', 'a', 'b']) 
s10

a    1
a    2
b    3
dtype: float64

In [178]:
s11 = pd.Series([4.0, 5.0, 6.0], index=['a', 'a', 'c'])
s11

a    4
a    5
c    6
dtype: float64

In [179]:
s10 + s11 # by Cartesian product, every combination of matching labels is calculated

a     5
a     6
a     6
a     7
b   NaN
c   NaN
dtype: float64

NaN

In [180]:
nda = np.array([1, 2, 3, 4, 5])
nda.mean()

3.0

In [181]:
# An array containing NaN: ndarray.mean() does not skip NaN, so the mean is NaN.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
nda = np.array([1, 2, 3, 4, np.nan])
nda.mean()

nan

In [182]:
s = pd.Series(nda)
s.mean()

2.5

In [183]:
s.mean(skipna=False) # supposed to give nan

nan

In [184]:
s = pd.Series(np.arange(0, 10))
s > 5

0    False
1    False
2    False
3    False
4    False
5    False
6     True
7     True
8     True
9     True
dtype: bool

In [185]:
logicalResults = s > 5
s[logicalResults]

6    6
7    7
8    8
9    9
dtype: int64

In [186]:
s[ s>5 ] 

6    6
7    7
8    8
9    9
dtype: int64

In [187]:
s[(s > 5) & (s < 8)] # not s[s > 5 and s < 8]

6    6
7    7
dtype: int64

In [188]:
(s >= 0).all()

True

In [189]:
s[s<2].any()

True

In [190]:
(s < 2).any() # alternative syntax

True

In [191]:
s[s<2]

0    0
1    1
dtype: int64

In [192]:
(s < 2).sum() # True = 1, False = 0

2

# Reindexing #

In [193]:
s = pd.Series(np.random.randn(5))
s

0   -0.589863
1   -1.986831
2   -2.173147
3    0.736309
4   -1.066293
dtype: float64

In [194]:
s.index = ['a', 'b', 'c', 'd', 'e']
s

a   -0.589863
b   -1.986831
c   -2.173147
d    0.736309
e   -1.066293
dtype: float64

In [195]:
np.random.seed(123456)
s1 = pd.Series(np.random.randn(3))
s2 = pd.Series(np.random.randn(3))
combined = pd.concat([s1, s2])
combined

0    0.469112
1   -0.282863
2   -1.509059
0   -1.135632
1    1.212112
2   -0.173215
dtype: float64

In [196]:
combined.index = np.arange(0, len(combined))
combined

0    0.469112
1   -0.282863
2   -1.509059
3   -1.135632
4    1.212112
5   -0.173215
dtype: float64

In [197]:
np.random.seed(123456)
s1 = pd.Series(np.random.randn(4), ['a', 'b', 'c', 'd'])
s2 = s1.reindex(['a', 'c', 'g'])
s2              

a    0.469112
c   -1.509059
g         NaN
dtype: float64

In [198]:
s1

a    0.469112
b   -0.282863
c   -1.509059
d   -1.135632
dtype: float64

In [199]:
s1 = pd.Series([0, 1, 2], index=[0, 1, 2])
s2 = pd.Series([3, 4, 5], index=['0', '1', '2'])
s1 + s2

0   NaN
1   NaN
2   NaN
0   NaN
1   NaN
2   NaN
dtype: float64

In [200]:
s2.index.values

array(['0', '1', '2'], dtype=object)

In [201]:
s2.index = s2.index.values.astype(int)
s1 + s2

0    3
1    5
2    7
dtype: int64

In [202]:
s2.index.values

array([0, 1, 2])

In [203]:
s2 = s.copy()
s2.reindex(['a', 'f'], fill_value=0)

a   -0.589863
f    0.000000
dtype: float64

In [204]:
s3 = pd.Series(['red', 'green', 'blue'], index=[0, 3, 5])
s3

0      red
3    green
5     blue
dtype: object

In [205]:
s3.reindex(np.arange(0,7), method='ffill') # fill with last value

0      red
1      red
2      red
3    green
4    green
5     blue
6     blue
dtype: object

In [206]:
s3.reindex(np.arange(0,7), method='bfill') # fill with next value

0      red
1    green
2    green
3    green
4     blue
5     blue
6      NaN
dtype: object

In [207]:
# It is generally preferable to return a new series instead of modifying in place
np.random.seed(123456)
s = pd.Series(np.random.randn(3), index=['a', 'b', 'c'])
s

a    0.469112
b   -0.282863
c   -1.509059
dtype: float64

In [208]:
s['d'] = 100
s

a      0.469112
b     -0.282863
c     -1.509059
d    100.000000
dtype: float64

In [209]:
s['d'] = -100
s

a      0.469112
b     -0.282863
c     -1.509059
d   -100.000000
dtype: float64

In [210]:
# Remove the entry labelled 'a' from the series in place, then display what is left.
del s['a']
s

b     -0.282863
c     -1.509059
d   -100.000000
dtype: float64

In [211]:
s = pd.Series(np.arange(100, 110), index=np.arange(10, 20))
s

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
18    108
19    109
dtype: int64

In [212]:
s[0:6:2] 
# start at position 0 (inclusive), stop before position 6 (exclusive), step by 2

10    100
12    102
14    104
dtype: int64

In [213]:
s.iloc[[0, 2, 4]]

10    100
12    102
14    104
dtype: int64

In [214]:
s[:5] # First 5

10    100
11    101
12    102
13    103
14    104
dtype: int64

In [215]:
s[4:] # from position 4 through the end

14    104
15    105
16    106
17    107
18    108
19    109
dtype: int64

In [216]:
s[:5:2] # every other in first 5

10    100
12    102
14    104
dtype: int64

In [217]:
s[4::2] # every other item, starting at position 4 (the fifth item)

14    104
16    106
18    108
dtype: int64

In [218]:
s[::-1] # reverse

19    109
18    108
17    107
16    106
15    105
14    104
13    103
12    102
11    101
10    100
dtype: int64

In [219]:
s[4::-2] # every other from 4 in reverse

14    104
12    102
10    100
dtype: int64

In [220]:
s[:-2]  # all but last 2

10    100
11    101
12    102
13    103
14    104
15    105
16    106
17    107
dtype: int64

In [221]:
s[-4:-1] # all in last four but last

16    106
17    107
18    108
dtype: int64

In [222]:
# Take a copy of `s`, then slice the first two items of that copy.
# NOTE: the names `copy` and `slice` shadow the stdlib module name and the
# built-in `slice` type for the rest of the session.
copy = s.copy()
slice = copy[:2]
slice

10    100
11    101
dtype: int64

In [223]:
slice[11] = 1000  # changing a slice changes the original
# NOTE(review): this relies on the slice sharing data with `copy`; under pandas
# Copy-on-Write the write would presumably not propagate — confirm for the pandas version in use.
copy

10     100
11    1000
12     102
13     103
14     104
15     105
16     106
17     107
18     108
19     109
dtype: int64

In [224]:
s = pd.Series(np.arange(0, 5), 
              index=['a', 'b', 'c', 'd', 'e'])
s            

a    0
b    1
c    2
d    3
e    4
dtype: int64

In [225]:
s[1:3]

b    1
c    2
dtype: int64

In [226]:
s['b':'d']

b    1
c    2
d    3
dtype: int64