In [3]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'2.2.3'

In [4]:
np.__version__

'2.0.2'

### Series - the pandas equivalent of a column of data (like a SQL column): a NumPy array with additional features
A Series is built on top of a NumPy array and adds
an index and an optional name. Two or more Series together form a DataFrame.

In [6]:
# Sample sales figures.
sales = [0, 5, 155, 0, 518, 0, 1827, 616, 317, 325]

# Build a named Series from the list; no index is passed, so pandas assigns the default one.
sales_series = pd.Series(data=sales, name="Sales")

sales_series

0       0
1       5
2     155
3       0
4     518
5       0
6    1827
7     616
8     317
9     325
Name: Sales, dtype: int64

# properties of series 
- index
- values
- name
- dtype

In [7]:
sales_series.values

array([   0,    5,  155,    0,  518,    0, 1827,  616,  317,  325])

In [8]:
sales_series.index

RangeIndex(start=0, stop=10, step=1)

In [9]:
sales_series.name

'Sales'

In [10]:
sales_series.dtype

dtype('int64')

In [15]:
array = np.arange(5)

series = pd.Series(array)

In [13]:
pd.Series(np.arange(5), name="Test array")

0    0
1    1
2    2
3    3
4    4
Name: Test array, dtype: int64

In [14]:
pd.Series(np.arange(6).reshape(3,2), name="Test array") # error: a Series must be 1-dimensional

ValueError: Data must be 1-dimensional, got ndarray of shape (3, 2) instead

In [16]:
series.values

array([0, 1, 2, 3, 4])

In [17]:
series.values.mean()

np.float64(2.0)

In [18]:
series.mean()

np.float64(2.0)

In [19]:
series.index

RangeIndex(start=0, stop=5, step=1)

In [20]:
series.index = [10, 20 , 30 , 40 , 50]
series 

10    0
20    1
30    2
40    3
50    4
dtype: int64

In [22]:
series.name = 'special series'

In [23]:
series

10    0
20    1
30    2
40    3
50    4
Name: special series, dtype: int64

### pandas data types
Numeric types are 64-bit by default; booleans are stored in the backend as 0 and 1.
- object / text (string, category)
- time series (datetime)

In [26]:
sales_series

0       0
1       5
2     155
3       0
4     518
5       0
6    1827
7     616
8     317
9     325
Name: Sales, dtype: int64

In [27]:
sales_series.astype("float")

0       0.0
1       5.0
2     155.0
3       0.0
4     518.0
5       0.0
6    1827.0
7     616.0
8     317.0
9     325.0
Name: Sales, dtype: float64

In [28]:
sales_series.astype("bool")

0    False
1     True
2     True
3    False
4     True
5    False
6     True
7     True
8     True
9     True
Name: Sales, dtype: bool

In [29]:
sales_series.astype("datetime64") # error: the dtype needs a unit, e.g. "datetime64[ns]"

ValueError: The 'datetime64' dtype has no unit. Please pass in 'datetime64[ns]' instead.

In [32]:
series = pd.Series(range(5))
series

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [33]:
series.astype("float")

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [34]:
series.astype("bool")

0    False
1     True
2     True
3     True
4     True
dtype: bool

In [35]:
series.astype("object")

0    0
1    1
2    2
3    3
4    4
dtype: object

In [36]:
series.astype("string")

0    0
1    1
2    2
3    3
4    4
dtype: string

In [37]:
series.astype("bool").mean()

np.float64(0.8)

In [38]:
series.astype("bool").sum()

np.int64(4)

In [39]:
series.astype("string").mean() # error

TypeError: Cannot perform reduction 'mean' with string dtype

In [43]:
string_series = pd.Series(['a','b','c'])
string_series

0    a
1    b
2    c
dtype: object

In [44]:
string_series.astype("int") # error

ValueError: invalid literal for int() with base 10: 'a'

### Exercise 1

In [47]:
# create a Dataframe from the oil file, drop missing values
oil = pd.read_csv("./retail/oil.csv").dropna()

oil.head()

Unnamed: 0,date,dcoilwtico
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2
5,2013-01-08,93.21


In [48]:
# grab 100 rows from oil prices 
oil_array = np.array(oil["dcoilwtico"].iloc[1000:1100])
oil_array

array([52.22, 51.44, 51.98, 52.01, 52.82, 54.01, 53.8 , 53.75, 52.36,
       53.26, 53.77, 53.98, 51.95, 50.82, 52.19, 53.01, 52.36, 52.45,
       51.12, 51.39, 52.33, 52.77, 52.38, 52.14, 53.24, 53.18, 52.63,
       52.75, 53.9 , 53.55, 53.81, 53.01, 52.19, 52.37, 52.99, 53.84,
       52.96, 53.21, 53.11, 53.41, 53.41, 54.02, 53.61, 54.48, 53.99,
       54.04, 54.  , 53.82, 52.63, 53.33, 53.19, 52.68, 49.83, 48.75,
       48.05, 47.95, 47.24, 48.34, 48.3 , 48.34, 47.79, 47.02, 47.29,
       47.  , 47.3 , 47.02, 48.36, 49.47, 50.3 , 50.54, 50.25, 50.99,
       51.14, 51.69, 52.25, 53.06, 53.38, 53.12, 53.19, 52.62, 52.46,
       50.49, 50.26, 49.64, 48.9 , 49.22, 49.22, 48.96, 49.31, 48.83,
       47.65, 47.79, 45.55, 46.23, 46.46, 45.84, 47.28, 47.81, 47.83,
       48.86])

In [50]:
# convert oil_array into panda oil_series
oil_series = pd.Series(oil_array, name="oil prices")
oil_series

0     52.22
1     51.44
2     51.98
3     52.01
4     52.82
      ...  
95    45.84
96    47.28
97    47.81
98    47.83
99    48.86
Name: oil prices, Length: 100, dtype: float64

In [52]:
oil_series.values

array([52.22, 51.44, 51.98, 52.01, 52.82, 54.01, 53.8 , 53.75, 52.36,
       53.26, 53.77, 53.98, 51.95, 50.82, 52.19, 53.01, 52.36, 52.45,
       51.12, 51.39, 52.33, 52.77, 52.38, 52.14, 53.24, 53.18, 52.63,
       52.75, 53.9 , 53.55, 53.81, 53.01, 52.19, 52.37, 52.99, 53.84,
       52.96, 53.21, 53.11, 53.41, 53.41, 54.02, 53.61, 54.48, 53.99,
       54.04, 54.  , 53.82, 52.63, 53.33, 53.19, 52.68, 49.83, 48.75,
       48.05, 47.95, 47.24, 48.34, 48.3 , 48.34, 47.79, 47.02, 47.29,
       47.  , 47.3 , 47.02, 48.36, 49.47, 50.3 , 50.54, 50.25, 50.99,
       51.14, 51.69, 52.25, 53.06, 53.38, 53.12, 53.19, 52.62, 52.46,
       50.49, 50.26, 49.64, 48.9 , 49.22, 49.22, 48.96, 49.31, 48.83,
       47.65, 47.79, 45.55, 46.23, 46.46, 45.84, 47.28, 47.81, 47.83,
       48.86])

In [53]:
oil_series.name

'oil prices'

In [54]:
oil_series.index

RangeIndex(start=0, stop=100, step=1)

In [55]:
oil_series.dtype

dtype('float64')

In [58]:
oil_series.size

100

In [56]:
# take the mean of values array
oil_series.values.mean()

np.float64(51.128299999999996)

In [59]:
oil_series.mean()

np.float64(51.128299999999996)

In [57]:
#convert the series to integers and recalculate the mean
oil_series.astype("int").mean()

np.float64(50.66)

### Index 
- pandas allows a custom index (for example string labels) instead of the default integer one

In [61]:
sales = [0,5,155,0,518]
sales_series = pd.Series(sales , name = "Sales")
sales_series

0      0
1      5
2    155
3      0
4    518
Name: Sales, dtype: int64

In [62]:
sales_series[2]

np.int64(155)

In [63]:
sales_series[2:4]

2    155
3      0
Name: Sales, dtype: int64

In [64]:
sales = [0,5,155,0,518]
items = ["coffe","banana","tea","coconut","sugar"]
sales_series = pd.Series(sales ,index=items, name = "Sales")
sales_series

coffe        0
banana       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [65]:
sales_series["tea"]

np.int64(155)

In [67]:
sales_series["banana":"coconut"] # stop point is inclusive

banana       5
tea        155
coconut      0
Name: Sales, dtype: int64

In [68]:
my_series = pd.Series(range(5))

my_series

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [69]:
my_series[3]

np.int64(3)

In [70]:
my_series[1:3]

1    1
2    2
dtype: int64

In [72]:
my_series[1::2] # step 2

1    1
3    3
dtype: int64

In [73]:
# Label each of the five values with a "Day N" string instead of the default integer index.
day_labels = [f"Day {n}" for n in range(5)]
my_series = pd.Series(range(5), index=day_labels)

my_series

Day 0    0
Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [74]:
my_series["Day 2"]

np.int64(2)

In [75]:
my_series["Day 2":"Day 4"]

Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [76]:
my_series[::2]

Day 0    0
Day 2    2
Day 4    4
dtype: int64

### iloc method
- more efficient than plain `[]` slicing
- works by position, even when the index is not integer-based
- `df.iloc[row position, column position]` -- the row position can be a single row (e.g. `0`), multiple rows, or a range of rows

In [77]:
sales_series

coffe        0
banana       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [79]:
sales_series.iloc[2]

np.int64(155)

In [81]:
sales_series.iloc[2:4] # exclude stop point

tea        155
coconut      0
Name: Sales, dtype: int64

In [82]:
my_series

Day 0    0
Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [83]:
my_series.iloc[2]

np.int64(2)

In [86]:
my_series.iloc[[1,3,4]]

Day 1    1
Day 3    3
Day 4    4
dtype: int64

In [89]:
my_series.iloc[1:]

Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

### loc method
- `df.loc[row label, column label]` -- the row label can be a single row, multiple rows, or a range of rows

In [90]:
sales_series

coffe        0
banana       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [91]:
sales_series.loc["tea"]

np.int64(155)

In [92]:
sales_series.loc["banana":"coconut"]

banana       5
tea        155
coconut      0
Name: Sales, dtype: int64

In [93]:
my_series

Day 0    0
Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [95]:
my_series.loc["Day 2"]

np.int64(2)

In [96]:
my_series.loc["Day 1":"Day 3"]  # label-based slice through .loc; the stop label is inclusive

Day 1    1
Day 2    2
Day 3    3
dtype: int64

In [97]:
my_series.index = [0,2,3,100,5]

In [98]:
my_series

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [105]:
my_series[ my_series != 2]

0      0
2      1
100    3
5      4
dtype: int64

In [99]:
my_series.loc[0:3]

0    0
2    1
3    2
dtype: int64

In [102]:
my_series.loc[0:5] # .loc slices by label, not position: every row from label 0 through label 5 (inclusive) is returned, including label 100

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [104]:
my_series.reset_index(drop=True).loc[:3]

0    0
1    1
2    2
3    3
dtype: int64

### duplicate index values
- `.loc` returns every row whose label matches, so a duplicated label yields multiple rows

In [108]:
sales = [0,5,155,0,518]
items = ["coffee","coffee","tea","coconut","sugar"]
sales_series = pd.Series(sales ,index=items, name = "Sales")
sales_series

coffee       0
coffee       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [109]:
sales_series.loc["coffee"] # returns duplicate values

coffee    0
coffee    5
Name: Sales, dtype: int64

In [110]:
sales_series.reset_index()

Unnamed: 0,index,Sales
0,coffee,0
1,coffee,5
2,tea,155
3,coconut,0
4,sugar,518


In [111]:
sales_series.reset_index(drop=True)

0      0
1      5
2    155
3      0
4    518
Name: Sales, dtype: int64

In [114]:
my_series = pd.Series(range(5), index = ["Day 0", "Day 0","Day 0","Day 2","Day 2"])

my_series

Day 0    0
Day 0    1
Day 0    2
Day 2    3
Day 2    4
dtype: int64

In [115]:
my_series["Day 0"]

Day 0    0
Day 0    1
Day 0    2
dtype: int64

In [116]:
# Select the rows labelled "Day 0" by label, then take the second of them by position.
# Being explicit with .loc/.iloc avoids relying on [] treating an integer key as a position.
my_series.loc["Day 0"].iloc[1]


np.int64(1)

In [118]:
my_series.reset_index(drop=True).loc[2:4]

2    2
3    3
4    4
dtype: int64

### Exercise 2

In [119]:
# create a Dataframe from the oil file, drop missing values
oil = pd.read_csv("./retail/oil.csv").dropna()

oil.head()

Unnamed: 0,date,dcoilwtico
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2
5,2013-01-08,93.21


In [124]:
oil_array = np.array(oil["dcoilwtico"].iloc[1000:1100])

date = np.array(oil["date"].iloc[1000:1100])

In [125]:
oil_series = pd.Series(oil_array, index=date, name = "oil_series")

oil_series

2016-12-20    52.22
2016-12-21    51.44
2016-12-22    51.98
2016-12-23    52.01
2016-12-27    52.82
              ...  
2017-05-09    45.84
2017-05-10    47.28
2017-05-11    47.81
2017-05-12    47.83
2017-05-15    48.86
Name: oil_series, Length: 100, dtype: float64

In [139]:
# mean of the first 10 prices: the slice must start at position 0 (rows 0 through 9), not 1
oil_series[:10].mean()

np.float64(52.765)

In [132]:
oil_series.iloc[:10].mean()

np.float64(52.765)

In [133]:
# mean of last 10 elements 
oil_series.iloc[-10:].mean()

np.float64(47.13)

In [135]:
# grab all prices from jan 1st 2017- jan 7th 2017 (inclusive) and reset the index to integers
oil_series.loc["2017-01-01":"2017-01-07"]

2017-01-03    52.36
2017-01-04    53.26
2017-01-05    53.77
2017-01-06    53.98
Name: oil_series, dtype: float64

In [136]:
oil_series.loc["2017-01-01":"2017-01-07"].reset_index()

Unnamed: 0,index,oil_series
0,2017-01-03,52.36
1,2017-01-04,53.26
2,2017-01-05,53.77
3,2017-01-06,53.98


In [137]:
oil_series.loc["2017-01-01":"2017-01-07"].reset_index(drop=True)

0    52.36
1    53.26
2    53.77
3    53.98
Name: oil_series, dtype: float64

### Filtering series
- pass a logical test into the `.loc[]` accessor
- `==` or `.eq()`
- `!=` or `.ne()`
- `in` becomes `.isin()`
- `not in` becomes `~.isin()`

In [140]:
sales_series

coffee       0
coffee       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [141]:
sales_series.loc[sales_series > 0] # sales_series > 0 returns a boolean Series used as the filter

coffee      5
tea       155
sugar     518
Name: Sales, dtype: int64

In [143]:
mask = (sales_series > 0) & (sales_series.index == "coffee")

sales_series[mask]

coffee    5
Name: Sales, dtype: int64

### logical operators and methods 
- each comparison operator has an equivalent Series method (e.g. `>=` and `.ge()`)
- methods are good for chaining

In [144]:
sales_series

coffee       0
coffee       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [146]:
sales_series == 5 # returns a boolean Series with the same index

coffee     False
coffee      True
tea        False
coconut    False
sugar      False
Name: Sales, dtype: bool

In [149]:
sales_series.ge(5)

coffee     False
coffee      True
tea         True
coconut    False
sugar       True
Name: Sales, dtype: bool

In [151]:
sales_series.ge(5).mean() # chaining operation

np.float64(0.6)

In [152]:
# membership test 
sales_series

coffee       0
coffee       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [153]:
sales_series.index.isin(["coffee","tea"])

array([ True,  True,  True, False, False])

In [156]:
~sales_series.index.isin(["coffee","tea"]) # ~ inverts the boolean values

array([False, False, False,  True,  True])

In [157]:
my_series

Day 0    0
Day 0    1
Day 0    2
Day 2    3
Day 2    4
dtype: int64

In [158]:
my_series == 2

Day 0    False
Day 0    False
Day 0     True
Day 2    False
Day 2    False
dtype: bool

In [159]:
my_series !=2

Day 0     True
Day 0     True
Day 0    False
Day 2     True
Day 2     True
dtype: bool

In [162]:
my_series.loc[my_series !=2]

Day 0    0
Day 0    1
Day 2    3
Day 2    4
dtype: int64

In [163]:
my_series.loc[~(my_series !=2)]

Day 0    2
dtype: int64

In [166]:
my_series.loc[my_series.isin([1,2])]

Day 0    1
Day 0    2
dtype: int64

In [167]:
my_series.loc[~my_series.isin([1,2])]

Day 0    0
Day 2    3
Day 2    4
dtype: int64

In [168]:
my_series.loc[(my_series > 2)]

Day 2    3
Day 2    4
dtype: int64

In [169]:
my_series.loc[~(my_series > 2)] # parenthesis required

Day 0    0
Day 0    1
Day 0    2
dtype: int64

In [170]:
my_series.loc[~my_series.gt(2)] # parenthesis not required

Day 0    0
Day 0    1
Day 0    2
dtype: int64

In [172]:
my_series

Day 0    0
Day 0    1
Day 0    2
Day 2    3
Day 2    4
dtype: int64

In [180]:
mask = (my_series.isin([1,2])) | (my_series > 2)

In [181]:
my_series.loc[mask]

Day 0    1
Day 0    2
Day 2    3
Day 2    4
dtype: int64

In [176]:
mask = (my_series.isin([1,2])) & (my_series > 2)

In [178]:
my_series.loc[mask]

Series([], dtype: int64)

In [182]:
my_series in [1,2] # error: Python's `in` is not an element-wise test on a Series; use .isin() instead

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [184]:
my_series.isin([1,2] )

Day 0    False
Day 0     True
Day 0     True
Day 2    False
Day 2    False
dtype: bool

### sorting series
- by their values
- or by index

In [185]:
sales_series.sort_values()

coffee       0
coconut      0
coffee       5
tea        155
sugar      518
Name: Sales, dtype: int64

In [186]:
sales_series.sort_values(ascending=False)

sugar      518
tea        155
coffee       5
coffee       0
coconut      0
Name: Sales, dtype: int64

In [188]:
sales_series.sort_index()

coconut      0
coffee       0
coffee       5
sugar      518
tea        155
Name: Sales, dtype: int64

In [189]:
sales_series.sort_index(ascending=False)

tea        155
sugar      518
coffee       0
coffee       5
coconut      0
Name: Sales, dtype: int64

In [190]:
my_series

Day 0    0
Day 0    1
Day 0    2
Day 2    3
Day 2    4
dtype: int64

In [191]:
my_series.sort_values()

Day 0    0
Day 0    1
Day 0    2
Day 2    3
Day 2    4
dtype: int64

In [192]:
my_series.sort_values(ascending=False)

Day 2    4
Day 2    3
Day 0    2
Day 0    1
Day 0    0
dtype: int64

In [193]:
my_series2 = my_series.sort_values(ascending=False)

In [194]:
my_series2

Day 2    4
Day 2    3
Day 0    2
Day 0    1
Day 0    0
dtype: int64

In [195]:
my_series

Day 0    0
Day 0    1
Day 0    2
Day 2    3
Day 2    4
dtype: int64

In [196]:
my_series.sort_values(ascending=False, inplace=True) # inplace=True sorts my_series itself instead of returning a sorted copy

In [197]:
my_series

Day 2    4
Day 2    3
Day 0    2
Day 0    1
Day 0    0
dtype: int64

In [198]:
my_series.sort_index()

Day 0    2
Day 0    1
Day 0    0
Day 2    4
Day 2    3
dtype: int64

In [199]:
my_series.sort_index(ascending=False)

Day 2    4
Day 2    3
Day 0    2
Day 0    1
Day 0    0
dtype: int64

### Exercise 3

In [200]:
oil_series

2016-12-20    52.22
2016-12-21    51.44
2016-12-22    51.98
2016-12-23    52.01
2016-12-27    52.82
              ...  
2017-05-09    45.84
2017-05-10    47.28
2017-05-11    47.81
2017-05-12    47.83
2017-05-15    48.86
Name: oil_series, Length: 100, dtype: float64

In [203]:
oil_series.sort_values().iloc[:10]

2017-05-04    45.55
2017-05-09    45.84
2017-05-05    46.23
2017-05-08    46.46
2017-03-23    47.00
2017-03-21    47.02
2017-03-27    47.02
2017-03-14    47.24
2017-05-10    47.28
2017-03-22    47.29
Name: oil_series, dtype: float64

In [205]:
# get the 10 lowest prices, then sort them by date from the most recent to the oldest
oil_series.sort_values().iloc[:10].sort_index(ascending=False)

2017-05-10    47.28
2017-05-09    45.84
2017-05-08    46.46
2017-05-05    46.23
2017-05-04    45.55
2017-03-27    47.02
2017-03-23    47.00
2017-03-22    47.29
2017-03-21    47.02
2017-03-14    47.24
Name: oil_series, dtype: float64

In [208]:
# using this list of dates, select only the rows that fall on these dates and had a price of less than $50 per barrel
dates = [
    "2016-12-22",
    "2017-05-03",
    "2017-01-06",
    "2017-03-05",
    "2017-02-12",
    "2017-03-21",
    "2017-04-14",
    "2017-04-15",
]

In [206]:
oil_series

2016-12-20    52.22
2016-12-21    51.44
2016-12-22    51.98
2016-12-23    52.01
2016-12-27    52.82
              ...  
2017-05-09    45.84
2017-05-10    47.28
2017-05-11    47.81
2017-05-12    47.83
2017-05-15    48.86
Name: oil_series, Length: 100, dtype: float64

In [209]:
oil_series.isin(dates) # tests the price values (not the index) against the date strings, so every entry is False; match on dates with oil_series.index.isin(dates)

2016-12-20    False
2016-12-21    False
2016-12-22    False
2016-12-23    False
2016-12-27    False
              ...  
2017-05-09    False
2017-05-10    False
2017-05-11    False
2017-05-12    False
2017-05-15    False
Name: oil_series, Length: 100, dtype: bool

In [215]:
mask = (oil_series.index.isin(dates)) # first pass: match on the dates only, without any price condition

In [216]:
oil_series.loc[mask]

2016-12-22    51.98
2017-01-06    53.98
2017-03-21    47.02
2017-05-03    47.79
Name: oil_series, dtype: float64

In [217]:
# Keep rows whose date is in `dates` AND whose price is strictly below $50
# (the exercise asks for "less than", so use < rather than <=).
mask = oil_series.index.isin(dates) & (oil_series < 50)

In [218]:
oil_series.loc[mask]

2017-03-21    47.02
2017-05-03    47.79
Name: oil_series, dtype: float64

### arithmetic operators & methods
Python operator and the equivalent pandas method:
- `+` or `.add()`
- `-` or `.sub()`
- `*` or `.mul()`
- `/` or `.div()`
- `//` or `.floordiv()`
- `%` or `.mod()`
- `**` or `.pow()`

In [221]:
monday_sales = pd.Series([0,5,155,0,518], name="monday_sales")

In [222]:
monday_sales

0      0
1      5
2    155
3      0
4    518
Name: monday_sales, dtype: int64

In [223]:
monday_sales + 2

0      2
1      7
2    157
3      2
4    520
Name: monday_sales, dtype: int64

In [224]:
monday_sales.add(2)

0      2
1      7
2    157
3      2
4    520
Name: monday_sales, dtype: int64

In [225]:
"$" + monday_sales.astype("float").astype("string")

0      $0.0
1      $5.0
2    $155.0
3      $0.0
4    $518.0
Name: monday_sales, dtype: string

In [228]:
# Five labelled values with one missing entry; the NaN makes the Series float64.
my_series = pd.Series(
    data=[1, np.nan, 2, 3, 4],
    index=[f"day {n}" for n in range(5)],
)

my_series

day 0    1.0
day 1    NaN
day 2    2.0
day 3    3.0
day 4    4.0
dtype: float64

In [229]:
my_series + 1

day 0    2.0
day 1    NaN
day 2    3.0
day 3    4.0
day 4    5.0
dtype: float64

In [231]:
my_series2 = my_series.add(1 , fill_value=0).astype("int")

In [233]:
my_series2 / 2

day 0    1.0
day 1    0.5
day 2    1.5
day 3    2.0
day 4    2.5
dtype: float64

In [234]:
my_series2 * 2

day 0     4
day 1     2
day 2     6
day 3     8
day 4    10
dtype: int64

In [235]:
my_series2 // 2

day 0    1
day 1    0
day 2    1
day 3    2
day 4    2
dtype: int64

In [236]:
my_series + my_series2

day 0    3.0
day 1    NaN
day 2    5.0
day 3    7.0
day 4    9.0
dtype: float64

In [237]:
my_series.add(my_series2, fill_value=0)

day 0    3.0
day 1    1.0
day 2    5.0
day 3    7.0
day 4    9.0
dtype: float64

In [238]:
(my_series + my_series2 * 2)/2

day 0    2.5
day 1    NaN
day 2    4.0
day 3    5.5
day 4    7.0
dtype: float64

### string methods 
Accessed through the `.str` accessor, e.g. `series.str.upper()`:
- `.strip()`, `.lstrip()`, `.rstrip()`
- `.upper()`, `.lower()`
- `.slice(start, stop, step)`
- `.count("string")`
- `.contains("string")`
- `.replace("a", "b")`
- `.split("delimiter", expand=True)`
- `.len()`
- `.startswith("string")`, `.endswith("string")`

In [240]:
prices = pd.Series(["$3.99","$5.99","$22.99","$7.99","$33.99"])

prices

0     $3.99
1     $5.99
2    $22.99
3     $7.99
4    $33.99
dtype: object

In [241]:
prices.str.contains("3")

0     True
1    False
2    False
3    False
4     True
dtype: bool

In [242]:
prices.str.strip("$").astype("float")

0     3.99
1     5.99
2    22.99
3     7.99
4    33.99
dtype: float64

In [245]:
# Text values "day 0" .. "day 4" for the string-method examples.
string_series = pd.Series([f"day {n}" for n in range(5)])

string_series

0    day 0
1    day 1
2    day 2
3    day 3
4    day 4
dtype: object

In [246]:
string_series.str.contains("1")

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [247]:
string_series.str.upper() #contains("1")

0    DAY 0
1    DAY 1
2    DAY 2
3    DAY 3
4    DAY 4
dtype: object

In [248]:
string_series.str.upper().str.contains("DAY 1")

0    False
1     True
2    False
3    False
4    False
dtype: bool

In [249]:
string_series.str.strip('day').astype('int') # strip('day') removes leading/trailing 'd', 'a', 'y' characters (a character set, not the literal prefix); the leftover space does not stop the int conversion

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [250]:
string_series.str[-1].astype('int')

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [251]:
string_series.str[1:3]

0    ay
1    ay
2    ay
3    ay
4    ay
dtype: object

In [252]:
string_series.str.split(' ')

0    [day, 0]
1    [day, 1]
2    [day, 2]
3    [day, 3]
4    [day, 4]
dtype: object

In [253]:
string_series.str.split(' ', expand=True)

Unnamed: 0,0,1
0,day,0
1,day,1
2,day,2
3,day,3
4,day,4


### Exercise 4 

In [264]:
# increase the prices in the oil series by 10% and then add an additional 2 dollars / barrel on top of that
oil_series

2016-12-20    52.22
2016-12-21    51.44
2016-12-22    51.98
2016-12-23    52.01
2016-12-27    52.82
              ...  
2017-05-09    45.84
2017-05-10    47.28
2017-05-11    47.81
2017-05-12    47.83
2017-05-15    48.86
Name: oil_series, Length: 100, dtype: float64

In [256]:
new_oil_series = (oil_series + (oil_series * .1 )) + 2

new_oil_series

2016-12-20    59.442
2016-12-21    58.584
2016-12-22    59.178
2016-12-23    59.211
2016-12-27    60.102
               ...  
2017-05-09    52.424
2017-05-10    54.008
2017-05-11    54.591
2017-05-12    54.613
2017-05-15    55.746
Name: oil_series, Length: 100, dtype: float64

In [259]:
# create a series that represents the % difference between each price in the original oil_series and the maximum price
max_oil_price = oil_series.max()

max_oil_price

np.float64(54.48)

In [272]:
(oil_series - max_oil_price)/max_oil_price # difference from the max as a fraction (multiply by 100 for a percentage)

2016-12-20   -0.041483
2016-12-21   -0.055800
2016-12-22   -0.045888
2016-12-23   -0.045338
2016-12-27   -0.030470
                ...   
2017-05-09   -0.158590
2017-05-10   -0.132159
2017-05-11   -0.122430
2017-05-12   -0.122063
2017-05-15   -0.103157
Name: oil_series, Length: 100, dtype: float64

In [271]:
# extract month from the string dates and store them as integers
oil_series.index.str.slice(5,7).astype('int')

Index([12, 12, 12, 12, 12, 12, 12, 12,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,  2,  2,  2,  2,  2,  2,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,
        3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,
        4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,
        5,  5,  5,  5,  5,  5,  5,  5,  5,  5],
      dtype='int64')

In [None]:
oil_series.index.str.slice(5,7).astype('int')