In [3]:
import pandas as pd
import numpy as np

In [2]:
pd.__version__

'2.2.3'

In [4]:
np.__version__

'2.0.2'

### Series - the pandas equivalent of a column of data (like a SQL column): a NumPy array with additional features
A Series is built on top of a NumPy array.
It has an index and an optional name. Two or more Series together make up a DataFrame.

In [6]:
# Sales figures used to build the first example Series.
sales = [0, 5, 155, 0, 518, 0, 1827, 616, 317, 325]

# Wrap the list in a named Series; no index is given, so pandas supplies the default one.
sales_series = pd.Series(data=sales, name="Sales")

sales_series

0       0
1       5
2     155
3       0
4     518
5       0
6    1827
7     616
8     317
9     325
Name: Sales, dtype: int64

# properties of series 
- index
- values
- name
- dtype

In [7]:
sales_series.values

array([   0,    5,  155,    0,  518,    0, 1827,  616,  317,  325])

In [8]:
sales_series.index

RangeIndex(start=0, stop=10, step=1)

In [9]:
sales_series.name

'Sales'

In [10]:
sales_series.dtype

dtype('int64')

In [15]:
# Build a Series from a NumPy array holding the integers 0 through 4.
array = np.arange(0, 5)

series = pd.Series(data=array)

In [13]:
pd.Series(np.arange(5), name="Test array")

0    0
1    1
2    2
3    3
4    4
Name: Test array, dtype: int64

In [14]:
pd.Series(np.arange(6).reshape(3,2), name="Test array") # error: a Series must be 1-dimensional, but this array has shape (3, 2)

ValueError: Data must be 1-dimensional, got ndarray of shape (3, 2) instead

In [16]:
series.values

array([0, 1, 2, 3, 4])

In [17]:
series.values.mean()

np.float64(2.0)

In [18]:
series.mean()

np.float64(2.0)

In [19]:
series.index

RangeIndex(start=0, stop=5, step=1)

In [20]:
# Swap the default labels for the custom integer labels 10, 20, ..., 50.
series.index = list(range(10, 60, 10))
series

10    0
20    1
30    2
40    3
50    4
dtype: int64

In [22]:
series.name = 'special series'

In [23]:
series

10    0
20    1
30    2
40    3
50    4
Name: special series, dtype: int64

### pandas data types
Numeric types are 64-bit by default; booleans are stored in the backend as 0 and 1.
- object / text (string, category)
- time series

In [26]:
sales_series

0       0
1       5
2     155
3       0
4     518
5       0
6    1827
7     616
8     317
9     325
Name: Sales, dtype: int64

In [27]:
sales_series.astype("float")

0       0.0
1       5.0
2     155.0
3       0.0
4     518.0
5       0.0
6    1827.0
7     616.0
8     317.0
9     325.0
Name: Sales, dtype: float64

In [28]:
sales_series.astype("bool")

0    False
1     True
2     True
3    False
4     True
5    False
6     True
7     True
8     True
9     True
Name: Sales, dtype: bool

In [29]:
sales_series.astype("datetime64") # error: the dtype needs a unit, e.g. "datetime64[ns]"

ValueError: The 'datetime64' dtype has no unit. Please pass in 'datetime64[ns]' instead.

In [32]:
# Rebind `series` to a fresh Series holding 0 through 4 with the default index.
series = pd.Series(data=range(5))
series

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [33]:
series.astype("float")

0    0.0
1    1.0
2    2.0
3    3.0
4    4.0
dtype: float64

In [34]:
series.astype("bool")

0    False
1     True
2     True
3     True
4     True
dtype: bool

In [35]:
series.astype("object")

0    0
1    1
2    2
3    3
4    4
dtype: object

In [36]:
series.astype("string")

0    0
1    1
2    2
3    3
4    4
dtype: string

In [37]:
series.astype("bool").mean()

np.float64(0.8)

In [38]:
series.astype("bool").sum()

np.int64(4)

In [39]:
series.astype("string").mean() # error: the 'mean' reduction is not supported for the string dtype

TypeError: Cannot perform reduction 'mean' with string dtype

In [43]:
# A Series of three single-character strings.
string_series = pd.Series(list("abc"))
string_series

0    a
1    b
2    c
dtype: object

In [44]:
string_series.astype("int") # error: 'a' is not a valid integer literal

ValueError: invalid literal for int() with base 10: 'a'

### Exercise 1

In [47]:
# Read the oil price CSV into a DataFrame, then discard rows containing missing values.
oil_raw = pd.read_csv("./retail/oil.csv")
oil = oil_raw.dropna()

oil.head()

Unnamed: 0,date,dcoilwtico
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.2
5,2013-01-08,93.21


In [48]:
# Take the 100 oil prices at positions 1000-1099 and convert them to a NumPy array.
price_slice = oil["dcoilwtico"].iloc[1000:1100]
oil_array = np.array(price_slice)
oil_array

array([52.22, 51.44, 51.98, 52.01, 52.82, 54.01, 53.8 , 53.75, 52.36,
       53.26, 53.77, 53.98, 51.95, 50.82, 52.19, 53.01, 52.36, 52.45,
       51.12, 51.39, 52.33, 52.77, 52.38, 52.14, 53.24, 53.18, 52.63,
       52.75, 53.9 , 53.55, 53.81, 53.01, 52.19, 52.37, 52.99, 53.84,
       52.96, 53.21, 53.11, 53.41, 53.41, 54.02, 53.61, 54.48, 53.99,
       54.04, 54.  , 53.82, 52.63, 53.33, 53.19, 52.68, 49.83, 48.75,
       48.05, 47.95, 47.24, 48.34, 48.3 , 48.34, 47.79, 47.02, 47.29,
       47.  , 47.3 , 47.02, 48.36, 49.47, 50.3 , 50.54, 50.25, 50.99,
       51.14, 51.69, 52.25, 53.06, 53.38, 53.12, 53.19, 52.62, 52.46,
       50.49, 50.26, 49.64, 48.9 , 49.22, 49.22, 48.96, 49.31, 48.83,
       47.65, 47.79, 45.55, 46.23, 46.46, 45.84, 47.28, 47.81, 47.83,
       48.86])

In [50]:
# Wrap oil_array in a named pandas Series.
oil_series = pd.Series(data=oil_array, name="oil prices")
oil_series

0     52.22
1     51.44
2     51.98
3     52.01
4     52.82
      ...  
95    45.84
96    47.28
97    47.81
98    47.83
99    48.86
Name: oil prices, Length: 100, dtype: float64

In [52]:
oil_series.values

array([52.22, 51.44, 51.98, 52.01, 52.82, 54.01, 53.8 , 53.75, 52.36,
       53.26, 53.77, 53.98, 51.95, 50.82, 52.19, 53.01, 52.36, 52.45,
       51.12, 51.39, 52.33, 52.77, 52.38, 52.14, 53.24, 53.18, 52.63,
       52.75, 53.9 , 53.55, 53.81, 53.01, 52.19, 52.37, 52.99, 53.84,
       52.96, 53.21, 53.11, 53.41, 53.41, 54.02, 53.61, 54.48, 53.99,
       54.04, 54.  , 53.82, 52.63, 53.33, 53.19, 52.68, 49.83, 48.75,
       48.05, 47.95, 47.24, 48.34, 48.3 , 48.34, 47.79, 47.02, 47.29,
       47.  , 47.3 , 47.02, 48.36, 49.47, 50.3 , 50.54, 50.25, 50.99,
       51.14, 51.69, 52.25, 53.06, 53.38, 53.12, 53.19, 52.62, 52.46,
       50.49, 50.26, 49.64, 48.9 , 49.22, 49.22, 48.96, 49.31, 48.83,
       47.65, 47.79, 45.55, 46.23, 46.46, 45.84, 47.28, 47.81, 47.83,
       48.86])

In [53]:
oil_series.name

'oil prices'

In [54]:
oil_series.index

RangeIndex(start=0, stop=100, step=1)

In [55]:
oil_series.dtype

dtype('float64')

In [58]:
oil_series.size

100

In [56]:
# take the mean of values array
oil_series.values.mean()

np.float64(51.128299999999996)

In [59]:
oil_series.mean()

np.float64(51.128299999999996)

In [57]:
# Convert the series to integers and recalculate the mean.
# The cast drops each price's fractional part, so the mean falls from about 51.13 to 50.66.
oil_series.astype("int").mean()

np.float64(50.66)

### Index 
-- pandas allows custom index

In [61]:
# Shorter sales example, again with the default integer index.
sales = [0, 5, 155, 0, 518]
sales_series = pd.Series(data=sales, name="Sales")
sales_series

0      0
1      5
2    155
3      0
4    518
Name: Sales, dtype: int64

In [62]:
sales_series[2]

np.int64(155)

In [63]:
sales_series[2:4]

2    155
3      0
Name: Sales, dtype: int64

In [64]:
# Same sales figures, now labelled by item name via a custom index.
sales = [0, 5, 155, 0, 518]
items = ["coffe", "banana", "tea", "coconut", "sugar"]
sales_series = pd.Series(data=sales, index=items, name="Sales")
sales_series

coffe        0
banana       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [65]:
sales_series["tea"]

np.int64(155)

In [67]:
sales_series["banana":"coconut"] # label-based slice: the stop label is included

banana       5
tea        155
coconut      0
Name: Sales, dtype: int64

In [68]:
# Series of 0 through 4 with the default integer index.
my_series = pd.Series(data=range(5))

my_series

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [69]:
my_series[3]

np.int64(3)

In [70]:
my_series[1:3]

1    1
2    2
dtype: int64

In [72]:
my_series[1::2] # step 2

1    1
3    3
dtype: int64

In [73]:
# Label the values 0-4 with the strings "Day 0" through "Day 4".
day_labels = [f"Day {i}" for i in range(5)]
my_series = pd.Series(range(5), index=day_labels)

my_series

Day 0    0
Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [74]:
my_series["Day 2"]

np.int64(2)

In [75]:
my_series["Day 2":"Day 4"]

Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [76]:
my_series[::2]

Day 0    0
Day 2    2
Day 4    4
dtype: int64

### iloc method
- more efficient than slicing
- works on non-integer indexes
- df.iloc[row position, column position] -- row position: a single row (e.g. 0), multiple rows, or a range of rows

In [77]:
sales_series

coffe        0
banana       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [79]:
sales_series.iloc[2]

np.int64(155)

In [81]:
sales_series.iloc[2:4] # position-based slice: the stop position is excluded

tea        155
coconut      0
Name: Sales, dtype: int64

In [82]:
my_series

Day 0    0
Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [83]:
my_series.iloc[2]

np.int64(2)

In [86]:
my_series.iloc[[1,3,4]]

Day 1    1
Day 3    3
Day 4    4
dtype: int64

In [89]:
my_series.iloc[1:]

Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

### loc method
- df.loc[row label, column label] -- row label: a single row, multiple rows, or a range of rows

In [90]:
sales_series

coffe        0
banana       5
tea        155
coconut      0
sugar      518
Name: Sales, dtype: int64

In [91]:
sales_series.loc["tea"]

np.int64(155)

In [92]:
sales_series.loc["banana":"coconut"]

banana       5
tea        155
coconut      0
Name: Sales, dtype: int64

In [93]:
my_series

Day 0    0
Day 1    1
Day 2    2
Day 3    3
Day 4    4
dtype: int64

In [95]:
my_series.loc["Day 2"]

np.int64(2)

In [96]:
# Label-based slice through .loc; both the start and stop labels are included.
my_series.loc["Day 1":"Day 3"]

Day 1    1
Day 2    2
Day 3    3
dtype: int64

In [97]:
# Replace the labels with unordered integers so that label and position no longer match.
my_series.index = [0,2,3,100,5]

In [98]:
my_series

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [99]:
# Slices from label 0 through label 3 (stop label included), not positions 0-3.
my_series.loc[0:3]

0    0
2    1
3    2
dtype: int64

In [102]:
my_series.loc[0:5] # loc treats the numbers as labels, not positions: every row from label 0 through label 5 is returned, including label 100

0      0
2      1
3      2
100    3
5      4
dtype: int64

In [104]:
# After reset_index(drop=True) the labels are 0..4 again, so .loc[:3] returns labels 0 through 3 (stop label included).
my_series.reset_index(drop=True).loc[:3]

0    0
1    1
2    2
3    3
dtype: int64