# Pandas Tutorial - Intro to DataFrames
Tutorial by Joe James, https://youtu.be/e60ItwlZTKM

In [30]:
import numpy as np
import pandas as pd

def header(msg):
    """Print a section banner.

    Emits a blank line, then ``[ msg ]``, then a rule of 50 dashes.

    Args:
        msg: Section title; must be a string.
    """
    banner = ''.join(('\n', '[ ', msg, ' ]'))
    rule = '-' * 50
    print(banner)
    print(rule)

### 1. Load hard-coded data into a dataframe

In [32]:
header("1. Load hard-coded data into a df")
# One inner list per month; the positions line up with the column names below.
df = pd.DataFrame(
    data=[
        ['Jan', 58, 42, 74, 22, 2.95],
        ['Feb', 61, 45, 78, 26, 3.02],
        ['Mar', 65, 48, 84, 25, 2.34],
        ['Apr', 67, 50, 92, 28, 1.02],
        ['May', 71, 53, 98, 35, 0.48],
        ['Jun', 75, 56, 107, 41, 0.11],
        ['Jul', 77, 58, 105, 44, 0.0],
        ['Aug', 77, 59, 102, 43, 0.03],
        ['Sep', 77, 57, 103, 40, 0.17],
        ['Oct', 73, 54, 96, 34, 0.81],
        ['Nov', 64, 48, 84, 30, 1.7],
        ['Dec', 58, 42, 73, 21, 2.56],
    ],
    index=list(range(12)),  # explicit integer row labels 0..11
    columns=[
        'month',
        'avg_high',
        'avg_low',
        'record_high',
        'record_low',
        'avg_percip',
    ],
)


[ 1. Load hard-coded data into a df ]
--------------------------------------------------


### 2. Read data from text file into a dataframe
Note: there is currently no text file, to keep the folder slim and tidy. The hard-coded data from step 1 is used in the following exercises instead.

In [None]:
header("2. Read data from file into a df")
# The two lines below are left commented out because 'weather.txt' is not
# shipped with this notebook; the hard-coded df from step 1 is used instead.
# filename = 'weather.txt'
# df = pd.read_csv(filename)

In [33]:
# Show the frame built in step 1 as plain text.
print(df)

   month  avg_high  avg_low  record_high  record_low  avg_percip
0    Jan        58       42           74          22        2.95
1    Feb        61       45           78          26        3.02
2    Mar        65       48           84          25        2.34
3    Apr        67       50           92          28        1.02
4    May        71       53           98          35        0.48
5    Jun        75       56          107          41        0.11
6    Jul        77       58          105          44        0.00
7    Aug        77       59          102          43        0.03
8    Sep        77       57          103          40        0.17
9    Oct        73       54           96          34        0.81
10   Nov        64       48           84          30        1.70
11   Dec        58       42           73          21        2.56


### 3. Print first 5 and last 3 rows of df

In [34]:
# First rows: head() with no argument would also return 5 rows.
header("3a. Print first 5 rows")
print(df.head(5))
# Last rows: tail() takes the number of rows to return from the end.
header("3b. Print last 3 rows")
print(df.tail(n=3))


[ 3a. Print first 5 rows ]
--------------------------------------------------
  month  avg_high  avg_low  record_high  record_low  avg_percip
0   Jan        58       42           74          22        2.95
1   Feb        61       45           78          26        3.02
2   Mar        65       48           84          25        2.34
3   Apr        67       50           92          28        1.02
4   May        71       53           98          35        0.48

[ 3b. Print last 3 rows ]
--------------------------------------------------
   month  avg_high  avg_low  record_high  record_low  avg_percip
9    Oct        73       54           96          34        0.81
10   Nov        64       48           84          30        1.70
11   Dec        58       42           73          21        2.56


### 4. Get data types, index, columns, values

In [35]:
# Each attribute below exposes one aspect of the frame's structure.
header("4a. Get data types")
# One dtype per column.
print(df.dtypes)
header("4b. Get index")
# The row labels.
print(df.index)
header("4c. Show columns")
# The column labels.
print(df.columns)
header("4d. Show values")
# The cell contents as a 2-D array, without row or column labels.
print(df.values)


[ 4a. Get data types ]
--------------------------------------------------
month           object
avg_high         int64
avg_low          int64
record_high      int64
record_low       int64
avg_percip     float64
dtype: object

[ 4b. Get index ]
--------------------------------------------------
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

[ 4c. Show columns ]
--------------------------------------------------
Index(['month', 'avg_high', 'avg_low', 'record_high', 'record_low',
       'avg_percip'],
      dtype='object')

[ 4d. Show values ]
--------------------------------------------------
[['Jan' 58 42 74 22 2.95]
 ['Feb' 61 45 78 26 3.02]
 ['Mar' 65 48 84 25 2.34]
 ['Apr' 67 50 92 28 1.02]
 ['May' 71 53 98 35 0.48]
 ['Jun' 75 56 107 41 0.11]
 ['Jul' 77 58 105 44 0.0]
 ['Aug' 77 59 102 43 0.03]
 ['Sep' 77 57 103 40 0.17]
 ['Oct' 73 54 96 34 0.81]
 ['Nov' 64 48 84 30 1.7]
 ['Dec' 58 42 73 21 2.56]]


### 5. Statistical summary of each column

In [36]:
header("5. df.describe() - statistical summary")
# By default describe() summarises only the numeric columns, so the text
# column 'month' does not appear in the result.
print(df.describe())


[ 5. df.describe() - statistical summary ]
--------------------------------------------------
        avg_high    avg_low  record_high  record_low  avg_percip
count  12.000000  12.000000    12.000000   12.000000   12.000000
mean   68.583333  51.000000    91.333333   32.416667    1.265833
std     7.366488   6.060303    12.323911    8.240238    1.186396
min    58.000000  42.000000    73.000000   21.000000    0.000000
25%    63.250000  47.250000    82.500000   25.750000    0.155000
50%    69.000000  51.500000    94.000000   32.000000    0.915000
75%    75.500000  56.250000   102.250000   40.250000    2.395000
max    77.000000  59.000000   107.000000   44.000000    3.020000


### 6. Sort records by any column

In [39]:
# The banner now shows the call exactly as executed: ascending is a keyword
# argument, not a quoted string.
header("6. df.sort_values('record_high', ascending=False)")
# sort_values returns a new, sorted frame; df itself keeps its original order.
print(df.sort_values('record_high', ascending=False))


[ 6. df.sort_values('record_high', 'ascending=False') ]
--------------------------------------------------
   month  avg_high  avg_low  record_high  record_low  avg_percip
5    Jun        75       56          107          41        0.11
6    Jul        77       58          105          44        0.00
8    Sep        77       57          103          40        0.17
7    Aug        77       59          102          43        0.03
4    May        71       53           98          35        0.48
9    Oct        73       54           96          34        0.81
3    Apr        67       50           92          28        1.02
2    Mar        65       48           84          25        2.34
10   Nov        64       48           84          30        1.70
1    Feb        61       45           78          26        3.02
0    Jan        58       42           74          22        2.95
11   Dec        58       42           73          21        2.56


### 7. Slicing records

In [74]:
header("7a. Slicing with rows 2 to 3 - df[2:4]")
print(df[2:4])  # a plain slice selects rows by position; the end (4) is excluded

header("7b. Slicing with single named column - df['avg_low']")
print(df['avg_low'])  # a single column label returns a Series

header("7c. Slicing with single named column - df.avg_low - the same as above")
# Attribute access returns the same Series as df['avg_low']; print the
# equality check rather than repeating the full listing from 7b.
print(df.avg_low.equals(df['avg_low']))

header("7d. Slicing with two named columns - df[['avg_low','record_low']]")
print(df[['avg_low','record_low']])  # double square brackets!

# df.loc - label-based: rows by index label, columns by name
header("7e. Slicing with 'loc' - range of rows and columns - df.loc[0:2, ['month', 'avg_high']]")
print(df.loc[0:2, ['month', 'avg_high']])  # list of cols; a label slice includes its end (row 2)

header("7f. Slicing with 'loc' single cell - df.loc[2, 'month']")
print(df.loc[2, 'month'])

# df.iloc - position-based: rows and columns by integer position
header("7g. Slicing with 'iloc' single cell - df.iloc[3, 0]")
print(df.iloc[3, 0])

header("7h. Slicing with 'iloc' range of cells - df.iloc[0:3, 0:4]")  # range of cols
print(df.iloc[0:3, 0:4])

header("7i. Slicing with 'iloc' selected cols - df.iloc[0:3, [0,4]]")  # selected (list of) cols
print(df.iloc[0:3, [0,4]])


[ 7a. Slicing with rows 2 to 3 - df[2:4] ]
--------------------------------------------------
  month  avg_high  avg_low  record_high  record_low  avg_percip
2   Mar        65       48           84          25        2.34
3   Apr        67       50           92          28        1.02

[ 7b. Slicing with single named column - df['avg_low'] ]
--------------------------------------------------
0     42
1     45
2     48
3     50
4     53
5     56
6     58
7     59
8     57
9     54
10    48
11    42
Name: avg_low, dtype: int64

[ 7c. Slicing with single named column - df.avg_low - the same as above ]
--------------------------------------------------

[ 7d. Slicing with two named columns - df[['avg_low','record_low']] ]
--------------------------------------------------
    avg_low  record_low
0        42          22
1        45          26
2        48          25
3        50          28
4        53          35
5        56          41
6        58          44
7        59          43
8   

### 8. Filtering data

In [81]:
# The banner now shows valid syntax: the boolean mask goes inside df[...].
header("8a. Filter on column values - df[df['avg_percip'] > 2.0] or df[df.avg_percip > 2.0]")
# The comparison yields a boolean Series; indexing with it keeps the True rows.
print(df[df['avg_percip'] > 2.0])
# Equivalent using attribute access: df[df.avg_percip > 2.0]

header("8b. Filter on specific value - df[df['month'].isin(['Jun', 'Jul', 'Aug'])]")
print(df[df['month'].isin(['Jun', 'Jul', 'Aug'])])  # list of sought values


[ 8a. Filter on column values - df.df['avg_percip'] > 2.0 or df[df.avg_percip > 2.0] ]
--------------------------------------------------
   month  avg_high  avg_low  record_high  record_low  avg_percip
0    Jan        58       42           74          22        2.95
1    Feb        61       45           78          26        3.02
2    Mar        65       48           84          25        2.34
11   Dec        58       42           73          21        2.56

[ 8b. Filter on specific value - df[df['month'].isin(['Jun', 'Jul', 'Aug'])] ]
--------------------------------------------------
  month  avg_high  avg_low  record_high  record_low  avg_percip
5   Jun        75       56          107          41        0.11
6   Jul        77       58          105          44        0.00
7   Aug        77       59          102          43        0.03


### 9. Assignment (modifying; overwriting)

In [91]:
# NOTE: this cell modifies df in place. Section 10 assigns seven column names,
# so it relies on the 'avg_day' column added in 9d - run the sections in order.
header("9a. Overwrite specific cell value - df.loc[0, ['avg_percip']] = 222")
df.loc[0, ['avg_percip']] = 222  # the column is float64, so the value is stored as a float
print(df.iloc[0:3])

header("9b. Remove specific cell value (NaN) - df.loc[0, ['avg_percip']] = np.nan")
df.loc[0, ['avg_percip']] = np.nan  # NumPy's NaN marks the cell as missing
print(df.iloc[0:3])

header("9c. Overwrite entire col - df.loc[:, ['avg_low']] = np.array([5] * len(df))")
df.loc[:, ['avg_low']] = np.array([5] * len(df))  # one 5 per row (len(df) values)
print(df.iloc[0:3])

header("9d. Create a new column - df['avg_day'] = (df.avg_low + df.avg_high) / 2 ")
df['avg_day'] = (df.avg_low + df.avg_high) / 2  # element-wise mean of the two columns
print(df.iloc[0:3])



[ 9a. Overwrite specific cell value - df.loc[0, ['avg_percip']] = 222 ]
--------------------------------------------------
  month  avg_high  avg_low  record_high  record_low  avg_percip
0   Jan        58        5           74          22      222.00
1   Feb        61        5           78          26        3.02
2   Mar        65        5           84          25        2.34

[ ba. Remove specific cell value (NaN) - df.loc[0, ['avg_percip']] = np.nan ]
--------------------------------------------------
  month  avg_high  avg_low  record_high  record_low  avg_percip
0   Jan        58        5           74          22         NaN
1   Feb        61        5           78          26        3.02
2   Mar        65        5           84          25        2.34

[ 9c. Overwrite entire col - df.loc[:, ['avg_low']] = np.array([5] * len(df)) ]
--------------------------------------------------
  month  avg_high  avg_low  record_high  record_low  avg_percip
0   Jan        58        5           7

### 10. Renaming columns

In [106]:
header("10a. Rename columns - df.rename(columns = {'avg_percip':'avg_rain'}, inplace=True)")
# The dict maps old name -> new name; columns not listed keep their names.
df.rename(columns = {'avg_percip':'avg_rain'}, inplace=True)
# Equivalent without inplace: df = df.rename(columns = {'avg_percip':'avg_rain'})
print(df.head(2))

# The banner now matches the assignment below: a list (not a set) holding all
# seven names, with no 'inplace' argument.
header("10b. List of new (all) columns names - df.columns = ['m','av_hi','av_lo','r_hi','r_lo','av_p','av_d']")
df.columns = ['m','av_hi','av_lo','r_hi','r_lo','av_p','av_d']  # all column names must be given
print(df.head(2))


[ 10a. Rename columns - df.rename(columns = {'avg_percip':'avg_rain'}, inplace=True) ]
--------------------------------------------------
     m  av_hi  av_lo  r_hi  r_lo  av_p  av_d
0  Jan     58      5    74    22   NaN  31.5
1  Feb     61      5    78    26  3.02  33.0

[ 10b. List of new (all) columns names - df.columns = {'m','av_hi','av_lo','r_hi','r_lo','av_p'}, inplace=True) ]
--------------------------------------------------
     m  av_hi  av_lo  r_hi  r_lo  av_p  av_d
0  Jan     58      5    74    22   NaN  31.5
1  Feb     61      5    78    26  3.02  33.0


### 11. Iterate with 'for' loop

In [107]:
header("11. Iterate rows of df with a 'for' loop")
# iterrows() yields (index label, row Series) pairs. Built-in column
# operations are usually a better choice than a Python-level loop.
for label, record in df.iterrows():
    print(label, record['m'], record['av_hi'])


[ 11. Iterate rows of df with a 'for' loop ]
--------------------------------------------------
0 Jan 58
1 Feb 61
2 Mar 65
3 Apr 67
4 May 71
5 Jun 75
6 Jul 77
7 Aug 77
8 Sep 77
9 Oct 73
10 Nov 64
11 Dec 58


### 12. Write df to a .csv file

In [108]:
# Uncomment to write df (including its index) to 'foo.csv' in the current
# working directory.
# df.to_csv('foo.csv')

2020, AK