# Pandas Tutorial - Intro to DataFrames
Tutorial by Joe James, https://youtu.be/e60ItwlZTKM

In [30]:
import numpy as np
import pandas as pd

def header(msg):
    """Print a section banner: a blank line, ``[ msg ]``, then a 50-dash rule.

    ``msg`` must be a string; it is joined into the title unchanged.
    """
    banner = "".join(("\n[ ", msg, " ]"))
    underline = "-" * 50
    # One print call with a newline separator emits the same two lines.
    print(banner, underline, sep="\n")

### 1. Load hard-coded data into a dataframe

In [32]:
header("1. Load hard-coded data into a df")
# Build the weather table in code: one inner list per month, with the values
# in the same order as the column names given below.
df = pd.DataFrame(
    data=[
        ['Jan', 58, 42, 74, 22, 2.95],
        ['Feb', 61, 45, 78, 26, 3.02],
        ['Mar', 65, 48, 84, 25, 2.34],
        ['Apr', 67, 50, 92, 28, 1.02],
        ['May', 71, 53, 98, 35, 0.48],
        ['Jun', 75, 56, 107, 41, 0.11],
        ['Jul', 77, 58, 105, 44, 0.0],
        ['Aug', 77, 59, 102, 43, 0.03],
        ['Sep', 77, 57, 103, 40, 0.17],
        ['Oct', 73, 54, 96, 34, 0.81],
        ['Nov', 64, 48, 84, 30, 1.7],
        ['Dec', 58, 42, 73, 21, 2.56],
    ],
    # Explicit integer row labels 0..11, passed as a list rather than a range.
    index=list(range(12)),
    columns=[
        'month',
        'avg_high',
        'avg_low',
        'record_high',
        'record_low',
        'avg_percip',
    ],
)


[ 1. Load hard-coded data into a df ]
--------------------------------------------------


### 2. Read data from text file into a dataframe
Note: the text file is not included, simply to keep my folder slim and tidy. The hard-coded data from step 1 is used instead in the following exercises.

In [None]:
header("2. Read data from file into a df")
# The two lines below are left commented out because 'weather.txt' does not
# exist alongside this notebook; the DataFrame built in step 1 is used instead.
# filename = 'weather.txt'
# df = pd.read_csv(filename)

In [33]:
# Show the whole DataFrame: every row and every column.
print(df)

   month  avg_high  avg_low  record_high  record_low  avg_percip
0    Jan        58       42           74          22        2.95
1    Feb        61       45           78          26        3.02
2    Mar        65       48           84          25        2.34
3    Apr        67       50           92          28        1.02
4    May        71       53           98          35        0.48
5    Jun        75       56          107          41        0.11
6    Jul        77       58          105          44        0.00
7    Aug        77       59          102          43        0.03
8    Sep        77       57          103          40        0.17
9    Oct        73       54           96          34        0.81
10   Nov        64       48           84          30        1.70
11   Dec        58       42           73          21        2.56


### 3. Print first 5 and last 3 rows of df

In [34]:
# head() returns five rows when called without an argument; the count is
# spelled out here so both calls read the same way.
header("3a. Print first 5 rows")
print(df.head(n=5))
header("3b. Print last 3 rows")
print(df.tail(n=3))


[ 3a. Print first 5 rows ]
--------------------------------------------------
  month  avg_high  avg_low  record_high  record_low  avg_percip
0   Jan        58       42           74          22        2.95
1   Feb        61       45           78          26        3.02
2   Mar        65       48           84          25        2.34
3   Apr        67       50           92          28        1.02
4   May        71       53           98          35        0.48

[ 3b. Print last 3 rows ]
--------------------------------------------------
   month  avg_high  avg_low  record_high  record_low  avg_percip
9    Oct        73       54           96          34        0.81
10   Nov        64       48           84          30        1.70
11   Dec        58       42           73          21        2.56


### 4. Get data types, index, columns, values

In [35]:
# Inspect the DataFrame's structure, one attribute at a time.
header("4a. Get data types")
# dtypes: the data type of each column.
print(df.dtypes)
header("4b. Get index")
# index: the row labels (the integers 0..11 given at construction).
print(df.index)
header("4c. Show columns")
# columns: the column labels.
print(df.columns)
header("4d. Show values")
# values: the table contents as an array, one inner row per DataFrame row.
# NOTE(review): newer pandas documentation recommends df.to_numpy() over
# .values - confirm the pandas version in use before switching.
print(df.values)


[ 4a. Get data types ]
--------------------------------------------------
month           object
avg_high         int64
avg_low          int64
record_high      int64
record_low       int64
avg_percip     float64
dtype: object

[ 4b. Get index ]
--------------------------------------------------
Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

[ 4c. Show columns ]
--------------------------------------------------
Index(['month', 'avg_high', 'avg_low', 'record_high', 'record_low',
       'avg_percip'],
      dtype='object')

[ 4d. Show values ]
--------------------------------------------------
[['Jan' 58 42 74 22 2.95]
 ['Feb' 61 45 78 26 3.02]
 ['Mar' 65 48 84 25 2.34]
 ['Apr' 67 50 92 28 1.02]
 ['May' 71 53 98 35 0.48]
 ['Jun' 75 56 107 41 0.11]
 ['Jul' 77 58 105 44 0.0]
 ['Aug' 77 59 102 43 0.03]
 ['Sep' 77 57 103 40 0.17]
 ['Oct' 73 54 96 34 0.81]
 ['Nov' 64 48 84 30 1.7]
 ['Dec' 58 42 73 21 2.56]]


### 5. Statistical summary of each column

In [36]:
header("5. df.describe() - statistical summary")
# describe() reports count, mean, std, min, the 25%/50%/75% quartiles and max.
# Only the numeric columns appear in the output; the string column 'month'
# is left out.
print(df.describe())


[ 5. df.describe() - statistical summary ]
--------------------------------------------------
        avg_high    avg_low  record_high  record_low  avg_percip
count  12.000000  12.000000    12.000000   12.000000   12.000000
mean   68.583333  51.000000    91.333333   32.416667    1.265833
std     7.366488   6.060303    12.323911    8.240238    1.186396
min    58.000000  42.000000    73.000000   21.000000    0.000000
25%    63.250000  47.250000    82.500000   25.750000    0.155000
50%    69.000000  51.500000    94.000000   32.000000    0.915000
75%    75.500000  56.250000   102.250000   40.250000    2.395000
max    77.000000  59.000000   107.000000   44.000000    3.020000


### 6. Sort records by any column

In [39]:
header("6. df.sort_values('record_high', 'ascending=False')")
# Order the rows from the highest record_high to the lowest. sort_values
# returns a new, sorted DataFrame, so df itself keeps its original order.
print(df.sort_values(by='record_high', ascending=False))


[ 6. df.sort_values('record_high', 'ascending=False') ]
--------------------------------------------------
   month  avg_high  avg_low  record_high  record_low  avg_percip
5    Jun        75       56          107          41        0.11
6    Jul        77       58          105          44        0.00
8    Sep        77       57          103          40        0.17
7    Aug        77       59          102          43        0.03
4    May        71       53           98          35        0.48
9    Oct        73       54           96          34        0.81
3    Apr        67       50           92          28        1.02
2    Mar        65       48           84          25        2.34
10   Nov        64       48           84          30        1.70
1    Feb        61       45           78          26        3.02
0    Jan        58       42           74          22        2.95
11   Dec        58       42           73          21        2.56
