In [1]:
import numpy as np

# Quick Introduction: Variables, Strings, and Arrays

In [5]:
# Assigning variables

# Python works out the type from the value being assigned (integer, float,
# or string), so you never have to declare it yourself!
a, b, c = 5, 10.0, 'hello'

# print each value next to a label naming its type
for label, value in (('int', a), ('float', b), ('string', c)):
    print(label, value)


int 5
float 10.0
string hello


In [8]:
# Math with variables

# You can do math between integers and floats: whenever the two types are
# mixed, Python returns a float, so no precision is lost.

test1 = a + b    # int + float -> float
test2 = a + 5    # int + int   -> this will stay integer!
test3 = a + 5.0  # int + float -> float

# print all three results (test1 was previously misspelled as "test",
# which raises a NameError on a fresh kernel)
print(test1)
print(test2)
print(test3)

15.0
10
10.0


### Arrays

In [15]:
# The NumPy Python package makes working with arrays incredibly easy.

# create an array of 10 random values
# (no random seed is set in this section, so the numbers change on every run)
arr = np.random.rand(10)
print(arr)

[0.39302983 0.31754156 0.75559002 0.96609151 0.72067645 0.6916647
 0.61050651 0.4849132  0.31543131 0.67851609]


In [17]:
# Math with arrays

# A scalar combined with an array is applied to every entry of that array.
arr_add = arr + 1.0   # add 1 to each entry
arr_mult = arr * 2.0  # double each entry

# show both results, separated by a blank line
print(arr_add)
print('')
print(arr_mult)

[1.39302983 1.31754156 1.75559002 1.96609151 1.72067645 1.6916647
 1.61050651 1.4849132  1.31543131 1.67851609]

[0.78605965 0.63508312 1.51118004 1.93218301 1.4413529  1.38332939
 1.22101302 0.9698264  0.63086262 1.35703219]


In [27]:
# Two arrays can also be added/multiplied entry by entry...for that, the
# two arrays need matching shapes!

# build an array holding the integers 0 through 9
ints = np.arange(10)
print(ints)
print('')

# every NumPy array carries a "shape" attribute
print(arr.shape, ints.shape)  # <-- both arrays have 10 entries
print('')

# add them together, entry by entry
arr_plus_int = arr + ints
print(arr_plus_int)

[0 1 2 3 4 5 6 7 8 9]

(10,) (10,)

[0.39302983 1.31754156 2.75559002 3.96609151 4.72067645 5.6916647
 6.61050651 7.4849132  8.31543131 9.67851609]


**Activity #1**

Find the element-wise average of the two arrays "a" and "b", and assign it to the variable "ab_mean". (Important: do not find the mean of EACH array; instead, create a new array in which each entry is the average of the corresponding entries of "a" and "b".)

In [None]:
# input arrays for the activity (five integer entries each)
a = np.array([2, 5, 8, 3, 6])
b = np.array([4, 3, 2, 3, 10])

# write your solution below this line





### Array Statistics

In [38]:
# NumPy makes it SUPER easy to compute statistics of an array.

# 30 random integers from 0 up to (but not including) 50, converted to floats
rand = np.random.randint(low=0, high=50, size=30).astype(float)
print(rand)

[27. 21.  1.  1. 11. 40. 29. 19. 19. 13. 12. 38. 49. 31. 17. 38. 42. 30.
  0. 33. 45. 29. 38. 36.  9. 44. 47. 48. 45.  4.]


In [39]:
# largest and smallest entries
print('max', np.max(rand), 'min', np.min(rand))

# arithmetic mean of all entries
print('mean', np.mean(rand))

# standard deviation of all entries
print('standard deviation', np.std(rand))

max 49.0 min 0.0
mean 27.2
standard deviation 15.321879780235843


In [45]:
# UH OH! Our dataset suddenly contains a NaN ("not a number")!
rand[5] = np.nan
print(rand)
print('')

# any arithmetic that touches a NaN gives NaN back...trouble!
print('Oh no!', 1.0 + np.nan)
print('')

# cue dramatic music: the ordinary statistics functions all return NaN too
print(np.max(rand), np.min(rand), np.mean(rand), np.std(rand))

[27. 21.  1.  1. 11. nan 29. 19. 19. 13. 12. 38. 49. 31. 17. 38. 42. 30.
  0. 33. 45. 29. 38. 36.  9. 44. 47. 48. 45.  4.]

Oh no! nan

nan nan nan nan


In [46]:
# don't worry...the fix is easy: use the "nan" versions of the same functions

# largest and smallest entries
print('max', np.nanmax(rand), 'min', np.nanmin(rand))

# arithmetic mean
print('mean', np.nanmean(rand))

# standard deviation
print('standard deviation', np.nanstd(rand))

max 49.0 min 0.0
mean 26.75862068965517
standard deviation 15.39515188446606


NumPy has quite a few statistical functions. See the link below for additional documentation:

https://numpy.org/doc/stable/reference/routines.statistics.html

### Indexing Arrays

Array "indexing" allows us to pull out only a section of an array! But here's the kicker: Python uses 0-based indexing, meaning the first entry is "the 0th entry".

In [48]:
# First entry of the array (index 0):
print(rand[0])

# Last entry of the array (index -1):
print(rand[-1])

27.0
4.0


In [51]:
# first four entries of the array (indices 0, 1, 2, 3):
print(rand[0:4])

# last four entries of the array:
print(rand[-4:])

[27. 21.  1.  1.]
[47. 48. 45.  4.]


Another great thing about NumPy arrays is that you can ask for the entries in an array that meet a certain condition! This is really handy when you have data that hasn't been quality-controlled (QC'd) yet.

In [53]:
# An example dataset in which some entries have the value 999.0
data = np.array([4., 5., 3.2, 999.0, 6., 5.4, 7.8, 9.2, 999.0, 3.1, 6., 8.])
print(data)
print('')

# those 999.0 entries really muddy up our statistics
print('max', np.max(data), 'min', np.min(data), 'mean', np.mean(data))

[  4.    5.    3.2 999.    6.    5.4   7.8   9.2 999.    3.1   6.    8. ]

max 999.0 min 3.1 mean 171.3083333333333


In [57]:
# The best way to deal with these 999 values is to replace them with NaNs

# Step one is a condition. We know this dataset uses 999 to mark bad data,
# so compare every entry against that value to get an array of True/False:
condition = (data == 999.0)
print(condition)


[False False False  True False False False False  True False False False]


In [58]:
# Now, we can index the array with that "condition":
# only the entries where the condition is True are returned
print(data[condition])

[999. 999.]


In [60]:
# The same condition can be used to ASSIGN new values, too!
data[condition] = np.nan
print(data)
print('')

# the NaN-aware functions now show the true statistics of the data
print('max', np.nanmax(data), 'min', np.nanmin(data), 'mean', np.nanmean(data))

[4.  5.  3.2 nan 6.  5.4 7.8 9.2 nan 3.1 6.  8. ]
max 9.2 min 3.1 mean 5.77


In [62]:
# What if we DON'T know the bad-data code? Just pick a reasonable condition

# start again from the original values, 999.0 entries included
data = np.array([4., 5., 3.2, 999.0, 6., 5.4, 7.8, 9.2, 999.0, 3.1, 6., 8.])

# flag every entry that is larger than 500
condition = (data > 500)

print(data[condition])

[999. 999.]


# Pandas: A Time-Series Analysis Package

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## A Real Data Example: Opening a CSV into a Pandas Dataframe

In [38]:
# Read in CSV file of CIGLR temperature and precipitation data from ERA5,
# using the "time" column as the row index
# NOTE(review): the file name ends in ".nc" but it is parsed with read_csv -- confirm the file really is CSV text
df = pd.read_csv('era5_ciglr_temp.nc',index_col='time')
df.index = pd.to_datetime(df.index) #<-- converts our time index into pandas datetimes, which is very useful for date-based selection
df

Unnamed: 0_level_0,t2,precip
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2019-01-01 00:00:00,277.37897,0.000142
2019-01-01 06:00:00,276.08395,0.000147
2019-01-01 12:00:00,275.01074,0.000003
2019-01-01 18:00:00,274.60864,0.000014
2019-01-02 00:00:00,272.00864,0.000016
...,...,...
2021-12-30 18:00:00,273.96036,0.000004
2021-12-31 00:00:00,274.14960,0.000012
2021-12-31 06:00:00,274.61150,0.000071
2021-12-31 12:00:00,277.37332,0.000033


In [39]:
# "t2" starts out as temperature in Kelvin. Let's convert it to Celsius:
#   T(C) = T(K) - 273.15
# NOTE: the column is overwritten in place, so re-running this cell without
# reloading the data subtracts the offset a second time.
KELVIN_TO_CELSIUS_OFFSET = 273.15

df['t2'] = df['t2'] - KELVIN_TO_CELSIUS_OFFSET
print(df.head())  # head() prints just the first 5 entries


# ACTIVITY!
# The precip column is in units of m (meters). Please convert it to units of mm (millimeters)



                          t2    precip
time                                  
2019-01-01 00:00:00  4.22897  0.000142
2019-01-01 06:00:00  2.93395  0.000147
2019-01-01 12:00:00  1.86074  0.000003
2019-01-01 18:00:00  1.45864  0.000014
2019-01-02 00:00:00 -1.14136  0.000016


## Indexing Pandas Dataframes

A dataframe with a Pandas Datetime index (i.e., the first column) is helpful, because we can index our time series based on a desired date, range of dates, year, time, etc. 

In [44]:
# Grab temp and precip data from one specific day
date = pd.to_datetime('2019-04-01 00:00:00')

# Keep every row whose timestamp falls on the same calendar day as `date`.
# normalize() resets the time-of-day to midnight, so comparing the normalized
# index against the normalized date matches all times on that day.
# (The previous version selected every row in month 5 and never used `date`.)
df.loc[df.index.normalize() == date.normalize()]

Unnamed: 0_level_0,t2,precip
time,Unnamed: 1_level_1,Unnamed: 2_level_1
