## Dealing with NaN's
### NaN in python

In [1]:
n1 = float("nan")
n2 = float("Nan")
n3 = float("NaN")
n4 = float("NAN")
print(n1, n2, n3, n4)

nan nan nan nan


In [2]:
import math

n1 = math.nan
print(n1)

nan


### NaN in pandas
### Example without NaN

In [36]:
import pandas as pd

df = pd.read_csv('data/temperatures.csv', sep=';', decimal=',')
df[:5]

Unnamed: 0,time,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
0,06:00:00,14.3,13.7,14.2,14.3,13.5,13.6
1,06:15:00,14.5,14.5,14.0,15.0,14.5,14.7
2,06:30:00,14.6,15.1,14.8,15.3,14.0,14.2
3,06:45:00,14.8,14.5,15.6,15.2,14.7,14.6
4,07:00:00,15.0,14.9,15.7,15.6,14.0,15.3


I want to calculate the average temperatures per measuring point over all the sensors.

In [37]:
df.mean(axis='rows')

  df.mean(axis='rows')


sensor1    19.775926
sensor2    19.757407
sensor3    19.840741
sensor4    20.187037
sensor5    19.181481
sensor6    19.437037
dtype: float64

In [38]:
average_temp_series = df.mean(axis='columns')
average_temp_series[:5]

  average_temp_series = df.mean(axis='columns')


0    13.933333
1    14.533333
2    14.666667
3    14.900000
4    15.083333
dtype: float64

In [43]:
sensors = df.columns.values[1:]
#all columns except time will be removed

df = df.drop(sensors, axis=1)

I'll assign the average temperature values as a new column 'temperature':

In [46]:
df = df.assign(temperature=average_temp_series)
df[:3]

Unnamed: 0,time,temperature
0,06:00:00,13.933333
1,06:15:00,14.533333
2,06:30:00,14.666667


### Example with NaN

In [49]:
import pandas as pd

temp_df = pd.read_csv("data/temperatures.csv",sep=";", index_col=0, decimal=",")
temp_df[:3]

Unnamed: 0_level_0,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
06:00:00,14.3,13.7,14.2,14.3,13.5,13.6
06:15:00,14.5,14.5,14.0,15.0,14.5,14.7
06:30:00,14.6,15.1,14.8,15.3,14.0,14.2


Demonstration of how where works

In [53]:
s = pd.Series(range(4))
s.where(s>1)

0    NaN
1    NaN
2    2.0
3    3.0
dtype: float64

In [54]:
import numpy as np
import pandas as pd

A = np.random.randint(1, 30, (4,2))
df = pd.DataFrame(A, columns=['Foo', 'Bar'])
df

Unnamed: 0,Foo,Bar
0,20,14
1,19,12
2,12,11
3,21,29


In [60]:
-df

Unnamed: 0,Foo,Bar
0,-20,-14
1,19,-12
2,-12,11
3,21,29


In [59]:
m = df % 2 == 0
df.where(m, -df, inplace=True)
df

Unnamed: 0,Foo,Bar
0,20,14
1,-19,12
2,12,-11
3,-21,-29


In [67]:
random_df = pd.DataFrame(np.random.random(size=temp_df.shape), columns=temp_df.columns.values, index=temp_df.index)
random_df[:3]

Unnamed: 0_level_0,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
06:00:00,0.096018,0.17786,0.127828,0.347214,0.242017,0.485551
06:15:00,0.620883,0.119218,0.486433,0.21627,0.901529,0.585665
06:30:00,0.785559,0.091942,0.197307,0.041134,0.388347,0.679264


In [69]:
nan_df = pd.DataFrame(np.nan,columns=temp_df.columns.values, index=temp_df.index)
nan_df[:3]

Unnamed: 0_level_0,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
06:00:00,,,,,,
06:15:00,,,,,,
06:30:00,,,,,,


In [71]:
df_bool = random_df < 0.8
df_bool[:3]

Unnamed: 0_level_0,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
06:00:00,True,True,True,True,True,True
06:15:00,True,True,True,True,False,True
06:30:00,True,True,True,True,True,True


In [83]:
disturbed_data = temp_df.where(df_bool, nan_df)
disturbed_data[:3]

Unnamed: 0_level_0,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
06:00:00,14.3,13.7,14.2,14.3,13.5,13.6
06:15:00,14.5,14.5,14.0,15.0,,14.7
06:30:00,14.6,15.1,14.8,15.3,14.0,14.2


In [74]:
disturbed_data.to_csv('data/temperatures_with_NAN.csv')
disturbed_data[:5]

Unnamed: 0_level_0,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
06:00:00,14.3,13.7,14.2,14.3,13.5,13.6
06:15:00,14.5,14.5,14.0,15.0,,14.7
06:30:00,14.6,15.1,14.8,15.3,14.0,14.2
06:45:00,14.8,14.5,15.6,15.2,14.7,14.6
07:00:00,15.0,14.9,15.7,15.6,14.0,


### Using Dropna on the DataFrame

In [79]:
df = disturbed_data.dropna()  #if axis=1, dropna drops columns
df[:5]

Unnamed: 0_level_0,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
06:00:00,14.3,13.7,14.2,14.3,13.5,13.6
06:30:00,14.6,15.1,14.8,15.3,14.0,14.2
06:45:00,14.8,14.5,15.6,15.2,14.7,14.6
07:30:00,15.4,15.3,15.6,15.6,14.7,15.1
08:00:00,15.7,15.6,15.9,16.2,15.4,15.4


I'll take out all the rows, where more than one NaN value occurred.

In [84]:
cleansed_df = disturbed_data.dropna(thresh=5, axis=0)
cleansed_df[:5]

Unnamed: 0_level_0,sensor1,sensor2,sensor3,sensor4,sensor5,sensor6
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
06:00:00,14.3,13.7,14.2,14.3,13.5,13.6
06:15:00,14.5,14.5,14.0,15.0,,14.7
06:30:00,14.6,15.1,14.8,15.3,14.0,14.2
06:45:00,14.8,14.5,15.6,15.2,14.7,14.6
07:00:00,15.0,14.9,15.7,15.6,14.0,


In [92]:
average_temp_series = cleansed_df.mean(axis=1)
sensors = cleansed_df.columns.values
df = cleansed_df.drop(sensors, axis=1)
df = df.assign(temperature=average_temp_series)
df[:5]

Unnamed: 0_level_0,temperature
time,Unnamed: 1_level_1
06:00:00,13.933333
06:15:00,14.54
06:30:00,14.666667
06:45:00,14.9
07:00:00,15.04


## Binning in Python and Pandas
The following function can be used to create bins

In [1]:
def create_bins(lower_bound, width, quantity):
    """
    Create bins returns an equal_width (distance) partitioning.
    It returns an ascending list of tuple, representing the i intervals.
    A tuple bins[i], i.e. (bins[i][0], bins[i][1]) with i > 0 and i < quantity, satisfies the following conditions:
              (1) bins[i][0] + width == bins[i][1]
              (2) bins[i-1][0] + width == bins[i][0] and
              (3) bins[i-1][1] + width == bins[i][1]
    """
    bins = []
    for low in range(lower_bound, lower_bound + quantity*width, width):
        bins.append((low, low + width))
    return bins

In [2]:
bins = create_bins(10, 10, 5)
bins

[(10, 20), (20, 30), (30, 40), (40, 50), (50, 60)]

In [3]:
def find_bin(value, bins):
    """
    bins is a list of tuples, like [(0,20), (20, 40), (40, 60)],
    binning returns the smallest index i of bins so that
    bin[i][0] <= value < bin[i][1]
    
    """
    for i in range(0, len(bins)):
        if bins[i][0] <= value < bins[i][1]:
            return i

In [4]:
from collections import Counter

bins = create_bins(50, 4, 11)
weights_of_persons = [73.4, 69.3, 64.9, 75.6, 74.9, 80.3, 78.6, 84.1, 88.9, 
                      90.3, 83.4, 69.3,52.4, 58.3, 67.4, 74.0, 89.3, 63.4]

binned_weights = []
for value in weights_of_persons:
    bin_index = find_bin(value, bins)
    print(value, bin_index, bins[bin_index])
    binned_weights.append(bin_index)
    
frequencies = Counter(binned_weights)
print(frequencies)

73.4 5 (70, 74)
69.3 4 (66, 70)
64.9 3 (62, 66)
75.6 6 (74, 78)
74.9 6 (74, 78)
80.3 7 (78, 82)
78.6 7 (78, 82)
84.1 8 (82, 86)
88.9 9 (86, 90)
90.3 10 (90, 94)
83.4 8 (82, 86)
69.3 4 (66, 70)
52.4 0 (50, 54)
58.3 2 (58, 62)
67.4 4 (66, 70)
74.0 6 (74, 78)
89.3 9 (86, 90)
63.4 3 (62, 66)
Counter({4: 3, 6: 3, 3: 2, 7: 2, 8: 2, 9: 2, 5: 1, 10: 1, 0: 1, 2: 1})


### Binning with Pandas

In [5]:
import pandas as pd

bins2 = pd.IntervalIndex.from_tuples(bins, closed='left')
bins2

IntervalIndex([[50, 54), [54, 58), [58, 62), [62, 66), [66, 70) ... [74, 78), [78, 82), [82, 86), [86, 90), [90, 94)], dtype='interval[int64, left]')

In [6]:
categorical_object = pd.cut(weights_of_persons, bins2)
print(categorical_object)

[[70, 74), [66, 70), [62, 66), [74, 78), [74, 78), ..., [58, 62), [66, 70), [74, 78), [86, 90), [62, 66)]
Length: 18
Categories (11, interval[int64, left]): [[50, 54) < [54, 58) < [58, 62) < [62, 66) ... [78, 82) < [82, 86) < [86, 90) < [90, 94)]


### Other ways to define bins

In [8]:
categorical_object = pd.cut(weights_of_persons, 18)
print(categorical_object)

[(71.35, 73.456], (69.244, 71.35], (62.928, 65.033], (75.561, 77.667], (73.456, 75.561], ..., (56.611, 58.717], (67.139, 69.244], (73.456, 75.561], (88.194, 90.3], (62.928, 65.033]]
Length: 18
Categories (18, interval[float64, right]): [(52.362, 54.506] < (54.506, 56.611] < (56.611, 58.717] < (58.717, 60.822] ... (81.878, 83.983] < (83.983, 86.089] < (86.089, 88.194] < (88.194, 90.3]]


In [13]:
sequence_of_scalars = [ x[0] for x in bins]
sequence_of_scalars.append(bins[-1][1])
print(sequence_of_scalars)
categorical_object = pd.cut(weights_of_persons, sequence_of_scalars, right=False)
categorical_object

[50, 54, 58, 62, 66, 70, 74, 78, 82, 86, 90, 94]


[[70, 74), [66, 70), [62, 66), [74, 78), [74, 78), ..., [58, 62), [66, 70), [74, 78), [86, 90), [62, 66)]
Length: 18
Categories (11, interval[int64, left]): [[50, 54) < [54, 58) < [58, 62) < [62, 66) ... [78, 82) < [82, 86) < [86, 90) < [90, 94)]

### Bin counts and Value counts

In [16]:
pd.value_counts(categorical_object)

[66, 70)    3
[74, 78)    3
[62, 66)    2
[78, 82)    2
[82, 86)    2
[86, 90)    2
[50, 54)    1
[58, 62)    1
[70, 74)    1
[90, 94)    1
[54, 58)    0
dtype: int64

"categorical_object.codes" provides me with a labelling of the input values into the binning categories

In [17]:
labels = categorical_object.codes
labels

array([ 5,  4,  3,  6,  6,  7,  7,  8,  9, 10,  8,  4,  0,  2,  4,  6,  9,
        3], dtype=int8)

Categories is the IntervalIndex of the categories of the label indices:

In [18]:
categories = categorical_object.categories
categories

IntervalIndex([[50, 54), [54, 58), [58, 62), [62, 66), [66, 70) ... [74, 78), [78, 82), [82, 86), [86, 90), [90, 94)], dtype='interval[int64, left]')

In [23]:
for index in range(len(weights_of_persons)):
    label_index = labels[index]
    print(weights_of_persons[index], label_index, categories[label_index])

73.4 5 [70, 74)
69.3 4 [66, 70)
64.9 3 [62, 66)
75.6 6 [74, 78)
74.9 6 [74, 78)
80.3 7 [78, 82)
78.6 7 [78, 82)
84.1 8 [82, 86)
88.9 9 [86, 90)
90.3 10 [90, 94)
83.4 8 [82, 86)
69.3 4 [66, 70)
52.4 0 [50, 54)
58.3 2 [58, 62)
67.4 4 [66, 70)
74.0 6 [74, 78)
89.3 9 [86, 90)
63.4 3 [62, 66)


### Naming Bins

In [29]:
degrees = ['None', 'cum laude', 'Magna cum laude', 'summa cum laude']
student_results = [3.93, 3.24, 2.80, 2.83, 3.91, 3.698, 3.731, 3.25, 3.24, 3.82, 3.22]

student_result_degrees = pd.cut(student_results, [0, 3.6, 3.8, 3.9, 4.0], labels=degrees)
pd.value_counts(student_result_degrees)

None               6
cum laude          2
summa cum laude    2
Magna cum laude    1
dtype: int64

In [32]:
labels = student_result_degrees.codes
categories = student_result_degrees.categories
for index in range(len(student_results)):
    label_index = labels[index]
    print(student_results[index], label_index, categories[label_index] )

3.93 3 summa cum laude
3.24 0 None
2.8 0 None
2.83 0 None
3.91 3 summa cum laude
3.698 1 cum laude
3.731 1 cum laude
3.25 0 None
3.24 0 None
3.82 2 Magna cum laude
3.22 0 None


## Advanced or Muti-level indexing

In [35]:
import pandas as pd

cities = ["Vienna", "Vienna", "Vienna",
          "Hamburg", "Hamburg", "Hamburg",
          "Berlin", "Berlin", "Berlin",
          "Zürich", "Zürich", "Zürich"]

index = [cities, ["country", "area", "population",
                  "country", "area", "population",
                  "country", "area", "population",
                  "country", "area", "population"]]

print(index)

[['Vienna', 'Vienna', 'Vienna', 'Hamburg', 'Hamburg', 'Hamburg', 'Berlin', 'Berlin', 'Berlin', 'Zürich', 'Zürich', 'Zürich'], ['country', 'area', 'population', 'country', 'area', 'population', 'country', 'area', 'population', 'country', 'area', 'population']]


In [37]:
data = ["Austria", 414.60, 1805681,
        "Germany", 755.00, 1760433,
        "Germany", 891.85, 3562166,
        "Switzerland", 87.88, 378884]
df  = pd.Series(data, index=index)
df

Vienna   country           Austria
         area                414.6
         population        1805681
Hamburg  country           Germany
         area                755.0
         population        1760433
Berlin   country           Germany
         area               891.85
         population        3562166
Zürich   country       Switzerland
         area                87.88
         population         378884
dtype: object

I can access the data of a city in the following way:

In [39]:
df['Vienna']

country       Austria
area            414.6
population    1805681
dtype: object

I can also access the information about the country, area or population of a city. I can do this in two ways:

In [40]:
df['Vienna']['area']

414.6

In [41]:
df['Vienna', 'area']

414.6

I can also get the content of multiple cities at the same time by using a list of city names as the key.

In [50]:
df['Berlin',:]

country       Germany
area           891.85
population    3562166
dtype: object

If the index is sorted, I can also apply a slicing operation:

In [54]:
city_series = df.sort_index()
print("city_series with sorted index:")
print(city_series)

city_series with sorted index:
Berlin   area               891.85
         country           Germany
         population        3562166
Hamburg  area                755.0
         country           Germany
         population        1760433
Vienna   area                414.6
         country           Austria
         population        1805681
Zürich   area                87.88
         country       Switzerland
         population         378884
dtype: object


In [55]:
print("Slicing the city_series:")
city_series["Berlin":"Vienna"]

Slicing the city_series:


Berlin   area           891.85
         country       Germany
         population    3562166
Hamburg  area            755.0
         country       Germany
         population    1760433
Vienna   area            414.6
         country       Austria
         population    1805681
dtype: object

It is also possible to access the inner keys as well

In [57]:
print(city_series[:, "population"])

Berlin     3562166
Hamburg    1760433
Vienna     1805681
Zürich      378884
dtype: object


### Swapping multiindex levels

In [64]:
city_series = city_series.swaplevel()
city_series.sort_index(inplace=True)
city_series

Berlin   area               891.85
         country           Germany
         population        3562166
Hamburg  area                755.0
         country           Germany
         population        1760433
Vienna   area                414.6
         country           Austria
         population        1805681
Zürich   area                87.88
         country       Switzerland
         population         378884
dtype: object

## Python date and time

### The date class

In [67]:
from datetime import date
x = date(1999, 11, 15)
print(x)

1999-11-15


In [68]:
print(date.min)
print(date.max)

0001-01-01
9999-12-31


I can apply various methods to the date instance above. The method toordinal returns the proleptic Gregorian ordinal.

In [69]:
x.toordinal()

730073

It is also possible to calculate a date from a ordinal by using the class method "fromordinal"

In [71]:
date.fromordinal(730073)

datetime.date(1999, 11, 15)

If I want to know the weekday of a certain date, I can calculate it by using the method weekday:

In [72]:
x.weekday()  #monday == 0 --- sunday == 6

0

In [73]:
date.today()

datetime.date(2023, 3, 16)

I can access the day, month and year with attributes:

In [74]:
print(x.day)
print(x.month)
print(x.year)

15
11
1999


### The time class

In [75]:
from datetime import time
t = time(10, 4, 45)
print(t)

10:04:45


The possible times range between:

In [76]:
print(time.min)
print(time.max)

00:00:00
23:59:59.999999


Accessing 'hour', 'minute' and 'second':

In [77]:
t.hour, t.minute, t.second

(10, 4, 45)

Each component of a time instance can be changed by using 'replace':

In [78]:
t = t.replace(minute=34, second=20)
t

datetime.time(10, 34, 20)

I can render a date as a C-style like string, corresponding to the C ctime function:

In [79]:
x.ctime()

'Mon Nov 15 00:00:00 1999'

### The datetime class
The datetime module provides us with functions and methods for manipulating dates and times. There are two kinds of date and time objects:
• naive
• aware

In [80]:
from datetime import datetime
t = datetime(2017, 4, 19, 16, 31, 0)
t

datetime.datetime(2017, 4, 19, 16, 31)

In [81]:
#t is naive, because the following is True:
t.tzinfo == None

True

In [84]:
from datetime import datetime
import pytz

t = datetime.now(pytz.utc)
t

datetime.datetime(2023, 3, 16, 7, 44, 51, 255237, tzinfo=<UTC>)

### Converting datetime objects into strings

In [85]:
d1 = datetime(1991, 4, 30)
s = str(d1)
s

'1991-04-30 00:00:00'

### Creating datetime objects from strings

In [89]:
from datetime import datetime

t = datetime.strptime('15 November 1999', '%d %B %Y')
t

datetime.datetime(1999, 11, 15, 0, 0)

In [90]:
dt = "2007-03-04T21:08:12"
datetime.strptime(dt, "%Y-%m-%dT%H:%M:%S" )

datetime.datetime(2007, 3, 4, 21, 8, 12)

In [91]:
dt = '12/24/1957 4:03:29 AM'
dt = datetime.strptime(dt, '%m/%d/%Y %I:%M:%S %p')
dt

datetime.datetime(1957, 12, 24, 4, 3, 29)

In [94]:
dt = 'Wednesday April 12 20:29:53 2017'
dt = datetime.strptime(dt, '%A %B %d %H:%M:%S %Y')
print(dt)

2017-04-12 20:29:53


Though datetime.strptime is an easy way to parse a date with a known format, it can be quote complicated and cumbersome to write every time a new specification string for new date formats.

In [95]:
from dateutil.parser import parse
parse('15 November 1999')

datetime.datetime(1999, 11, 15, 0, 0)

In [98]:
t = parse('Wednesday April 12 20:29:53 2017')
t

datetime.datetime(2017, 4, 12, 20, 29, 53)

## Python, Pandas and Time series

### Time series in pandas and python

In [103]:
import numpy as np
import pandas as pd

from datetime import datetime, timedelta as delta

ndays = 10
start = datetime(2017, 3, 31)
dates = [(start - delta(days=x)) for x in range(0, ndays)]
values = [25, 50, 15, 67, 70, 9, 28, 30, 32, 12]

ts = pd.Series(values, index=dates)
ts

2017-03-31    25
2017-03-30    50
2017-03-29    15
2017-03-28    67
2017-03-27    70
2017-03-26     9
2017-03-25    28
2017-03-24    30
2017-03-23    32
2017-03-22    12
dtype: int64

In [104]:
type(ts)

pandas.core.series.Series

In [105]:
ts.index

DatetimeIndex(['2017-03-31', '2017-03-30', '2017-03-29', '2017-03-28',
               '2017-03-27', '2017-03-26', '2017-03-25', '2017-03-24',
               '2017-03-23', '2017-03-22'],
              dtype='datetime64[ns]', freq=None)

In [106]:
values2 = [32, 54, 18, 61, 72, 19, 21, 33, 29, 17]
ts2 = pd.Series(values2, index=dates)
ts2

2017-03-31    32
2017-03-30    54
2017-03-29    18
2017-03-28    61
2017-03-27    72
2017-03-26    19
2017-03-25    21
2017-03-24    33
2017-03-23    29
2017-03-22    17
dtype: int64

It is possible to use arithmetic operations on time series like we did with other series

In [107]:
ts + ts2

2017-03-31     57
2017-03-30    104
2017-03-29     33
2017-03-28    128
2017-03-27    142
2017-03-26     28
2017-03-25     49
2017-03-24     63
2017-03-23     61
2017-03-22     29
dtype: int64

Arithmetic mean between both Series.

In [109]:
(ts + ts2) / 2

2017-03-31    28.5
2017-03-30    52.0
2017-03-29    16.5
2017-03-28    64.0
2017-03-27    71.0
2017-03-26    14.0
2017-03-25    24.5
2017-03-24    31.5
2017-03-23    30.5
2017-03-22    14.5
dtype: float64

In [112]:
import pandas as pd
from datetime import datetime, timedelta as delta

ndays = 10
start = datetime(2017, 3, 31)
dates = [start - delta(days=x) for x in range(0, ndays)]

start2 = datetime(2017, 3, 26)
dates2 = [start2 - delta(days=x) for x in range(0, ndays)]

values = [25, 50, 15, 67, 70, 9, 28, 30, 32, 12]
values2 = [32, 54, 18, 61, 72, 19, 21, 33, 29, 17]

ts = pd.Series(values, index=dates)
ts2 = pd.Series(values2, index=dates2)

In [114]:
ts

2017-03-31    25
2017-03-30    50
2017-03-29    15
2017-03-28    67
2017-03-27    70
2017-03-26     9
2017-03-25    28
2017-03-24    30
2017-03-23    32
2017-03-22    12
dtype: int64

In [115]:
ts2

2017-03-26    32
2017-03-25    54
2017-03-24    18
2017-03-23    61
2017-03-22    72
2017-03-21    19
2017-03-20    21
2017-03-19    33
2017-03-18    29
2017-03-17    17
dtype: int64

In [116]:
ts + ts2

2017-03-17     NaN
2017-03-18     NaN
2017-03-19     NaN
2017-03-20     NaN
2017-03-21     NaN
2017-03-22    84.0
2017-03-23    93.0
2017-03-24    48.0
2017-03-25    82.0
2017-03-26    41.0
2017-03-27     NaN
2017-03-28     NaN
2017-03-29     NaN
2017-03-30     NaN
2017-03-31     NaN
Freq: D, dtype: float64

### Creating date ranges
The date_range method of the pandas module can be used to generate a DatetimeIndex:

In [117]:
import pandas as pd

dates = pd.date_range('12/24/1970', '01/03/1971')
dates

DatetimeIndex(['1970-12-24', '1970-12-25', '1970-12-26', '1970-12-27',
               '1970-12-28', '1970-12-29', '1970-12-30', '1970-12-31',
               '1971-01-01', '1971-01-02', '1971-01-03'],
              dtype='datetime64[ns]', freq='D')

I passed a start and an end date to date_range in my previous example. It is also possible to pass only a start or an end date to the function.

In [118]:
dates = pd.date_range(start='12/24/1970', periods=5)
dates

DatetimeIndex(['1970-12-24', '1970-12-25', '1970-12-26', '1970-12-27',
               '1970-12-28'],
              dtype='datetime64[ns]', freq='D')

In [119]:
dates = pd.date_range(end='12/24/1970', periods=3)
dates

DatetimeIndex(['1970-12-22', '1970-12-23', '1970-12-24'], dtype='datetime64[ns]', freq='D')

I can also create time frequencies, which consists only of business days by setting the keyword parameter 'freq' to the string 'B':

In [121]:
dates = pd.date_range('2023-3-10', '2023-3-17', freq='B')
dates

DatetimeIndex(['2023-03-10', '2023-03-13', '2023-03-14', '2023-03-15',
               '2023-03-16', '2023-03-17'],
              dtype='datetime64[ns]', freq='B')

I'll create a time frequency which contains the month ends between two dates.

In [122]:
dates = pd.date_range('2016-02-25', '2016-07-02', freq="M")
dates

DatetimeIndex(['2016-02-29', '2016-03-31', '2016-04-30', '2016-05-31',
               '2016-06-30'],
              dtype='datetime64[ns]', freq='M')