### Time Series in files P1
- Source: https://www.kaggle.com/datasets/suyashlakhani/apple-stock-prices-20152020

In [2]:
import pandas as pd

In [3]:
cols = ['date', 'high', 'low', 'open', 'close']
apple_stock = pd.read_csv('AAPL.csv', usecols=cols)
apple_stock

Unnamed: 0,date,close,high,low,open
0,2015-05-27 00:00:00+00:00,132.045,132.260,130.0500,130.34
1,2015-05-28 00:00:00+00:00,131.780,131.950,131.1000,131.86
2,2015-05-29 00:00:00+00:00,130.280,131.450,129.9000,131.23
3,2015-06-01 00:00:00+00:00,130.535,131.390,130.0500,131.20
4,2015-06-02 00:00:00+00:00,129.960,130.655,129.3200,129.86
...,...,...,...,...,...
1253,2020-05-18 00:00:00+00:00,314.960,316.500,310.3241,313.17
1254,2020-05-19 00:00:00+00:00,313.140,318.520,313.0100,315.03
1255,2020-05-20 00:00:00+00:00,319.230,319.520,316.2000,316.68
1256,2020-05-21 00:00:00+00:00,316.850,320.890,315.8700,318.66


#### Reading datetime objects from files

In [None]:
apple_stock.info()        # The date is considered just an object here

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    1258 non-null   object 
 1   close   1258 non-null   float64
 2   high    1258 non-null   float64
 3   low     1258 non-null   float64
 4   open    1258 non-null   float64
dtypes: float64(4), object(1)
memory usage: 49.3+ KB


In [5]:
apple_stock.set_index('date', inplace=True)

In [None]:
apple_stock.index    # Now, we have dates as our index, but they are still considered an object

Index(['2015-05-27 00:00:00+00:00', '2015-05-28 00:00:00+00:00',
       '2015-05-29 00:00:00+00:00', '2015-06-01 00:00:00+00:00',
       '2015-06-02 00:00:00+00:00', '2015-06-03 00:00:00+00:00',
       '2015-06-04 00:00:00+00:00', '2015-06-05 00:00:00+00:00',
       '2015-06-08 00:00:00+00:00', '2015-06-09 00:00:00+00:00',
       ...
       '2020-05-11 00:00:00+00:00', '2020-05-12 00:00:00+00:00',
       '2020-05-13 00:00:00+00:00', '2020-05-14 00:00:00+00:00',
       '2020-05-15 00:00:00+00:00', '2020-05-18 00:00:00+00:00',
       '2020-05-19 00:00:00+00:00', '2020-05-20 00:00:00+00:00',
       '2020-05-21 00:00:00+00:00', '2020-05-22 00:00:00+00:00'],
      dtype='object', name='date', length=1258)

In [7]:
pd.to_datetime(apple_stock.index)     # They are finally considered datetime object

DatetimeIndex(['2015-05-27 00:00:00+00:00', '2015-05-28 00:00:00+00:00',
               '2015-05-29 00:00:00+00:00', '2015-06-01 00:00:00+00:00',
               '2015-06-02 00:00:00+00:00', '2015-06-03 00:00:00+00:00',
               '2015-06-04 00:00:00+00:00', '2015-06-05 00:00:00+00:00',
               '2015-06-08 00:00:00+00:00', '2015-06-09 00:00:00+00:00',
               ...
               '2020-05-11 00:00:00+00:00', '2020-05-12 00:00:00+00:00',
               '2020-05-13 00:00:00+00:00', '2020-05-14 00:00:00+00:00',
               '2020-05-15 00:00:00+00:00', '2020-05-18 00:00:00+00:00',
               '2020-05-19 00:00:00+00:00', '2020-05-20 00:00:00+00:00',
               '2020-05-21 00:00:00+00:00', '2020-05-22 00:00:00+00:00'],
              dtype='datetime64[ns, UTC]', name='date', length=1258, freq=None)

#### We can also directly do that using ```parse_dates``` when reading csv

In [8]:
apple_stock = pd.read_csv('AAPL.csv', usecols=cols, parse_dates=['date'], index_col='date')
apple_stock

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2015-05-27 00:00:00+00:00,132.045,132.260,130.0500,130.34
2015-05-28 00:00:00+00:00,131.780,131.950,131.1000,131.86
2015-05-29 00:00:00+00:00,130.280,131.450,129.9000,131.23
2015-06-01 00:00:00+00:00,130.535,131.390,130.0500,131.20
2015-06-02 00:00:00+00:00,129.960,130.655,129.3200,129.86
...,...,...,...,...
2020-05-18 00:00:00+00:00,314.960,316.500,310.3241,313.17
2020-05-19 00:00:00+00:00,313.140,318.520,313.0100,315.03
2020-05-20 00:00:00+00:00,319.230,319.520,316.2000,316.68
2020-05-21 00:00:00+00:00,316.850,320.890,315.8700,318.66


#### Cool! Now, we can use them as datetime objects

In [9]:
days = apple_stock.index.day_name()      # Get the day name for each date
apple_stock.insert(0, 'day', days)
apple_stock

Unnamed: 0_level_0,day,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-05-27 00:00:00+00:00,Wednesday,132.045,132.260,130.0500,130.34
2015-05-28 00:00:00+00:00,Thursday,131.780,131.950,131.1000,131.86
2015-05-29 00:00:00+00:00,Friday,130.280,131.450,129.9000,131.23
2015-06-01 00:00:00+00:00,Monday,130.535,131.390,130.0500,131.20
2015-06-02 00:00:00+00:00,Tuesday,129.960,130.655,129.3200,129.86
...,...,...,...,...,...
2020-05-18 00:00:00+00:00,Monday,314.960,316.500,310.3241,313.17
2020-05-19 00:00:00+00:00,Tuesday,313.140,318.520,313.0100,315.03
2020-05-20 00:00:00+00:00,Wednesday,319.230,319.520,316.2000,316.68
2020-05-21 00:00:00+00:00,Thursday,316.850,320.890,315.8700,318.66


In [None]:
apple_stock.info()     # The index is a datetime index now

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2015-05-27 00:00:00+00:00 to 2020-05-22 00:00:00+00:00
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   day     1258 non-null   object 
 1   close   1258 non-null   float64
 2   high    1258 non-null   float64
 3   low     1258 non-null   float64
 4   open    1258 non-null   float64
dtypes: float64(4), object(1)
memory usage: 59.0+ KB


### Time Series in files P2

In [12]:
cols = ['date', 'high', 'low', 'open', 'close']
apple_stock = pd.read_csv('AAPL.csv', usecols=cols, parse_dates=['date'])
apple_stock

Unnamed: 0,date,close,high,low,open
0,2015-05-27 00:00:00+00:00,132.045,132.260,130.0500,130.34
1,2015-05-28 00:00:00+00:00,131.780,131.950,131.1000,131.86
2,2015-05-29 00:00:00+00:00,130.280,131.450,129.9000,131.23
3,2015-06-01 00:00:00+00:00,130.535,131.390,130.0500,131.20
4,2015-06-02 00:00:00+00:00,129.960,130.655,129.3200,129.86
...,...,...,...,...,...
1253,2020-05-18 00:00:00+00:00,314.960,316.500,310.3241,313.17
1254,2020-05-19 00:00:00+00:00,313.140,318.520,313.0100,315.03
1255,2020-05-20 00:00:00+00:00,319.230,319.520,316.2000,316.68
1256,2020-05-21 00:00:00+00:00,316.850,320.890,315.8700,318.66


#### If we read the csv without parsing the date, it will end up as a regular object

In [13]:
apple_stock = pd.read_csv('AAPL.csv', usecols=cols)
apple_stock.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258 entries, 0 to 1257
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   date    1258 non-null   object 
 1   close   1258 non-null   float64
 2   high    1258 non-null   float64
 3   low     1258 non-null   float64
 4   open    1258 non-null   float64
dtypes: float64(4), object(1)
memory usage: 49.3+ KB


#### Let's create a new date format that isn't clean, so that we can try convert it into a valid Timestamp

In [16]:
date = pd.to_datetime(apple_stock['date'])
new_format = date.dt.strftime('%D.%m.%Y__%H:%M__%p')
new_format

0       05/27/15.05.2015__00:00__AM
1       05/28/15.05.2015__00:00__AM
2       05/29/15.05.2015__00:00__AM
3       06/01/15.06.2015__00:00__AM
4       06/02/15.06.2015__00:00__AM
                   ...             
1253    05/18/20.05.2020__00:00__AM
1254    05/19/20.05.2020__00:00__AM
1255    05/20/20.05.2020__00:00__AM
1256    05/21/20.05.2020__00:00__AM
1257    05/22/20.05.2020__00:00__AM
Name: date, Length: 1258, dtype: object

#### Replace the old index with the new format

In [18]:
apple_stock.set_index(new_format, inplace=True)
apple_stock.drop(columns='date', inplace=True)
apple_stock

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
05/27/15.05.2015__00:00__AM,132.045,132.260,130.0500,130.34
05/28/15.05.2015__00:00__AM,131.780,131.950,131.1000,131.86
05/29/15.05.2015__00:00__AM,130.280,131.450,129.9000,131.23
06/01/15.06.2015__00:00__AM,130.535,131.390,130.0500,131.20
06/02/15.06.2015__00:00__AM,129.960,130.655,129.3200,129.86
...,...,...,...,...
05/18/20.05.2020__00:00__AM,314.960,316.500,310.3241,313.17
05/19/20.05.2020__00:00__AM,313.140,318.520,313.0100,315.03
05/20/20.05.2020__00:00__AM,319.230,319.520,316.2000,316.68
05/21/20.05.2020__00:00__AM,316.850,320.890,315.8700,318.66


#### Create a csv file that contains our poorly formatted dates

In [19]:
apple_stock.to_csv('bad_apple.csv')

In [20]:
# Read our csv and parse the dates (It will fail)
bad_apple = pd.read_csv('bad_apple.csv', parse_dates=['date'])
bad_apple

  bad_apple = pd.read_csv('bad_apple.csv', parse_dates=['date'])


Unnamed: 0,date,close,high,low,open
0,05/27/15.05.2015__00:00__AM,132.045,132.260,130.0500,130.34
1,05/28/15.05.2015__00:00__AM,131.780,131.950,131.1000,131.86
2,05/29/15.05.2015__00:00__AM,130.280,131.450,129.9000,131.23
3,06/01/15.06.2015__00:00__AM,130.535,131.390,130.0500,131.20
4,06/02/15.06.2015__00:00__AM,129.960,130.655,129.3200,129.86
...,...,...,...,...,...
1253,05/18/20.05.2020__00:00__AM,314.960,316.500,310.3241,313.17
1254,05/19/20.05.2020__00:00__AM,313.140,318.520,313.0100,315.03
1255,05/20/20.05.2020__00:00__AM,319.230,319.520,316.2000,316.68
1256,05/21/20.05.2020__00:00__AM,316.850,320.890,315.8700,318.66


#### Try to convert into a timestamp

In [None]:
pd.to_datetime(bad_apple['date'])     # Doesn't work

  pd.to_datetime(bad_apple['date'])


DateParseError: Unknown datetime string format, unable to parse: 05/27/15.05.2015__00:00__AM, at position 0

In [38]:
good_dates = pd.to_datetime(bad_apple['date'], format='%D.%M.%Y__%H:%M__%p')
good_dates

ValueError: 'D' is a bad directive in format '%D.%M.%Y__%H:%M__%p'

In [None]:
bad_apple.set_index(good_dates, inplace=True)
bad_apple.drop(columns='date', inplace=True)
bad_apple                                      # Although the output is wrong bcz of the above error

Unnamed: 0_level_0,close,high,low,open
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
05/27/15.05.2015__00:00__AM,132.045,132.260,130.0500,130.34
05/28/15.05.2015__00:00__AM,131.780,131.950,131.1000,131.86
05/29/15.05.2015__00:00__AM,130.280,131.450,129.9000,131.23
06/01/15.06.2015__00:00__AM,130.535,131.390,130.0500,131.20
06/02/15.06.2015__00:00__AM,129.960,130.655,129.3200,129.86
...,...,...,...,...
05/18/20.05.2020__00:00__AM,314.960,316.500,310.3241,313.17
05/19/20.05.2020__00:00__AM,313.140,318.520,313.0100,315.03
05/20/20.05.2020__00:00__AM,319.230,319.520,316.2000,316.68
05/21/20.05.2020__00:00__AM,316.850,320.890,315.8700,318.66
