# Indian Energy data parser

This program is intended to parse Electricity generation history for India.

Sources:

* https://carbontracker.in/ - history since 2019
* http://meritindia.in/ - real-time data

Data is in JSON format.

## Parse data from carbontracker.in

The following code parses raw data loaded from carbontracker.in and saves it as a csv.

In [63]:
import json
import pandas as pd
from pathlib import Path
from typing import List, Dict

# Data directory, relative to the notebook's working directory; update it if the code moves.
# Written without the Windows-only ".\" prefix so the same relative path also resolves
# correctly on Linux/macOS (on Windows both spellings denote the same directory).
ROOT_DIR: Path = Path("India Data")
# CSV file that receives the concatenated history.
OUTPUT_FILE: Path = ROOT_DIR / 'carbontracker_india.csv'

def read_json(path: Path) -> pd.DataFrame:
    """Load the 'timeseries_values' section of one downloaded file.

    The file content is decoded with 'unicode_escape' and stripped of its
    surrounding double quotes before being parsed as JSON, because the raw
    API response is saved as an escaped, quoted JSON string.

    Args:
        path: File to read.

    Returns:
        A DataFrame built from the 'timeseries_values' entry, or an empty
        DataFrame when the file cannot be decoded/parsed or does not contain
        that entry (a message is printed and the file is ignored).
    """
    print(f'Parsing {path}')
    try:
        raw_text = path.read_bytes().decode('unicode_escape').strip('"')
        json_dict: Dict = json.loads(raw_text)
    except (json.JSONDecodeError, UnicodeDecodeError) as e:
        print(f' Exception reading file {path}: {e}. Ignoring this file')
        return pd.DataFrame()

    # A syntactically valid payload without the expected section (e.g. an error
    # message or `null`) is ignored like any other unusable file, instead of
    # raising KeyError/TypeError and aborting the whole concatenation.
    if not isinstance(json_dict, dict) or 'timeseries_values' not in json_dict:
        print(f" No 'timeseries_values' in file {path}. Ignoring this file")
        return pd.DataFrame()

    return pd.DataFrame(json_dict['timeseries_values'])


# Parse every downloaded file in file-name order and stack the results into one frame.
json_files = sorted(ROOT_DIR.glob('*.json'))
frames = [read_json(json_path) for json_path in json_files]
df: pd.DataFrame = pd.concat(frames)

# Quick visual check of the first rows.
display(df.head())

print(f"Writing results to {OUTPUT_FILE}")

# The positional index carries no information, so it is left out of the CSV.
df.to_csv(OUTPUT_FILE, index=False)


Parsing India Data\India_2019-01-01.json
Parsing India Data\India_2019-01-15.json
Parsing India Data\India_2019-02-01.json
Parsing India Data\India_2019-02-15.json
Parsing India Data\India_2019-03-01.json
Parsing India Data\India_2019-03-15.json
Parsing India Data\India_2019-04-01.json
Parsing India Data\India_2019-04-15.json
Parsing India Data\India_2019-05-01.json
Parsing India Data\India_2019-05-15.json
Parsing India Data\India_2019-06-01.json
Parsing India Data\India_2019-06-15.json
Parsing India Data\India_2019-07-01.json
Parsing India Data\India_2019-07-15.json
Parsing India Data\India_2019-08-01.json
Parsing India Data\India_2019-08-15.json
Parsing India Data\India_2019-09-01.json
Parsing India Data\India_2019-09-15.json
Parsing India Data\India_2019-10-01.json
Parsing India Data\India_2019-10-15.json
Parsing India Data\India_2019-11-01.json
Parsing India Data\India_2019-11-15.json
Parsing India Data\India_2019-12-01.json
Parsing India Data\India_2019-12-15.json
Parsing India Da

Unnamed: 0,timestamps,thermal_generation,gas_generation,g_co2_per_kwh,hydro_generation,nuclear_generation,renewable_generation,tons_co2,total_generation,tons_co2_per_mwh,demand_met,net_demand
0,2019-01-01 00:00:00,102238.5,4977.0,834.747057,6418.5,3659.0,4733.0,8488.4037,122026.0,0.834747,121651.0,117293.0
1,2019-01-01 00:05:00,101634.0,5058.0,836.056164,6087.0,3674.0,4721.0,8442.3558,121174.0,0.836056,120765.0,116453.0
2,2019-01-01 00:10:00,101529.0,5113.0,836.776475,5947.0,3667.0,4721.0,8435.8923,120977.0,0.836776,120395.0,116256.0
3,2019-01-01 00:15:00,101328.0,5072.5,835.714962,6083.0,3657.5,4733.5,8418.05235,120874.5,0.835715,120422.5,116141.0
4,2019-01-01 00:20:00,100906.0,5021.0,834.991627,6129.0,3646.0,4757.0,8381.8547,120459.0,0.834992,120180.0,115702.0


Writing results to India Data\carbontracker_india.csv


In [60]:
df.plot(x='timestamps', y='thermal_generation')

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

## Download raw history data from carbontracker.in

The following code downloads the raw data again from the API, in half-month periods.


In [57]:
from datetime import date, datetime
from dateutil.relativedelta import *
from itertools import product
from time import sleep
from random import seed, randint

from pathlib import Path

import requests

# Years (2019 and 2020) and months (1 to 12) to request.
YEARS = range(2019, 2021)
MONTHS = range(1, 13)
# API endpoint queried for the raw history.
URL = "https://32u36xakx6.execute-api.us-east-2.amazonaws.com/v4/get-merit-data"
# Upper bound, in seconds, of the random pause taken between requests.
DELAY = 10

# Directory receiving the raw responses. Written without the Windows-only ".\" prefix
# so the same relative path also resolves correctly on Linux/macOS.
ROOT_DIR: Path = Path("India Data")


def _dt_to_str(_date):
    return date.strftime(_date, '%Y-%m-%d %H:%M:%S')

def start_datetime(year, month, day):
    """Return the first instant (00:00:00.000000) of the given day."""
    # The time fields of datetime default to zero, i.e. midnight.
    return datetime(year, month, day)

def end_datetime(year, month, day):
    """Return the last representable instant (23:59:59.999999) of the given day."""
    midnight = datetime(year, month, day)
    return midnight.replace(hour=23, minute=59, second=59, microsecond=999999)

def get_periods(years, months):
    """Yield half-month request periods for every (year, month) combination.

    Each month is split in two: the 1st to the 14th, and the 15th to the last
    day of the month. Iteration stops at the first month of the current year
    that lies after the current month, and a period is only produced when its
    start date is strictly before today.

    Yields:
        dict with keys "start" and "end", both formatted by _dt_to_str.
    """
    today = date.today()

    for year, month in product(years, months):
        if year == today.year and month > today.month:
            break

        # Last day of the month: day 1, plus one month, minus one day.
        month_end = end_datetime(year, month, 1) + relativedelta(months=1) + relativedelta(days=-1)
        half_months = (
            (start_datetime(year, month, 1), end_datetime(year, month, 14)),
            (start_datetime(year, month, 15), month_end),
        )

        for period_start, period_end in half_months:
            if period_start.date() < today:
                yield {"start": _dt_to_str(period_start), "end": _dt_to_str(period_end)}

                
def get_data(start_time, end_time):
    """Request the data between *start_time* and *end_time* from URL.

    Returns:
        The response body as text, or None when the response is not OK.
    """
    params = {
        "start_time": start_time,
        "end_time": end_time,
        "corrected_values": "false",
    }

    # An empty request body is sent explicitly along with the query string.
    response = requests.request("GET", URL, data="", params=params)

    if not response.ok:
        return None
    return response.text

def write_file(start_date, text):
    """Save one raw API response under ROOT_DIR, named after the period start.

    Args:
        start_date: Period start formatted as 'YYYY-MM-DD HH:MM:SS'; the
            trailing time part is cut off to build the file name.
        text: Response body to save, or None when the download failed
            (get_data returns None for a non-OK response).
    """
    if text is None:
        # Path.write_text(None) raises TypeError, which would abort the whole
        # download loop on the first failed request; skip the period instead.
        print(f' No data received for period starting {start_date}, nothing written')
        return

    file_path = ROOT_DIR / f"India_{start_date[:-9].strip('-')}.json"
    print(f' Writing API response to {file_path}')
    file_path.write_text(text)

# Seed the random generator used for the pauses between requests.
seed()

for request_no, period in enumerate(get_periods(YEARS, MONTHS)):
    print(period)
    # From the third request on, pause for a random 0..DELAY seconds before calling the API.
    if request_no >= 2:
        pause = randint(0, DELAY)
        print(f"Waiting {pause} s")
        sleep(pause)

    raw_response = get_data(period['start'], period['end'])
    write_file(period['start'], raw_response)
        
            

{'start': '2019-01-01 00:00:00', 'end': '2019-01-14 23:59:59'}
 Writing API response to India Data\India_2019-01-01.json
{'start': '2019-01-15 00:00:00', 'end': '2019-01-31 23:59:59'}
 Writing API response to India Data\India_2019-01-15.json
{'start': '2019-02-01 00:00:00', 'end': '2019-02-14 23:59:59'}
Waiting 8 s
 Writing API response to India Data\India_2019-02-01.json
{'start': '2019-02-15 00:00:00', 'end': '2019-02-28 23:59:59'}
Waiting 0 s
 Writing API response to India Data\India_2019-02-15.json
{'start': '2019-03-01 00:00:00', 'end': '2019-03-14 23:59:59'}
Waiting 8 s
 Writing API response to India Data\India_2019-03-01.json
{'start': '2019-03-15 00:00:00', 'end': '2019-03-31 23:59:59'}
Waiting 7 s
 Writing API response to India Data\India_2019-03-15.json
{'start': '2019-04-01 00:00:00', 'end': '2019-04-14 23:59:59'}
Waiting 9 s
 Writing API response to India Data\India_2019-04-01.json
{'start': '2019-04-15 00:00:00', 'end': '2019-04-30 23:59:59'}
Waiting 7 s
 Writing API respo

## Test the API
A small snippet to test data collection from the carbontracker.in API.

In [67]:
import requests

# Endpoint under test.
url = "https://32u36xakx6.execute-api.us-east-2.amazonaws.com/v4/get-merit-data"

# Query parameters for a single request.
querystring = {
    "start_time": "2019-01-01 00:00:00",
    "end_time": "2019-02-01 23:59:00",
    "corrected_values": "false",
}

# Empty request body.
payload = ""
response = requests.request(method="GET", url=url, params=querystring, data=payload)

# Dump the raw body to inspect the format returned by the API.
print(response.text)

NameError: name 'bla' is not defined

## Get real-time data for India

The next cells get real-time data from https://meritindia.in.


### First attempt with Pandas
`pd.read_html` returns a list of tables; the one we want is at index 4.

In [8]:
import pandas as pd

# Parse every HTML table found on the page into a list of DataFrames.
df_list = pd.read_html('http://meritindia.in')
# Show each table with its position in the list so the one holding the
# generation figures can be identified by eye.
for i, df in enumerate(df_list):
    print(f'position: {i}')
    display(df)

position: 0


Unnamed: 0,0,1,2
0,,MERIT Merit Order Despatch of Electricity fo...,


position: 1


Unnamed: 0,0
0,POWER PROCUREMENT ON
1,ALL INDIA POWER POSITION (MW) [CURRENT] DEMAN...
2,* Value indicated in map are marginal cost or ...


position: 2


Unnamed: 0,0,1,2
0,,POWER PROCUREMENT ON,


position: 3


Unnamed: 0,0,1,2
0,,ALL INDIA POWER POSITION (MW) [CURRENT],
1,,,


position: 4


Unnamed: 0,0,1,2,3,4,5
0,"DEMANDMET 1,27,233 MW","THERMAL GENERATION 89,803 MW","GAS GENERATION 6,113 MW","NUCLEAR GENERATION 4,625 MW","HYDRO GENERATION 24,000 MW","RENEWABLE GENERATION 4,264 MW"


position: 5


Unnamed: 0,0,1,2,3
0,> 6.00 5.00 4.00 3.50 3.00 2.40 < 1.00 ...,,Displaying Data For Marginal Cost Auto Mode,


position: 6


Unnamed: 0,0,1
0,,> 6.00
1,,
2,,
3,,
4,,
5,,5.00
6,,
7,,
8,,
9,,4.00


position: 7


Unnamed: 0,0
0,* Value indicated in map are marginal cost or ...
1,Visitors


In [6]:
display(df_list[4])

Unnamed: 0,0,1,2,3,4,5
0,"DEMANDMET 1,17,810 MW","THERMAL GENERATION 77,689 MW","GAS GENERATION 6,246 MW","NUCLEAR GENERATION 4,714 MW","HYDRO GENERATION 13,512 MW","RENEWABLE GENERATION 16,080 MW"


In [101]:
from datetime import datetime

# Work on a copy so that renaming the columns does not alter the table kept in df_list.
df = df_list[4].copy()
df.columns = ['demandmet', 'thermal generation', 'gas generation', 'nuclear generation', 'hydro generation', 'renewable generation']
# One row per indicator instead of one column per indicator.
df = df.melt(var_name='indicator', value_name='value')
# Keep the first group of digits and thousands separators of each cell, then drop the
# separators and convert to int. expand=False yields a Series, so Series.map can be used
# in place of the deprecated DataFrame.applymap.
df['value'] = df['value'].str.extract(r'([\d,]+)', expand=False).map(lambda x: int(x.replace(',', '')))
df['unit'] = 'MW'
# Time at which the values were processed on this machine.
df['time'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
df

Unnamed: 0,indicator,value,unit,time
0,demandmet,117810,MW,2020-04-29 14:16:28
1,thermal generation,77689,MW,2020-04-29 14:16:28
2,gas generation,6246,MW,2020-04-29 14:16:28
3,nuclear generation,4714,MW,2020-04-29 14:16:28
4,hydro generation,13512,MW,2020-04-29 14:16:28
5,renewable generation,16080,MW,2020-04-29 14:16:28


In [129]:
# df2 refers to the same object as df_list[4] (no copy is made), so the
# column renaming below also applies to the table stored in df_list.
df2 = df_list[4]
df2.columns = ['demand_met', 'thermal_generation', 'gas_generation', 'nuclear_generation', 'hydro_generation', 'renewable_generation']


In [130]:
df2

Unnamed: 0,demand_met,thermal_generation,gas_generation,nuclear_generation,hydro_generation,renewable_generation
0,"DEMANDMET 1,17,810 MW","THERMAL GENERATION 77,689 MW","GAS GENERATION 6,246 MW","NUCLEAR GENERATION 4,714 MW","HYDRO GENERATION 13,512 MW","RENEWABLE GENERATION 16,080 MW"


In [131]:
import re

# State before the conversion.
display(df2)
df2.info()
# Convert every cell to an int built from its first group of digits and thousands
# separators. apply + Series.map is the element-wise equivalent of DataFrame.applymap,
# which is deprecated in recent pandas versions.
df2 = df2.apply(lambda column: column.map(lambda x: int(re.search(r'([\d,]+)', x).group(0).replace(',', ''))))
df2['timestamps'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
# Move the last column (the 'timestamps' column just added) to the front.
df2 = df2[df2.columns[-1:].append(df2.columns[:-1])]
# State after the conversion.
display(df2)
df2.info()

Unnamed: 0,demand_met,thermal_generation,gas_generation,nuclear_generation,hydro_generation,renewable_generation
0,"DEMANDMET 1,17,810 MW","THERMAL GENERATION 77,689 MW","GAS GENERATION 6,246 MW","NUCLEAR GENERATION 4,714 MW","HYDRO GENERATION 13,512 MW","RENEWABLE GENERATION 16,080 MW"


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   demand_met            1 non-null      object
 1   thermal_generation    1 non-null      object
 2   gas_generation        1 non-null      object
 3   nuclear_generation    1 non-null      object
 4   hydro_generation      1 non-null      object
 5   renewable_generation  1 non-null      object
dtypes: object(6)
memory usage: 176.0+ bytes


Unnamed: 0,timestamps,demand_met,thermal_generation,gas_generation,nuclear_generation,hydro_generation,renewable_generation
0,2020-04-29 14:27:20,117810,77689,6246,4714,13512,16080


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 7 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   timestamps            1 non-null      object
 1   demand_met            1 non-null      int64 
 2   thermal_generation    1 non-null      int64 
 3   gas_generation        1 non-null      int64 
 4   nuclear_generation    1 non-null      int64 
 5   hydro_generation      1 non-null      int64 
 6   renewable_generation  1 non-null      int64 
dtypes: int64(6), object(1)
memory usage: 184.0+ bytes


In [125]:
df2 = df2[df2.columns[-1:].append(df2.columns[:-1])]

Unnamed: 0,renewable_generation,timestamps,demand_met,thermal_generation,gas_generation,nuclear_generation,hydro_generation
0,2020-04-29 14:25:03,117810,77689,6246,4714,13512,16080


### The most reliable approach: using beautiful soup

Using Beautiful Soup to get directly to the tables before passing it to pandas.

In [3]:
import io
import re
import requests
from datetime import datetime
from bs4 import BeautifulSoup
import pandas as pd

URL = 'http://meritindia.in'

response = requests.get(URL)
# Fail right here with an explicit HTTP error when the download did not succeed,
# rather than continuing with no table and crashing later on an unrelated AttributeError.
response.raise_for_status()

# Isolate the 'AllIndiaMap' element first, then let pandas parse only its tables;
# the second table found inside it is the one that is used.
soup = BeautifulSoup(response.content, 'html.parser')
text = io.StringIO(str(soup.find_all(id='AllIndiaMap')[0]))
df = pd.read_html(text)[1]

df.columns = ['demand_met', 'thermal_generation', 'gas_generation', 'nuclear_generation', 'hydro_generation', 'renewable_generation']
# Convert every cell to an int built from its first group of digits and thousands
# separators. apply + Series.map is the element-wise equivalent of DataFrame.applymap,
# which is deprecated in recent pandas versions.
df = df.apply(lambda column: column.map(lambda x: int(re.search(r'([\d,]+)', x).group(0).replace(',', ''))))

# Timestamp truncated to the minute.
dt_now = datetime.now().replace(second=0, microsecond=0)

df['timestamps'] = dt_now.strftime('%Y-%m-%d %H:%M:%S')
# Move the last column (the 'timestamps' column just added) to the front.
df = df[df.columns[-1:].append(df.columns[:-1])]

In [4]:
df

Unnamed: 0,timestamps,demand_met,thermal_generation,gas_generation,nuclear_generation,hydro_generation,renewable_generation
0,2020-04-30 17:14:00,128707,90917,6659,4032,22053,6273


We could also change the format to get all indicators as rows:

In [20]:
# Long format: 'timestamps' is kept as the identifier and every other column
# becomes one (indicator, value) row.
df2 = df.melt(id_vars=['timestamps'],var_name='indicator', value_name='value')
df2['unit'] = 'MW'
df2

Unnamed: 0,timestamps,indicator,value,unit
0,2020-04-29 16:37:38,demand_met,126894,MW
1,2020-04-29 16:37:38,thermal_generation,89871,MW
2,2020-04-29 16:37:38,gas_generation,6093,MW
3,2020-04-29 16:37:38,nuclear_generation,4614,MW
4,2020-04-29 16:37:38,hydro_generation,23686,MW
5,2020-04-29 16:37:38,renewable_generation,4151,MW


In [25]:
from pathlib import Path
from datetime import datetime

# Imported here so that the cell runs on its own; pytz was otherwise only imported by a later cell.
import pytz

# Raw string: in a normal string literal "\i" is an invalid escape sequence.
filename = Path(r'blablabla\in_meritindia_202004300105.html')
# The part of the file stem after the last underscore is a YYYYMMDDHHMM timestamp.
ts = datetime.strptime(filename.stem.split('_')[-1], '%Y%m%d%H%M')
india_tz = pytz.timezone('Asia/Kolkata')
# Attach the Indian time zone to the naive timestamp without shifting the clock time.
date = india_tz.localize(ts)
date





datetime.datetime(2020, 4, 30, 1, 5, tzinfo=<DstTzInfo 'Asia/Kolkata' IST+5:30:00 STD>)

In [16]:
# thing to take into account: Indian time zone is GMT+5:30
import pytz
from datetime import datetime

print(' '.join(pytz.country_timezones['in']))
india_tz = pytz.timezone('Asia/Kolkata')
# datetime.now(india_tz) converts the current instant to Indian time.
# localize(datetime.now()) would only label this machine's local wall-clock time
# with the IST offset, which is wrong whenever the machine is not itself running on IST.
date = datetime.now(india_tz)
date.strftime("'%Y%m%d%H%M%S %Z%z'")

Asia/Kolkata


"'20200430195451 IST+0530'"

In [21]:
india_tz = pytz.timezone('Asia/Kolkata')
# Current time expressed in the Indian time zone.
in_time = datetime.now(india_tz)
# Print the value computed above; the name `sa_time` used previously is not defined in this notebook.
print(in_time.strftime('%Y-%m-%d_%H-%M-%S'))

2020-04-30_23-59-09


## Putting into production...

In [1]:
%cd ..

C:\Users\ROSA_L\PycharmProjects\scraper


In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import logging
import sys

# Send every log record (DEBUG and above) of every logger to the notebook output.
formatter = logging.Formatter(fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

handler = logging.StreamHandler(stream=sys.stdout)
handler.setLevel(logging.DEBUG)
handler.setFormatter(formatter)

root = logging.getLogger()
root.setLevel(logging.DEBUG)
root.addHandler(handler)

In [24]:
from scraper.jobs.in_meritindia.light_job import Job

# code for loading history
# india = Job(full_load=True)
# india.run(download=False)

# code for running regularly
india = Job()
india.run()

2020-05-29 15:55:06,294 - scraper.jobs.in_meritindia.light_job - DEBUG - full_load: None
2020-05-29 15:55:06,296 - scraper.jobs.in_meritindia.light_job - INFO - Downloading data from http://meritindia.in
2020-05-29 15:55:06,302 - urllib3.connectionpool - DEBUG - Starting new HTTP connection (1): proxy.iea.org:8080
2020-05-29 15:55:07,490 - urllib3.connectionpool - DEBUG - http://proxy.iea.org:8080 "GET http://meritindia.in/ HTTP/1.1" 200 415106
2020-05-29 15:55:10,593 - scraper.jobs.in_meritindia.light_job - INFO - Writing site content to C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia\2020\05\29\in_meritindia_202005291925.html
2020-05-29 15:55:10,602 - scraper.jobs.in_meritindia.light_job - INFO - Parsing the content of C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia\2020\05\29\in_meritindia_202005291925.html
2020-05-29 15:55:11,067 - scraper.jobs.in_meritindia.light_job - INFO - Writing to database.
2020-05-29 15:55:11,069 - scraper.jobs.in_meritindia.

In [77]:
from scraper.jobs.in_meritindia.light_job import *
import pandas as pd
from datetime import datetime

HISTORY_FILE = FILE_STORE_PATH / f'{FILE_PREFIX}_history.csv'

print(HISTORY_FILE)
# The timestamps use the "YYYY-MM-DD HH:MM:SS" layout, which pandas parses natively,
# so no custom per-value strptime callback is needed (the date_parser parameter
# is deprecated in recent pandas versions).
df = pd.read_csv(HISTORY_FILE, parse_dates=['timestamps'])
df.info()

C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_history.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 137671 entries, 0 to 137670
Data columns (total 12 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   timestamps            137671 non-null  datetime64[ns]
 1   thermal_generation    137671 non-null  float64       
 2   gas_generation        137671 non-null  float64       
 3   g_co2_per_kwh         137671 non-null  float64       
 4   hydro_generation      137671 non-null  float64       
 5   nuclear_generation    137671 non-null  float64       
 6   renewable_generation  137671 non-null  float64       
 7   tons_co2              137671 non-null  float64       
 8   total_generation      137671 non-null  float64       
 9   tons_co2_per_mwh      137671 non-null  float64       
 10  demand_met            137671 non-null  float64       
 11  net_demand            137671 non-null  fl

In [78]:
df.head()

Unnamed: 0,timestamps,thermal_generation,gas_generation,g_co2_per_kwh,hydro_generation,nuclear_generation,renewable_generation,tons_co2,total_generation,tons_co2_per_mwh,demand_met,net_demand
0,2019-01-01 00:00:00,102238.5,4977.0,834.747057,6418.5,3659.0,4733.0,8488.4037,122026.0,0.834747,121651.0,117293.0
1,2019-01-01 00:05:00,101634.0,5058.0,836.056164,6087.0,3674.0,4721.0,8442.3558,121174.0,0.836056,120765.0,116453.0
2,2019-01-01 00:10:00,101529.0,5113.0,836.776475,5947.0,3667.0,4721.0,8435.8923,120977.0,0.836776,120395.0,116256.0
3,2019-01-01 00:15:00,101328.0,5072.5,835.714962,6083.0,3657.5,4733.5,8418.05235,120874.5,0.835715,120422.5,116141.0
4,2019-01-01 00:20:00,100906.0,5021.0,834.991627,6129.0,3646.0,4757.0,8381.8547,120459.0,0.834992,120180.0,115702.0


In [65]:
from scraper.jobs.in_meritindia.light_job import *

india.parse_meritindia_html(FILE_STORE_PATH / 'in_meritindia_202005010033.html')

2020-05-01 14:54:31,585 - scraper.jobs.in_meritindia.light_job - INFO - Parsing the content of C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_202005010033.html


Unnamed: 0,timestamps,demand_met,thermal_generation,gas_generation,nuclear_generation,hydro_generation,renewable_generation
0,2020-05-01 00:33:00,127917,90561,7239,4047,19813,7232


In [17]:
from scraper.settings import FILE_STORE_PATH

for file in FILE_STORE_PATH.glob('in_meritindia_*.html'):
    print(file)

C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_202005010033.html
C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_202005010039.html


In [11]:
# Columns handled by the merge; 'timestamps' is the key, the others are values.
COLUMNS_TO_KEEP = [
    'timestamps',
    'demand_met',
    'thermal_generation',
    'gas_generation',
    'nuclear_generation',
    'hydro_generation',
    'renewable_generation',
]

# One "target differs from source" predicate per value column (the key column is skipped).
merge_difference = []
for column in COLUMNS_TO_KEEP:
    if column == 'timestamps':
        continue
    merge_difference.append(f'target.{column} <> source.{column}')

In [12]:
merge_difference

['target.demand_met <> source.demand_met',
 'target.thermal_generation <> source.thermal_generation',
 'target.gas_generation <> source.gas_generation',
 'target.nuclear_generation <> source.nuclear_generation',
 'target.hydro_generation <> source.hydro_generation',
 'target.renewable_generation <> source.renewable_generation']

In [13]:
' OR '.join(merge_difference)

'target.demand_met <> source.demand_met OR target.thermal_generation <> source.thermal_generation OR target.gas_generation <> source.gas_generation OR target.nuclear_generation <> source.nuclear_generation OR target.hydro_generation <> source.hydro_generation OR target.renewable_generation <> source.renewable_generation'

In [43]:
FINAL_TABLE_NAME = 'main.meritindia_data'
TEMP_TABLE_NAME = '#meritindia_tomerge'
# "target.col = source.col" assignments for every value column (the key column is skipped).
merge_update = [f'target.{col} = source.{col}' for col in COLUMNS_TO_KEEP if col != 'timestamps']
# Source-side column references used as the inserted values.
merge_insert = [f'source.{col}' for col in COLUMNS_TO_KEEP]

print(merge_insert)

# Upsert keyed on timestamps: update a matched row only when at least one value differs,
# insert rows that have no match. The INSERT branch needs its VALUES list and the
# statement needs a terminating semicolon to be complete.
query = (
    f'MERGE {FINAL_TABLE_NAME} target \n'
    f'USING dbo.{TEMP_TABLE_NAME} as source \n'
    'ON (target.timestamps = source.timestamps) \n'
    f"WHEN MATCHED AND ({' OR '.join(merge_difference)})\n"
    f"THEN UPDATE SET {', '.join(merge_update)}\n"
    'WHEN NOT MATCHED \n'
    f"THEN INSERT ({', '.join(COLUMNS_TO_KEEP)}) \n"
    f"VALUES ({', '.join(merge_insert)});"
)

print(query)

['source.timestamps', 'source.demand_met', 'source.thermal_generation', 'source.gas_generation', 'source.nuclear_generation', 'source.hydro_generation', 'source.renewable_generation']
MERGE main.meritindia_data target 
USING dbo.#meritindia_tomerge as source 
ON (target.timestamps = source.timestamps) 
WHEN MATCHED AND (target.demand_met <> source.demand_met OR target.thermal_generation <> source.thermal_generation OR target.gas_generation <> source.gas_generation OR target.nuclear_generation <> source.nuclear_generation OR target.hydro_generation <> source.hydro_generation OR target.renewable_generation <> source.renewable_generation)
THEN UPDATE SET target.demand_met = source.demand_met, target.thermal_generation = source.thermal_generation, target.gas_generation = source.gas_generation, target.nuclear_generation = source.nuclear_generation, target.hydro_generation = source.hydro_generation, target.renewable_generation = source.renewable_generation
WHEN NOT MATCHED 
THEN INSERT (time

# Fix history
Move files to new directory structure.

In [23]:
import pathlib
from datetime import datetime
from scraper.settings import FILE_STORE_PATH
import pandas as pd

FILE_PREFIX = "in_meritindia"
ROOT_DIR = FILE_STORE_PATH / FILE_PREFIX
HISTORY_FILE = ROOT_DIR / f'{FILE_PREFIX}_history.csv'

# First day whose files are reorganised.
START_DATE = datetime(2020, 5, 1)

# The destination directory must exist before anything is moved into it;
# Path.replace does not create missing parent directories.
ROOT_DIR.mkdir(parents=True, exist_ok=True)

# move history file
for f in FILE_STORE_PATH.glob(f'{FILE_PREFIX}_history.csv'):
    print(f'{f} -> {HISTORY_FILE}')
    f.replace(HISTORY_FILE)

# for each day since the start date, create directory and move files in
today = datetime.today()
for dt in pd.date_range(start=START_DATE, end=today).to_pydatetime():
    # Daily directory: <ROOT_DIR>/<year>/<month>/<day>
    p = ROOT_DIR / dt.strftime('%Y') / dt.strftime('%m') / dt.strftime('%d')
    print(p)
    if not p.exists():
        print(f'Creating {p}')
        p.mkdir(parents=True, exist_ok=True)

    nb_files = 0
    # Files of that day carry the date as YYYYMMDD right after the prefix.
    for f in FILE_STORE_PATH.glob(f"{FILE_PREFIX}_{dt.strftime('%Y%m%d')}*.html"):
        new_path = p / f.name
        print(f"{f} -> {new_path}")
        f.replace(new_path)
        nb_files += 1

    print(f'Number of files moved to {p}: {nb_files}')

C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia\2020\05\01
C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_202005010033.html -> C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia\2020\05\01\in_meritindia_202005010033.html
C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_202005010039.html -> C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia\2020\05\01\in_meritindia_202005010039.html
C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_202005010103.html -> C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia\2020\05\01\in_meritindia_202005010103.html
C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_202005011643.html -> C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia\2020\05\01\in_meritindia_202005011643.html
C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia_202005011645.html -> C:\Users\ROSA_L\PycharmProjects\scraper\filestore\in_meritindia\2020\05\