In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from datetime import datetime

import time

chrome_options = Options()
driver = webdriver.Chrome(options = chrome_options)


__author__ = 'aituarov'


DATAFILES_DIR = 'C:\\DEV\\REMIT_files\\'

url = 'https://www.elia.be/en/grid-data/power-generation/planned-and-unplanned-outages'

def download_snapshots():
    driver.get(url)
    WebDriverWait(driver,1000).until(EC.visibility_of_all_elements_located((By.XPATH, '//div[@class="k-grid k-widget k-display-block"]')))
    time.sleep(2)
    
    table_divs = driver.find_elements_by_xpath('//div[@class="k-grid k-widget k-display-block"]')
    for table_id, table in enumerate(table_divs):
        snapshot_fname = str(table_id) + '__' + datetime.today().strftime('%Y%m%d') + '.html'
        with open(DATAFILES_DIR + snapshot_fname, 'w') as snapshot_file:
            snapshot_file.write(table.get_attribute('outerHTML'))
            print(snapshot_fname + " downloaded.")
            snapshot_file.close()
            
    driver.quit()
            

def main():
    download_snapshots()
    



if __name__ == '__main__':

    main()    


In [252]:
import os
import shutil
import numpy as np
import pandas as pd
from lxml import html

REMIT_files = 'C:\\DEV\\REMIT_files\\'
DATAFILES_DIR = 'C:\\DEV\\REMIT_files\\'

REMIT_EMPTY = 'C:\\DEV\\REMIT_EMPTY\\'
REMIT_UNEMPTY = 'C:\\DEV\\REMIT_UNEMPTY\\'

def convert_html_files_to_df(folder):
    data = {}

    if not os.listdir(folder):
        print("There's no files")
    else:
        for file_name in os.listdir(folder):
            if file_name.endswith('html'):
                file_type = int(file_name[0])
                data_table = pd.DataFrame()
                with open(folder + file_name, 'r') as snapshot_file:
                    html_content = snapshot_file.read()
                    page = html.fromstring(html_content)
                    header = [th.text for th in page.xpath('./div//tr/th')[1:]]

                    content_row = page.xpath('./div//tbody/tr')
                    for row in content_row:
                        row_data = [td.text for td in row.xpath('./td')]

                        data_table = data_table.append([row_data], ignore_index=True)

                    data_table.columns = header
                    data[file_type] = data_table
                    print("File with file_id " + str(file_type) + " from " + folder + " is converted to df")
                    snapshot_file.close()
                    
    return data


def compare_df(old_data, new_data):
    comparison = {}
    
    old_uniq_data = {}
    new_uniq_data = {}
    
    for key in new_data.keys():
        data = pd.DataFrame()
        
        if key not in old_data.keys():
                old_data[key] = pd.DataFrame(columns=new_data[key].columns)
        
        old_uniq_data[key]=pd.merge(old_data[key], new_data[key], indicator=True, how='outer').query('_merge=="left_only"').drop('_merge', axis=1).reset_index(drop=True)
        new_uniq_data[key]=pd.merge(old_data[key],new_data[key], indicator=True, how='outer').query('_merge=="right_only"').drop('_merge', axis=1).reset_index(drop=True)
        
        old_cnt = old_uniq_data[key].groupby(['Unit']).size().reset_index(name='counts')
        new_cnt = new_uniq_data[key].groupby(['Unit']).size().reset_index(name='counts')

        merged_df=old_cnt.merge(new_cnt, how='outer', on='Unit')
        
        comp_id = 1
        for index, row in merged_df.iterrows():
            df_old_match = old_uniq_data[key].loc[old_uniq_data[key]['Unit'] == row['Unit']]
            df_new_match = new_uniq_data[key].loc[new_uniq_data[key]['Unit'] == row['Unit']]

            if row['counts_x'] == row['counts_y'] == 1:
                df_stat = pd.DataFrame([['UPDATED', compid]])
                data = data.append(pd.concat([pd.concat([df_stat, df_old_match], axis=1), pd.concat([df_stat, df_new_match], axis=1)], axis=0))

            elif row['counts_x'] >= 1 and pd.isna(row['counts_y']):
                df_stat = pd.DataFrame([['REMOVED', comp_id]])
                data = data.append(pd.concat([df_stat, df_old_match], axis=1))
            
            elif pd.isna(row['counts_x']) and row['counts_y']>=1:
                df_stat = pd.DataFrame([['ADDED', comp_id]])
                data = data.append(pd.concat([df_stat, df_new_match], axis=1))
            
            else:
                df_stat = pd.DataFrame([[float('NaN'), float('NaN')]])
                data = data.append(pd.concat([df_stat, df_new_match], axis=1))

            comp_id+=1
        
        comparison[key] = data
        
    return comparison


def main():
    old_data = convert_html_files_to_df(REMIT_files)
    new_data = convert_html_files_to_df(REMIT_UNEMPTY)

    comparison = compare_df(old_data, new_data)
    for key in comparison.keys():
        print("Key is :" + str(key))
        print(comparison[key])
        
        if not comparison[key].empty:
            for file_name in os.listdir(REMIT_files):
                if file_name.startswith(str(key)):
                    os.remove(REMIT_files + file_name)
            
            for new_file_name in os.listdir(REMIT_UNEMPTY):
                if new_file_name.startswith(str(key)):
                    shutil.copyfile(REMIT_UNEMPTY + new_file_name, REMIT_files + new_file_name)
                    shutil.copyfile(REMIT_UNEMPTY + new_file_name, REMIT_files+"ARCHIVE\\" + new_file_name)
                    
            print("File with key " + str(key) + " downloaded to " + REMIT_files)
        
        else:
            for file_name in os.listdir(REMIT_UNEMPTY):
                if file_name.startswith(str(key)):
                    os.remove(REMIT_UNEMPTY + file_name)
                    print("File with key " + str(key) + " removed")
                    



if __name__ == '__main__':

    main()


File with file_id 0 from C:\DEV\REMIT_UNEMPTY\ is converted to df
File with file_id 1 from C:\DEV\REMIT_UNEMPTY\ is converted to df
Key is :0
       0    1                   Unit   Fuel Pmax Available Pmax Available   \
0  ADDED  1.0                    NaN    NaN            NaN             NaN   
2    NaN  NaN          DROGENBOS GT1     NG            150               0   
0  ADDED  2.0                    NaN    NaN            NaN             NaN   
1    NaN  NaN           DROGENBOS ST     NG            160               0   
0  ADDED  3.0  Zelzate 2 Knippegroen  Other            315               0   
3    NaN  NaN  Zelzate 2 Knippegroen  Other            315               0   

       Start Outage  (estimated) End       Last Updated                Reason  
0               NaN               NaN               NaN                   NaN  
2  29/07/2020 08:10  29/08/2020 08:10  29/07/2020 08:11              Overhaul  
0               NaN               NaN               NaN                

In [None]:
import os
import pandas as pd
from lxml import html

REMIT_files = 'C:\\DEV\\REMIT_files\\'
DATAFILES_DIR = 'C:\\DEV\\REMIT_files\\'

REMIT_EMPTY = 'C:\\DEV\\REMIT_EMPTY\\'
REMIT_UNEMPTY = 'C:\\DEV\\REMIT_UNEMPTY\\'

def convert_html_files_to_df(folder):
    data = {}

    if not os.listdir(folder):
        print("There's no files")
    else:
        for file in os.listdir(folder):
            if file.endswith('html'):
                file_type = int(file[0])
                data_table = pd.DataFrame()
                with open(file, 'r') as snapshot_file:
                    html_content = snapshot_file.read()
                    page = html.fromstring(html_content)
                    header = [th.text for th in page.xpath('./div//tr/th')[1:]]
                    content_row = page.xpath('./div//tbody/tr')
                    for row in content_row:
                        row_data = [td.text for td in row.xpath('./td')]
                        data_table = data_table.append([row_data], ignore_index=True)

                    data_table.columns = header
                    data[file_type] = data_table
                    print("File with file_id " + str(file_type) + " is converted to df")
                    snapshot_file.close()
                    
    return data


def main():
    old_data = convert_html_files_to_df(REMIT_files)
    print(old_data)
    new_data = convert_html_files_to_df(REMIT_UNEMPTY)
    print(new_data)

In [222]:
# a = pd.DataFrame([[1, 2, 3, 4], [1, 2, 3, 4]])
# b = pd.DataFrame([34, 45])
# c = a.loc[a[0] == 1]
# for row in c[:]:
#     print(row)
    
a = pd.DataFrame([[1, 2, 3, 4], [1, 2, 3, 4]])
b = pd.DataFrame([[5, 6]])
c = pd.DataFrame([[7, 8]])
e = pd.DataFrame([[5, 6]])
d = pd.DataFrame()

# print(a)
# print(b)

# b = b.loc[b.index.repeat(2)].reset_index(drop=True)
# print(b)

# print(pd.concat([a, b], axis=1))\

d = d.append(pd.concat([pd.concat([b, c], axis=1), pd.concat([b, e], axis=1)], axis=0))
print(d)

   0  1  0  1
0  5  6  7  8
0  5  6  5  6


In [246]:
dicti = {}
dicti[1] = 'sadfasdf'
if 2 in dicti.keys():
    print('OK')