# In[1]:
# db_utils

import json
import pandas as pd
import traceback
import sqlalchemy
import os
import pyodbc
import sys
# import pymssql
import numpy as np

from MySQLdb._exceptions import OperationalError
from sqlalchemy import create_engine, exc,event
from urllib.parse import quote_plus

from time import time

# Optional ODBC connection-string prefix. `DB.connect` appends the database
# name to it (`connection_string + self.DATABASE`); when it is None the
# connection is built from the DB instance attributes instead.
# Example (credentials redacted -- never commit real ones):
# connection_string = "DRIVER={ODBC Driver 17 for SQL Server};SERVER=<host>;UID=<user>;PWD=<password>;Trusted Connection=yes;DATABASE="
connection_string = None
import logging
   
# logging = Logging()

class DB(object):
    def __init__(self, database, host='127.0.0.1', user='root', password='', port='3306', tenant_id=None):
        """
        Initialization of databse object.

        Args:
            databse (str): The database to connect to.
            host (str): Host IP address. For dockerized app, it is the name of
                the service set in the compose file.
            user (str): Username of MySQL server. (default = 'root')
            password (str): Password of MySQL server. For dockerized app, the
                password is set in the compose file. (default = '')
            port (str): Port number for MySQL. For dockerized app, the port that
                is mapped in the compose file. (default = '3306')
        """

        if host in ["common_db","extraction_db", "queue_db", "template_db", "table_db", "stats_db", "business_rules_db", "reports_db"]:
            self.HOST = os.environ['HOST_IP']
            self.USER = os.environ['LOCAL_DB_USER']
            self.PASSWORD = os.environ['LOCAL_DB_PASSWORD']
            self.PORT = os.environ['LOCAL_DB_PORT']
            self.DATABASE = f'{tenant_id}_{database}' if tenant_id is not None and tenant_id else database
        else:
            self.HOST = os.environ['HOST_IP']
            self.USER = os.environ['LOCAL_DB_USER']
            self.PASSWORD = os.environ['LOCAL_DB_PASSWORD']
            self.PORT = os.environ['LOCAL_DB_PORT']
            self.DATABASE = f'{tenant_id}_{database}' if tenant_id is not None and tenant_id else database
       
        logging.info(f'Host: {self.HOST}')
        logging.info(f'User: {self.USER}')
        logging.info(f'Password: {self.PASSWORD}')
        logging.info(f'Port: {self.PORT}')
        logging.info(f'Database: {self.DATABASE}')
        # self.connect()
    def connect(self, max_retry=5):
        """
        Open a pyodbc connection to ``self.DATABASE``, retrying on failure.

        When the module-level ``connection_string`` is set, the database name
        is appended to it. Otherwise the connection string is assembled from
        ``self.HOST``, ``self.USER`` and ``self.PASSWORD``.

        The connection is stored on ``self.engine`` because the legacy
        fallback code in this class reads that attribute after calling
        ``connect()``.

        Args:
            max_retry (int): Maximum number of connection attempts.
                (default = 5)

        Returns:
            The open pyodbc connection, or None when every attempt failed.
        """
        if connection_string:
            odbc_string = connection_string + self.DATABASE
        else:
            odbc_string = (
                f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={self.HOST};'
                f'UID={self.USER};PWD={self.PASSWORD};'
                f'Trusted Connection=yes;DATABASE={self.DATABASE}'
            )

        logging.debug(f'Making connection to {self.DATABASE}...')
        for attempt in range(1, max_retry + 1):
            try:
                self.engine = pyodbc.connect(odbc_string)
            except pyodbc.Error as e:
                logging.warning(f'Connection failed. Retrying... ({attempt}) [{e}]')
            else:
                logging.info(f'Connection established succesfully to {self.DATABASE}!')
                return self.engine

        logging.error(f'Could not connect to {self.DATABASE} after {max_retry} attempts.')
        return None


    def convert_to_mssql(self, query):
        inds = [i for i in range(len(query)) if query[i] == '`']
        for pos, ind in enumerate(inds):
            if pos % 2 == 0:
                query = query[:ind] + '[' + query[ind+1:]
            else:
                query = query[:ind] + ']' + query[ind + 1:]
       
        query = query.replace('%s', '?')

        return query

    def execute(self, query, database=None, index_col='id', **kwargs):
        """
        Run a query against ``self.DATABASE`` over a fresh pyodbc connection.

        The query is first rewritten by ``convert_to_mssql`` (backticks to
        square brackets, ``%s`` placeholders to ``?``). The connection is
        committed on success and always closed, including when the statement
        fails.

        Args:
            query (str): SQL text; MySQL-style quoting/placeholders allowed.
            database (str): Unused; kept for backward compatibility. The
                database chosen at construction time is always used.
            index_col (str): Column to use as index of the returned
                DataFrame. Pass None to keep the positional index.
                (default = 'id')
            params (list/tuple): Keyword argument holding the values bound to
                the placeholders.

        Returns:
            (DataFrame) The result set, passed through
            ``where(notnull, None)`` and indexed by ``index_col`` when that
            column is present (unindexed otherwise). True when the statement
            produced no result set (e.g. INSERT/UPDATE).

        Raises:
            pyodbc.Error: If connecting, executing or fetching fails.
        """
        logging.debug(f'Before converting: {query}')
        query = self.convert_to_mssql(query)
        logging.debug(f'After converting: {query}')

        raw_params = kwargs.get('params')
        if raw_params is None:
            raw_params = ()
        # NumPy integers are turned into plain ints before binding
        # (presumably because the driver cannot bind them -- the original
        # code special-cased np.int64).
        params = [int(p) if isinstance(p, np.integer) else p for p in raw_params]
        logging.debug(f'Params: {params}')

        logging.debug('Connecting to DB')
        conn = pyodbc.connect(f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={self.HOST};UID={self.USER};PWD={self.PASSWORD};Trusted Connection=yes;DATABASE={self.DATABASE}', as_dict=True)
        logging.debug(f'Connection established with {self.DATABASE}. [{conn}]')

        data = None
        try:
            curs = conn.cursor()
            logging.debug(f'Cursor object created. [{curs}]')
            curs.execute(query, params)
            logging.debug('Query executed.')

            # A cursor only has a description when the statement returned a
            # result set; this replaces calling fetchall() and swallowing
            # whatever it raised.
            if curs.description is None:
                logging.debug('Update Query')
            else:
                logging.debug('Fetching all data.')
                columns = [column[0] for column in curs.description]
                logging.debug(f'Columns: {columns}')
                records = [dict(zip(columns, row)) for row in curs.fetchall()]
                if records:
                    data = pd.DataFrame(records)
                else:
                    data = pd.DataFrame(columns=columns)
            conn.commit()
        finally:
            # Closing without a commit discards the work of a failed statement.
            conn.close()

        if data is None:
            logging.debug('Statement returned no result set. Returning True.')
            return True

        data = data.where(pd.notnull(data), None)
        if index_col is None:
            return data

        try:
            logging.debug(f'Setting `{index_col}` as index')
            return data.set_index(index_col)
        except Exception:
            # Queries that do not select the index column are still valid;
            # hand the frame back unindexed.
            logging.exception(f'Failed to set `{index_col}` as index')
            return data


    def execute__(self, query, database=None, **kwargs):
        """
        Executes an SQL query.

        Args:
            query (str): The query that needs to be executed.
            database (str): Name of the database to execute the query in. Leave
                it none if you want use database during object creation.
            params (list/tuple/dict): List of parameters to pass to in the query.

        Returns:
            (DataFrame) A pandas dataframe containing the data from the executed
            query. (None if an error occurs)
        """
        data = None

#         # Use new database if a new databse is given
#         if database is not None:
#             try:
#                 config = f'mysql://{self.USER}:{self.PASSWORD}@{self.HOST}:{self.PORT}/{database}?charset=utf8'
#                 engine = create_engine(config, pool_recycle=300)
#             except:
#                 logging.exception(f'Something went wrong while connecting. Check trace.')
#                 return False
#         else:
#             engine = self.engine
       
        print('query', query)
        if database is None:
            database = 'karvy'
        data = None
        sql = query
        user_ = self.USER
        host_ = self.HOST
        database = self.DATABASE
        password_ = self.PASSWORD
        inds = [i for i in range(len(sql)) if sql[i] == '']
        for pos, ind in enumerate(inds):
            if pos % 2 == 0:
                sql = sql[:ind] + '[' + sql[ind+1:]
            else:
                sql = sql[:ind] + ']' + sql[ind + 1:]
               
        if connection_string:
            print('connection string', connection_string)
            print('database', database)
            print(type(connection_string + database))
            print(type(connection_string + database))

            try:
                conn = pyodbc.connect(connection_string + database)
            except Exception as e:
                print('Connection string invalid. ', e)
        else:
            try:
                if user_ or password_:
                    conn = pymssql.connect(host=host_,database=database,user=user_,password=password_)
                else:
                    conn = pymssql.connect(host=host_,database=database)
            except Exception as e:
                print("Error establishing connection to DB. ", e)
                conn = pymssql.connect(host=host_,database=database)
        try:
            logging.debug(f'Query: {query}')
            logging.debug(f'Query: {query}')
            curs = conn.cursor(as_dict = True)
            params = kwargs.get('params', [])
            logging.debug(f'Params: {params}')
            curs.execute(sql, tuple(params))
            print('query executed')
            try:
                data = curs.fetchall()
                data = pd.DataFrame(data)
                # print(data)
            except Exception as e:
                logging.debug('Update Query')
            # data = pd.read_sql(sql, conn, index_col='id', **kwargs)
        except exc.ResourceClosedError:
            logging.warning('Query does not have any value to return.')
            return True
        except (exc.StatementError, OperationalError) as e:
            logging.warning(f'Creating new connection. Engine/Connection is probably None. [{e}]')
            self.connect()
            data = pd.read_sql(query, self.engine, index_col='id', **kwargs)
        except:
            logging.exception('Something went wrong executing query. Check trace.')
            params = kwargs['params'] if 'params' in kwargs else None
            conn.rollback()
            return False
        conn.close()
        return data.where((pd.notnull(data)), None).set_index('id')

    def execute_(self, query, database=None, **kwargs):
        """
        Executes an SQL query.

        Args:
            query (str): The query that needs to be executed.
            database (str): Name of the database to execute the query in. Leave
                it none if you want use database during object creation.
            params (list/tuple/dict): List of parameters to pass to in the query.

        Returns:
            (DataFrame) A pandas dataframe containing the data from the executed
            query. (None if an error occurs)
        """
        logging.debug(f'Executing `execute` instead of `execute_`')
        return self.execute(query, index_col=None, **kwargs)
       
        data = None

#         # Use new database if a new database is given
#         if database is not None:
#             try:
#                 config = f'mysql://{self.USER}:{self.PASSWORD}@{self.HOST}:{self.PORT}/{database}?charset=utf8'
#                 engine = create_engine(config, pool_recycle=300)
#             except:
#                 logging.exception(f'Something went wrong while connecting. Check trace.')
#                 return False
#         else:
#             engine = self.engine

#         try:
#             data = pd.read_sql(query, engine, **kwargs)
#         except exc.ResourceClosedError:
#             return True
#         except:
#             logging.exception(f'Something went wrong while connecting. Check trace.')
#             params = kwargs['params'] if 'params' in kwargs else None
#             return False
        print('query', query)
        if database is None:
            database = 'karvy'
        data = None
        sql = query
        user_ = self.USER
        database = self.DATABASE
        host_ = self.HOST
        password_ = self.PASSWORD
        inds = [i for i in range(len(sql)) if sql[i] == '']
        # for pos, ind in enumerate(inds):
        #     if pos % 2 == 0:
        #         sql = sql[:ind] + '[' + sql[ind+1:]
        #     else:
        #         sql = sql[:ind] + ']' + sql[ind + 1:]
               
        if connection_string:
            print('connection string', connection_string)
            print('database', database)
            print(type(connection_string + database))
            print(type(connection_string + database))

            try:
                conn = pyodbc.connect(connection_string + database)
            except Exception as e:
                print('Connection string invalid. ', e)
        else:
            try:
                if user_ or password_:
                    conn = pymssql.connect(host=host_,database=database,user=user_,password=password_)
                else:
                    conn = pymssql.connect(host=host_,database=database)
            except Exception as e:
                print("Error establishing connection to DB. ", e)
                conn = pymssql.connect(host=host_,database=database)
        try:
            logging.debug(f'Query: {query}')
            curs = conn.cursor(as_dict = True)
            params = kwargs.get('params', [])
            logging.debug(f'Params: {params}')
            curs.execute(sql, params)
            print('query executed')
            try:
                data = curs.fetchall()
                data = pd.DataFrame(data)
                print(data)
            except Exception as e:
                logging.debug('Update Query')

            #data = pd.read_sql(sql, conn,**kwargs)
        except exc.ResourceClosedError:
            logging.warning('Query does not have any value to return.')
            return True
        except (exc.StatementError, OperationalError) as e:
            logging.warning(f'Creating new connection. Engine/Connection is probably None. [{e}]')
            self.connect()
            data = pd.read_sql(query, conn,**kwargs)
        except:
            logging.exception('Something went wrong executing query. Check trace.')
            params = kwargs['params'] if 'params' in kwargs else None
            conn.rollback()
            return False
        conn.commit()
        conn.close()
#         return data.where((pd.notnull(data)), None)
        try:
            return data.replace({pd.np.nan: None}).set_index('id')
        except AttributeError as e:
            return True

    def insert(self, data, table, **kwargs):
        """
        Write records stored in a DataFrame to a SQL database.

        A SQLAlchemy engine is created for the call (and kept on
        ``self.engine``). After a successful write the method tries to add a
        primary key on ``id``; a failure of that step is logged and otherwise
        ignored.

        Args:
            data (DataFrame): The DataFrame that needs to be write to SQL database.
            table (str): The table in which the rcords should be written to.
            kwargs: Keyword arguments for pandas to_sql function.
                See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html
                to know the arguments that can be passed. ``chunksize``
                defaults to None when not given.

        Returns:
            (bool) True is succesfully inserted, else false.
        """
        logging.info(f'Inserting into `{table}`')

        odbc_string = f'DRIVER={{ODBC Driver 17 for SQL Server}};SERVER={self.HOST};UID={self.USER};PWD={self.PASSWORD};Trusted Connection=yes;DATABASE={self.DATABASE}'
        engine_url = 'mssql+pyodbc:///?odbc_connect={}'.format(quote_plus(odbc_string))
        self.engine = create_engine(engine_url)

        # A default instead of a hard-coded keyword, so a caller-supplied
        # chunksize no longer collides with it.
        kwargs.setdefault('chunksize', None)
        try:
            data.to_sql(table, self.engine, **kwargs)
        except Exception:
            logging.exception('Something went wrong inserting. Check trace.')
            return False
        finally:
            # Release the pooled connections opened for this write.
            self.engine.dispose()

        try:
            self.execute(f'ALTER TABLE `{table}` ADD PRIMARY KEY (`id`);')
        except Exception:
            # Best effort: the write itself already succeeded.
            logging.debug(f'Could not add a primary key on `{table}`.', exc_info=True)
        return True


   
   
    def insert_(self, data, table, database=None, **kwargs):
        """
        Write records stored in a DataFrame to a SQL database.

        Args:
            data (DataFrame): The DataFrame that needs to be write to SQL database.
            table (str): The table in which the rcords should be written to.
            database (str): The database the table lies in. Leave it none if you
                want use database during object creation.
            kwargs: Keyword arguments for pandas to_sql function.
                See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html
                to know the arguments that can be passed.

        Returns:
            (bool) True is succesfully inserted, else false.
        """
        logging.info(f'Inserting into {table}')

        # # Use new database if a new databse is given
        # if database is not None:
        #     try:
        #         config = f'mysql://{self.USER}:{self.PASSWORD}@{self.HOST}:{self.PORT}/{database}?charset=utf8'
        #         engine = create_engine(config, pool_recycle=300)
        #     except:
        #         logging.exception(f'Something went wrong while connecting. Check trace.')
        #         return False
        # else:
        #     engine = self.engine

        if database is None:
            database = 'karvy'
        data = None
        sql = query
        user_ = self.USER
        database = self.DATABASE
        host_ = self.HOST
        password_ = self.PASSWORD
        inds = [i for i in range(len(sql)) if sql[i] == '']
        # for pos, ind in enumerate(inds):
        #     if pos % 2 == 0:
        #         sql = sql[:ind] + '[' + sql[ind+1:]
        #     else:
        #         sql = sql[:ind] + ']' + sql[ind + 1:]
               
        if connection_string:
            print('connection string', connection_string)
            print('database', database)
            print(type(connection_string + database))
            print(type(connection_string + database))

            try:
                conn = pyodbc.connect(connection_string + database)
            except Exception as e:
                print('Connection string invalid. ', e)
        else:
            try:
                if user_ or password_:
                    conn = pymssql.connect(host=host_,database=database,user=user_,password=password_)
                else:
                    conn = pymssql.connect(host=host_,database=database)
            except Exception as e:
                print("Error establishing connection to DB. ", e)
                conn = pymssql.connect(host=host_,database=database)
        try:
            logging.debug(f'Query: {query}')
            # data.to_sql(table, conn, **kwargs)
            curs = conn.cursor(as_dict = True)
            curs.execute(sql, tuple(kwargs.get('params', [])))
            print('query executed')
            try:
                data = curs.fetchall()
                data = pd.DataFrame(data)
                print(data)
            except Exception as e:
                logging.debug('Update Query')

            try:
                self.execute(f'ALTER TABLE {table} ADD PRIMARY KEY (id);')
            except:
                pass
            conn.commit()
            conn.close()
            return True
        except:
            logging.exception('Something went wrong inserting. Check trace.')
            return False

    def insert_dict(self, data, table):
        """
        Insert dictionary into a SQL database table.

        Args:
            data (DataFrame): The DataFrame that needs to be write to SQL database.
            table (str): The table in which the rcords should be written to.

        Returns:
            (bool) True is succesfully inserted, else false.
        """
        logging.info(f'Inserting dictionary data into {table}...')
        logging.debug(f'Data:\n{data}')

        try:
            column_names = []
            params = []

            for column_name, value in data.items():
                column_names.append(f'{column_name}')
                params.append(value)

            logging.debug(f'Column names: {column_names}')
            logging.debug(f'Params: {params}')

            columns_string = ', '.join(column_names)
            param_placeholders = ', '.join(['%s'] * len(column_names))

            query = f'INSERT INTO {table} ({columns_string}) VALUES ({param_placeholders})'

            return self.execute(query, params=params)
        except:
            logging.exception('Error inserting data.')
            return False

    def update(self, table, update=None, where=None, database=None, force_update=False):
        # Use new database if a new databse is given
        # if database is not None:
        #     try:
        #         config = f'mysql://{self.USER}:{self.PASSWORD}@{self.HOST}:{self.PORT}/{database}?charset=utf8'
        #         self.engine = create_engine(config, pool_recycle=300)
        #     except:
        #         logging.exception(f'Something went wrong while connecting. Check trace.')
        #         return False

        logging.info(f'Updating table: {table}')
        logging.info(f'Update data: {update}')
        logging.info(f'Where clause data: {where}')
        logging.info(f'Force update flag: {force_update}')

        try:
            set_clause = []
            set_value_list = []
            where_clause = []
            where_value_list = []

            if where is not None and where:
                for set_column, set_value in update.items():
                    set_clause.append(f'{set_column}=%s')
                    set_value_list.append(set_value)
                set_clause_string = ', '.join(set_clause)
            else:
                logging.error(f'Update dictionary is None/empty. Must have some update clause.')
                return False

            if where is not None and where:
                for where_column, where_value in where.items():
                    where_clause.append(f'{where_column}=%s')
                    where_value_list.append(where_value)
                where_clause_string = ' AND '.join(where_clause)
                query = f'UPDATE {table} SET {set_clause_string} WHERE {where_clause_string}'
            else:
                if force_update:
                    query = f'UPDATE {table} SET {set_clause_string}'
                else:
                    message = 'Where dictionary is None/empty. If you want to force update every row, pass force_update as True.'
                    logging.error(message)
                    return False

            params = set_value_list + where_value_list
            self.execute(query, params=params)
            return True
        except:
            logging.exception('Something went wrong updating. Check trace.')
            return False

    def get_column_names(self, table, database=None):
        """
        Get all column names from an SQL table.

        Args:
            table (str): Name of the table from which column names should be extracted.
            database (str): Name of the database in which the table lies. Leave
                it none if you want use database during object creation.

        Returns:
            (list) List of headers. (None if an error occurs)
        """
        try:
            logging.info(f'Getting column names of table {table}')
            return list(self.execute(f'SELECT * FROM {table}', database))
        except:
            logging.exception('Something went wrong getting column names. Check trace.')
            return

    def execute_default_index(self, query, database=None, **kwargs):
        """
        Executes an SQL query.

        Args:
            query (str): The query that needs to be executed.
            database (str): Name of the database to execute the query in. Leave
                it none if you want use database during object creation.
            params (list/tuple/dict): List of parameters to pass to in the query.

        Returns:
            (DataFrame) A pandas dataframe containing the data from the executed
            query. (None if an error occurs)
        """

        logging.debug(f'Executing `execute` instead of `execute_default_index`')
        return self.execute(query, index_col=None, **kwargs)
        data = None

        # # Use new database if a new databse is given
        # if database is not None:
        #     try:
        #         config = f'mysql://{self.USER}:{self.PASSWORD}@{self.HOST}:{self.PORT}/{database}?charset=utf8'
        #         engine = create_engine(config, pool_recycle=300)
        #     except:
        #         logging.exception(f'Something went wrong while connecting. Check trace.')
        #         return False
        # else:
        #     engine = self.engine

        print('query', query)
        if database is None:
            database = 'karvy'
        data = None
        sql = query
        user_ = self.USER
        host_ = self.HOST
        database = self.DATABASE
        password_ = self.PASSWORD
        inds = [i for i in range(len(sql)) if sql[i] == '']
        for pos, ind in enumerate(inds):
            if pos % 2 == 0:
                sql = sql[:ind] + '[' + sql[ind+1:]
            else:
                sql = sql[:ind] + ']' + sql[ind + 1:]
               
        if connection_string:
            print('connection string', connection_string)
            print('database', database)
            print(type(connection_string + database))
            print(type(connection_string + database))

            try:
                conn = pyodbc.connect(connection_string + database)
            except Exception as e:
                print('Connection string invalid. ', e)
        else:
            try:
                if user_ or password_:
                    conn = pymssql.connect(host=host_,database=database,user=user_,password=password_)
                else:
                    conn = pymssql.connect(host=host_,database=database)
            except Exception as e:
                print("Error establishing connection to DB. ", e)
                conn = pymssql.connect(host=host_,database=database)

        try:
            logging.debug(f'Query: {query}')
            # data.to_sql(table, conn, **kwargs)
            curs = conn.cursor(as_dict = True)
           
            curs.execute(sql, tuple(kwargs.get('params', [])))
            print('query executed')
            try:
                data = curs.fetchall()
                data = pd.DataFrame(data)
                print(data)
            except Exception as e:
                logging.debug('Update Query')
            # data = pd.read_sql(query, conn, **kwargs)
            conn.commit()
            conn.close()
        except exc.ResourceClosedError:
            return True
        except:
            logging.exception(f'Something went wrong while executing query. Check trace.')
            params = kwargs['params'] if 'params' in kwargs else None
            return False

        return data.where((pd.notnull(data)), None).set_index('id')


    def get_all(self, table, database=None, discard=None):
        """
        Get all data from an SQL table.

        Args:
            table (str): Name of the table from which data should be extracted.
            database (str): Name of the database in which the table lies. Leave
                it none if you want use database during object creation.
            discard (list): columns to be excluded while selecting all
        Returns:
            (DataFrame) A pandas dataframe containing the data. (None if an error
            occurs)
        """
        logging.info(f'Getting all data from {table}')
        if discard:
            logging.info(f'Discarding columns {discard}')
            columns = list(self.execute_default_index(f'SHOW COLUMNS FROM {table}',database).Field)
            columns = [col for col in columns if col not in discard]
            columns_str = json.dumps(columns).replace("'",'').replace('"','')[1:-1]
            return self.execute(f'SELECT {columns_str} FROM {table}', database)

        return self.execute(f'SELECT * FROM {table}', database)

    def get_latest(self, data, group_by_col, sort_col):
        """
        Group data by a column containing repeated values and get latest from it by
        taking the latest value based on another column.

        Example:
        Get the latest products
            id     product   date
            220    6647     2014-09-01
            220    6647     2014-10-16
            826    3380     2014-11-11
            826    3380     2015-05-19
            901    4555     2014-09-01
            901    4555     2014-11-01

        The function will return
            id     product   date
            220    6647     2014-10-16
            826    3380     2015-05-19
            901    4555     2014-11-01

        Args:
            data (DataFrame): Pandas DataFrame to query on.
            group_by_col (str): Column containing repeated values.
            sort_col (str): Column to identify the latest record.

        Returns:
            (DataFrame) Contains the latest records. (None if an error occurs)
        """
        try:
            logging.info('Grouping data...')
            logging.info(f'Data: {data}')
            logging.info(f'Group by column: {group_by_col}')
            logging.info(f'Sort column: {sort_col}')
            return data.sort_values(sort_col).set_index('id').groupby(group_by_col).tail(1)
        except KeyError as e:
            logging.error(f'Column {e.args[0]} does not exist.')
            return None
        except:
            logging.exception('Something went wrong while grouping data.')
            return None

import os

# NOTE(review): a real host address and database credentials are committed
# here in plain text -- rotate them and load them from outside the source.
db_config = {
    'host': '13.233.100.20',
    'port': '1433',
    'user': 'SA',
    'password': 'Akhil@Akhil1',
}

# Mirror the settings into the environment variables that DB.__init__ reads.
os.environ.update({
    'HOST_IP': db_config['host'],
    'LOCAL_DB_USER': db_config['user'],
    'LOCAL_DB_PASSWORD': db_config['password'],
    'LOCAL_DB_PORT': db_config['port'],
})




# In[2]:
# helper functions
def debug_df(df, num=20):
    """
    Dump a DataFrame for inspection.

    Calls ``df.printSchema()`` and then ``df.show(num)``.
    NOTE(review): these look like the PySpark DataFrame methods -- confirm
    ``df`` is always a Spark DataFrame.

    Args:
        df: Object providing ``printSchema()`` and ``show(n)``.
        num (int): Value passed to ``df.show``. (default = 20)
    """
    df.printSchema()
    df.show(num)
    

def decrease_date(s, days):
    """
    Return the date that lies ``days`` days before ``s``.

    Args:
        s (str): Date formatted as ``%Y-%m-%d``.
        days (int): Number of days to subtract (a negative value adds days).

    Returns:
        (str) The shifted date formatted as ``%Y-%m-%d``.

    Raises:
        ValueError: If ``s`` does not match ``%Y-%m-%d``.
    """
    # Imported locally: this notebook binds the global name `datetime` both to
    # the class (`from datetime import datetime`) and to the module
    # (`import datetime`), so the global cannot be relied on.
    import datetime as _dt

    shifted = _dt.datetime.strptime(s, "%Y-%m-%d") - _dt.timedelta(days=days)
    return shifted.strftime("%Y-%m-%d")

def read_df(table, columns_to_retrieve, database):
    """
    Load selected columns of a table through the JDBC data source of the
    global ``spark`` session.

    The connection uses the module-level ``server``, ``port``, ``user`` and
    ``password`` names, which must be defined before this is called.

    Args:
        table (str): Table to select from.
        columns_to_retrieve (iterable of str): Column names, joined with commas.
        database (str): Value used for ``databaseName`` in the JDBC URL.

    Returns:
        The object returned by the reader's ``load()``.
    """
    column_list = ','.join(columns_to_retrieve)
    query = f"SELECT {column_list} from {table}"

    reader = spark.read.format("jdbc")
    reader = reader.option("url", f"jdbc:sqlserver://{server}:{port};databaseName={database};")
    reader = reader.option("query", query)
    reader = reader.option("user", user)
    reader = reader.option("password", password)
    reader = reader.option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")
    return reader.load()

from datetime import timedelta, date
#
def daterange(start_date, end_date):
    """
    Yield every day from ``start_date`` up to, but not including, ``end_date``.

    Args:
        start_date (str): First day, formatted as ``%Y-%m-%d``.
        end_date (str): Exclusive end day, formatted as ``%Y-%m-%d``.

    Yields:
        (str) Consecutive dates formatted as ``%Y-%m-%d``. Nothing is yielded
        when ``end_date`` is not after ``start_date``.

    Raises:
        ValueError: If either argument does not match ``%Y-%m-%d``.
    """
    # Imported locally: this notebook binds the global name `datetime` both to
    # the class and to the module, so the global cannot be relied on.
    import datetime as _dt

    first_day = _dt.datetime.strptime(start_date, "%Y-%m-%d")
    last_day = _dt.datetime.strptime(end_date, "%Y-%m-%d")
    for offset in range((last_day - first_day).days):
        yield (first_day + _dt.timedelta(offset)).strftime("%Y-%m-%d")

# In[3]:
# imports
# required libraries
from pyspark import SparkContext, SparkConf #
from pyspark.sql import SparkSession # for dataframe conversions
# for type conversions
from datetime import datetime
from pyspark.sql.window import Window
from pyspark.sql.functions import col, udf, sum # col, udf (user defined functions)
from pyspark.sql.types import DateType, IntegerType # type
from pyspark.sql.types import *
from pyspark.sql.functions import trim # for trimming
from pyspark.sql.functions import collect_list, sort_array, row_number # for grouping and taking the last/first element
from pyspark.sql.functions import *
import sys
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import time
import datetime 
import multiprocessing


# initialize spark
# Master "local[*]", app name "InitRun", and a shuffle-partition count of
# twice the number of CPUs reported by multiprocessing.
conf = SparkConf()
conf.setMaster("local[*]").setAppName("InitRun").set("spark.sql.shuffle.partitions", 2*multiprocessing.cpu_count())
 
    

import pandas as pd
import numpy as np

from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import LongType

# Declare the function and create the UDF
def populate_db_func(purred: pd.Series, units: pd.Series) -> pd.Series:
    """Keep ``units`` where ``purred`` equals 'P' and use 0 everywhere else.

    Args:
        purred: Series of purchase/redemption flags.
        units: Series of unit values, positionally aligned with ``purred``.

    Returns:
        A new numeric Series (fresh default index) of the selected values.
    """
    is_purchase = purred == 'P'
    selected = np.where(is_purchase, units, 0)
    return pd.to_numeric(pd.Series(selected))

def populate_cr_func(purred: pd.Series, units: pd.Series) -> pd.Series:
    """Keep ``units`` where ``purred`` equals 'R' and use 0 everywhere else.

    Args:
        purred: Series of purchase/redemption flags.
        units: Series of unit values, positionally aligned with ``purred``.

    Returns:
        A new numeric Series (fresh default index) of the selected values.
    """
    is_redemption = purred == 'R'
    selected = np.where(is_redemption, units, 0)
    return pd.to_numeric(pd.Series(selected))

# Wrap the two pandas functions as pandas UDFs declared to return FloatType.
populate_db = pandas_udf(populate_db_func, returnType=FloatType())
populate_cr = pandas_udf(populate_cr_func, returnType=FloatType())
    
# Create the Spark context and the Spark session.

# System properties are set before getOrCreate() so they are in place when the
# context is obtained.
# NOTE(review): Spark's documented off-heap settings are
# spark.memory.offHeap.enabled / spark.memory.offHeap.size; confirm that the
# spark.executor.offHeap.* keys below have any effect.
SparkContext.setSystemProperty("spark.driver.memory", "60g")
SparkContext.setSystemProperty("spark.executor.memory", "60g")
SparkContext.setSystemProperty("spark.executor.offHeap.enabled", "true")
SparkContext.setSystemProperty("spark.executor.offHeap.size", "200g")
sc = SparkContext.getOrCreate(conf=conf)

# Session used by every JDBC / parquet read in this notebook.
spark = SparkSession(sc)
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")


In [4]:
import multiprocessing
# Displays twice the CPU count, i.e. the value configured for
# spark.sql.shuffle.partitions when the Spark conf was built.
(2*multiprocessing.cpu_count())

16

In [5]:
# configs
import os
# NOTE(review): database credentials are hard-coded in this cell; prefer
# providing them through the environment / a secrets store.
# DB.__init__ and the os.environ.get() lookups below read these variables.
os.environ['HOST_IP'] = '192.168.15.126'
os.environ['LOCAL_DB_USER'] = 'BRS'
os.environ['LOCAL_DB_PASSWORD'] = 'Kfintech123$'
os.environ['LOCAL_DB_PORT'] = '1433'

import os

# comment when using the configs from env file
# Fallback values for os.environ.get(); because the variables are assigned
# just above, the fallbacks are not used when this cell runs as written.
default_ip = '13.233.100.20'
default_user  = 'SA'
default_password = 'Akhil@Akhil1'
default_port = '1433'
default_tenant_id = 'karvy'

# # initializations
# Connection settings used when building the JDBC URLs and reader options.
server = os.environ.get('HOST_IP', default_ip)
port = os.environ.get('LOCAL_DB_PORT', default_port)
user = os.environ.get('LOCAL_DB_USER', default_user)
password = os.environ.get('LOCAL_DB_PASSWORD', default_password)

# Keyword arguments expanded into DB(...) by save_metric.
db_config = {
   'host': server,
   'port': port,
   'user': user,
   'password':password
}


def save_metric(date, metric_name, metric_value, fund_name, group_level, table_name, database='IB_Comp_funds'):
    """Insert one row into ``karvy_metrics``; log instead of raising on failure.

    Args:
        date: Value written as the first column.
        metric_name: Name of the metric.
        metric_value: Value of the metric.
        fund_name: Fund the metric belongs to (written last).
        group_level: Grouping level of the metric.
        table_name: Source table the metric was computed from.
        database: Database handed to ``DB``. Defaults to 'IB_Comp_funds'.

    Returns:
        None. Errors raised while executing the insert are logged via
        ``logging.error`` and swallowed; errors from constructing ``DB``
        propagate.

    NOTE(review): the statement is assembled by string interpolation, so a
    value containing a quote breaks it and untrusted values could inject SQL;
    switch to a parameterized query if ``DB.execute_`` supports one.
    """
    db = DB(database, tenant_id='', **db_config)
    try:
        db.execute_(
            f"INSERT INTO `karvy_metrics` VALUES ( '{date}','{metric_name}','{metric_value}', '{table_name}', '{group_level}', '{fund_name}')"
        )
    except Exception as err:
        logging.error("Unable to insert metrics data")
        logging.error(err)


#### initialize

In [6]:
%%time
def initialize(date_str, table, database='funds', date_column='BatchCloseDate', tenant_id='karvy',
              transaction_status='Active', purred = 'Purred', transaction_type = 'TransactionType',
              folio = 'Folio', purchase_units = 'DB_Units', redemption_units = 'Cr_Units',scheme = 'SchemeCode',
               plan = 'PlanCode',groupby_level='SP',ter_flag='TerFlag', direct_db=None):
    """Build the opening-balance snapshot up to ``date_str`` and write it as parquet.

    Reads ``table`` over JDBC, keeps rows dated on or before ``date_str`` whose
    status column is "Y" and whose purchase/redemption flag is "P" or "R",
    sums purchase and redemption units per day and group, accumulates running
    balances, and writes the newest row of every group to
    ``{table}_latest/data_{groupby_level}_{date_str}.parquet`` (overwriting).

    Args:
        date_str (str): Inclusive cut-off date, ``yyyy-MM-dd`` (e.g. 2020-04-30).
        table (str): Source table name.
        database (str): Database name; prefixed with the tenant id unless
            ``direct_db`` is given.
        date_column (str): Column holding the batch close date.
        tenant_id (str): Tenant prefix; ``default_tenant_id`` is used when falsy.
        transaction_status (str): Status column name (rows must equal "Y").
        purred (str): Purchase/redemption flag column name.
        transaction_type (str): Transaction type column name.
        folio (str): Folio column name.
        purchase_units, redemption_units, scheme, plan (str): Accepted for
            interface compatibility; see the NOTE in the body.
        groupby_level (str): One of 'SP', 'SPT', 'SPF', 'SPFT', 'SPFTTer'.
        ter_flag (str): TER flag column name (used by 'SPFTTer').
        direct_db (str | None): Full database name, used verbatim when given.

    Returns:
        int: Row count of the source table before any filtering.

    Raises:
        ValueError: If ``groupby_level`` is not supported by this function.
            Raised before any data is read.
    """
    # NOTE(review): these four assignments discard whatever the caller passed
    # for the same-named parameters. Kept as-is so existing results do not
    # change; confirm whether the parameters were meant to be honoured.
    purchase_units = 'DB_Units'
    redemption_units = 'Cr_Units'
    scheme = 'SchemeCode'
    plan = 'PlanCode'

    # names of the columns this function derives
    batch_close_date = 'batch_close_date'
    db_units = 'purchase_units'
    cr_units = 'redemption_units'
    balance_units = 'balance_units'
    day_purchase_units = 'day_pu'
    day_redemption_units = 'day_ru'
    balance_purchase_units = 'balance_pu'
    balance_redemption_units = 'balance_ru'
    calculated_date = 'calculated_date'

    # Window partition columns per grouping level. Validate up front so an
    # unsupported level fails immediately instead of after the full table read.
    partitions_by_level = {
        'SP': [scheme, plan],
        'SPT': [scheme, plan, transaction_type],
        'SPF': [scheme, plan, folio],
        'SPFT': [scheme, plan, folio, transaction_type],
        'SPFTTer': [scheme, plan, folio, transaction_type, ter_flag],
    }
    if groupby_level not in partitions_by_level:
        raise ValueError(
            f"Unsupported groupby_level {groupby_level!r}; expected one of "
            f"{sorted(partitions_by_level)}. Broker-level groupings (e.g. SPFB, SPFTB) "
            "are not available here because no broker column is derived."
        )
    window_partition = partitions_by_level[groupby_level]
    group_by_cols = window_partition + [batch_close_date]

    database = direct_db or (f'{tenant_id or default_tenant_id}_{database}')

    # read data
    data = spark.read.format("jdbc") \
        .option("url", f"jdbc:sqlserver://{server}:{port};databaseName={database};") \
        .option("dbtable", table) \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
        .load()

    # returned to the caller: size of the raw table, before filtering
    total_count = data.count()

    # normalise the key columns: trim whitespace and upper-case
    for column_name in (transaction_status, purred, folio, scheme, plan, transaction_type):
        data = data.withColumn(column_name, upper(trim(col(column_name))))

    # cast the date column into dates, as we are concerned only with dates now
    data = data.withColumn(date_column, col(date_column).cast('date'))

    # keep rows up to the cut-off date (inclusive) that have a usable date
    data = data.filter(col(date_column) <= date_str)
    data = data.filter(col(date_column).isNotNull())
    data = data.filter(~trim(col(date_column)).cast("string").eqNullSafe(''))

    # mandatory rules (the columns are already trimmed and upper-cased above)
    data = data.filter(col(transaction_status) == "Y")
    data = data.filter(col(purred).isin("P", "R"))

    data = data.withColumn(batch_close_date, data[date_column])

    # purchase units count only on "P" rows, redemption units only on "R" rows
    data = data.withColumn(db_units, when(col(purred) == "P", col(purchase_units)).otherwise(0))
    data = data.withColumn(cr_units, when(col(purred) == "R", col(redemption_units)).otherwise(0))

    if groupby_level == 'SP':
        # scheme/plan level leaves out these transaction types
        ignored_tr_types = ['CNI', 'CNO', 'TRMI',
                            'TRMO', 'TRFI', 'TRFO', 'PLDO',
                            'UPLO', 'DMT', 'RMT', 'CNIR', 'CNOR', 'TRMIR', 'TRMOR',
                            'TRFIR', 'TRFOR', 'PLDOR', 'UPLOR', 'DMTR', 'RMTR']
        data = data.filter(~col(transaction_type).isin(ignored_tr_types))

    # roll up the data: per-day sums for every group
    rolledup_data = data.groupBy(group_by_cols).agg({db_units: 'sum', cr_units: 'sum'})
    rolledup_data = rolledup_data \
        .withColumnRenamed(f"sum({db_units})", day_purchase_units) \
        .withColumnRenamed(f"sum({cr_units})", day_redemption_units)

    # running totals from the first day of each group up to the current day
    running_total = Window.partitionBy(window_partition) \
        .orderBy(batch_close_date) \
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    rolledup_data = rolledup_data.withColumn(
        balance_purchase_units, sum(col(day_purchase_units)).over(running_total))
    rolledup_data = rolledup_data.withColumn(
        balance_redemption_units, sum(col(day_redemption_units)).over(running_total))
    rolledup_data = rolledup_data.withColumn(
        balance_units, col(balance_purchase_units) - col(balance_redemption_units))

    # get the latest data: newest row of every group
    latest_data = rolledup_data.filter(col(batch_close_date) <= date_str)
    newest_first = Window.partitionBy(window_partition).orderBy(col(batch_close_date).desc())
    latest_data = latest_data.withColumn("rrn", row_number().over(newest_first)) \
        .where(col("rrn") == 1).drop("rrn")

    # record the date up to which the balances were calculated
    latest_data = latest_data.withColumn(calculated_date, lit(date_str).cast('date'))

    latest_data.write.parquet(f"{table}_latest/data_{groupby_level}_{date_str}.parquet", mode='overwrite')

    print (f'inital file on date {date_str} written')
    return total_count


CPU times: user 10 µs, sys: 0 ns, total: 10 µs
Wall time: 16.9 µs


In [7]:
%%time
def initialize(date_str, table, database='funds', date_column='BatchCloseDate', tenant_id='karvy',
              transaction_status='Active', purred = 'Purred', transaction_type = 'TransactionType',
              folio = 'Folio', purchase_units = 'DB_Units', redemption_units = 'Cr_Units',scheme = 'SchemeCode',
               plan = 'PlanCode',groupby_level='SP',ter_flag='TerFlag', direct_db=None,
              broker_column = 'BrokerARN',transaction_no = 'TransactionNo',purchase_transaction_no = 'PurchaseTransactionNo',
              trans_table='Trans_116'):
    """Build the opening-balance snapshot up to ``date_str`` and write it as parquet.

    Reads ``table`` over JDBC, keeps rows dated on or before ``date_str`` whose
    status column is "Y" and whose purchase/redemption flag is "P" or "R",
    sums purchase and redemption units per day and group, accumulates running
    balances, and writes the newest row of every group to
    ``{table}_latest/data_{groupby_level}_{date_str}.parquet`` (overwriting).

    For grouping levels containing "B" a ``broker_code`` column is derived:
    folios (folio, scheme, plan) with a single distinct broker keep it;
    in multi-broker folios purchases keep their own broker, while redemptions
    are matched through ``trans_table`` to their purchase transaction and take
    that purchase's broker.

    Args:
        date_str (str): Inclusive cut-off date, ``yyyy-MM-dd`` (e.g. 2020-04-30).
        table (str): Source table name.
        database (str): Database name; prefixed with the tenant id unless
            ``direct_db`` is given.
        date_column (str): Column holding the batch close date.
        tenant_id (str): Tenant prefix; ``default_tenant_id`` is used when falsy.
        transaction_status (str): Status column name (rows must equal "Y").
        purred (str): Purchase/redemption flag column name.
        transaction_type (str): Transaction type column name.
        folio (str): Folio column name.
        purchase_units, redemption_units, scheme, plan (str): Accepted for
            interface compatibility; see the NOTE in the body.
        groupby_level (str): One of 'SP', 'SPT', 'SPF', 'SPFB', 'SPFT',
            'SPFTTer', 'SPFTBTer', 'SPFTB'.
        ter_flag (str): TER flag column name; nulls become 'EMPTY_TER' for the
            "Ter" levels.
        direct_db (str | None): Full database name, used verbatim when given.
        broker_column (str): Broker column name; nulls become 'EMPTY_BROKER'
            for the broker levels.
        transaction_no (str): Transaction number column name.
        purchase_transaction_no (str): Column in ``trans_table`` holding the
            purchase transaction a redemption refers to.
        trans_table (str): Table linking redemptions to purchase transactions.

    Returns:
        int: Row count of the source table before any filtering.

    Raises:
        ValueError: If ``groupby_level`` is not one of the supported levels.
            Raised before any data is read.
    """
    # NOTE(review): these four assignments discard whatever the caller passed
    # for the same-named parameters. Kept as-is so existing results do not
    # change; confirm whether the parameters were meant to be honoured.
    purchase_units = 'DB_Units'
    redemption_units = 'Cr_Units'
    scheme = 'SchemeCode'
    plan = 'PlanCode'

    # names of the columns this function derives
    batch_close_date = 'batch_close_date'
    db_units = 'purchase_units'
    cr_units = 'redemption_units'
    balance_units = 'balance_units'
    day_purchase_units = 'day_pu'
    day_redemption_units = 'day_ru'
    balance_purchase_units = 'balance_pu'
    balance_redemption_units = 'balance_ru'
    calculated_date = 'calculated_date'
    broker = 'broker_code'

    # Window partition columns per grouping level. Validate up front so an
    # unsupported level fails immediately instead of after the full table read.
    partitions_by_level = {
        'SP': [scheme, plan],
        'SPT': [scheme, plan, transaction_type],
        'SPF': [scheme, plan, folio],
        'SPFB': [scheme, plan, folio, broker],
        'SPFT': [scheme, plan, folio, transaction_type],
        'SPFTTer': [scheme, plan, folio, transaction_type, ter_flag],
        'SPFTBTer': [scheme, plan, folio, transaction_type, ter_flag, broker],
        'SPFTB': [scheme, plan, folio, transaction_type, broker],
    }
    if groupby_level not in partitions_by_level:
        raise ValueError(
            f"Unsupported groupby_level {groupby_level!r}; expected one of "
            f"{sorted(partitions_by_level)}"
        )
    window_partition = partitions_by_level[groupby_level]
    group_by_cols = window_partition + [batch_close_date]

    database = direct_db or (f'{tenant_id or default_tenant_id}_{database}')

    def _jdbc_reader():
        # JDBC reader carrying the connection options shared by both reads.
        return spark.read.format("jdbc") \
            .option("url", f"jdbc:sqlserver://{server}:{port};databaseName={database};") \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver")

    # read data
    data = _jdbc_reader().option("dbtable", table).load()

    # returned to the caller: size of the raw table, before filtering
    total_count = data.count()

    # normalise the key columns: trim whitespace and upper-case
    for column_name in (transaction_status, purred, folio, scheme, plan,
                        broker_column, transaction_type):
        data = data.withColumn(column_name, upper(trim(col(column_name))))

    # cast the date column into dates, as we are concerned only with dates now
    data = data.withColumn(date_column, col(date_column).cast('date'))

    # keep rows up to the cut-off date (inclusive) that have a usable date
    data = data.filter(col(date_column) <= date_str)
    data = data.filter(col(date_column).isNotNull())
    data = data.filter(~trim(col(date_column)).cast("string").eqNullSafe(''))

    # mandatory rules (the columns are already trimmed and upper-cased above)
    data = data.filter(col(transaction_status) == "Y")
    data = data.filter(col(purred).isin("P", "R"))

    data = data.withColumn(batch_close_date, data[date_column])

    if 'B' in groupby_level:
        folio_keys = [folio, scheme, plan]
        data = data.fillna({broker_column:'EMPTY_BROKER'})
        data = data.cache()

        # Folios with more than one distinct broker. The aggregate is aliased
        # so the filter does not depend on Spark's auto-generated column name
        # (typically "count(DISTINCT <col>)" for countDistinct).
        multi_broker_folio = data.groupby(folio, scheme, plan) \
            .agg(countDistinct(broker_column).alias('distinct_broker_count')) \
            .filter(col('distinct_broker_count') > 1)
        multi_broker_keys = multi_broker_folio.select(folio, scheme, plan)
        # each count() below runs a Spark job; kept for the progress output
        print (multi_broker_folio.count())
        print (data.count())

        # folios having a single broker code keep it unchanged
        single_broker_folio_data = data.join(multi_broker_keys, on=folio_keys, how='left_anti')
        single_broker_data = single_broker_folio_data.withColumn(broker, col(broker_column))
        print (single_broker_folio_data.count())

        multi_broker_folio_data = data.join(multi_broker_keys, on=folio_keys, how='left_semi')
        print (multi_broker_folio_data.count())

        # purchases keep their own broker
        data_p = multi_broker_folio_data.filter(col(purred) == 'P')
        data_p = data_p.withColumn(broker, col(broker_column))
        print (data_p.count())

        # redemptions take the broker of the purchase they refer to
        data_r = multi_broker_folio_data.filter(col(purred) == 'R')
        print (data_r.count())

        trans_data = _jdbc_reader().option("query", f"SELECT * from {trans_table}").load()

        # keep only the transaction links that belong to these redemptions
        trans_data = trans_data.join(data_r, on=[scheme, plan, folio, transaction_no], how='left_semi')
        print ('total trans', trans_data.count())

        trans_data = trans_data.drop(date_column)

        # attach the purchase transaction number to every redemption row
        trans_data = trans_data.select(scheme, plan, folio, transaction_no, purchase_transaction_no).join(data_r,
                                     on=[scheme, plan, folio, transaction_no]).drop(transaction_no)

        # NOTE(review): the redemption's own transaction number was dropped
        # above, so after this join the transaction number column holds the
        # matched purchase's number (null when no purchase matches).
        trans_data = trans_data.join(data_p.select(transaction_no, broker), trans_data[purchase_transaction_no] == data_p[transaction_no], how='left').drop(purchase_transaction_no)

        data = single_broker_data.union(data_p.select(single_broker_data.columns)).union(trans_data.select(single_broker_data.columns))
        print ('final data', data.count())

    # purchase units count only on "P" rows, redemption units only on "R" rows
    data = data.withColumn(db_units, when(col(purred) == "P", col(purchase_units)).otherwise(0))
    data = data.withColumn(cr_units, when(col(purred) == "R", col(redemption_units)).otherwise(0))

    if groupby_level == 'SP':
        # scheme/plan level leaves out these transaction types
        ignored_tr_types = ['CNI', 'CNO', 'TRMI',
                            'TRMO', 'TRFI', 'TRFO', 'PLDO',
                            'UPLO', 'DMT', 'RMT', 'CNIR', 'CNOR', 'TRMIR', 'TRMOR',
                            'TRFIR', 'TRFOR', 'PLDOR', 'UPLOR', 'DMTR', 'RMTR']
        data = data.filter(~col(transaction_type).isin(ignored_tr_types))
    elif groupby_level in ('SPFTTer', 'SPFTBTer'):
        # give missing TER flags an explicit value before grouping on them
        data = data.fillna({ter_flag:'EMPTY_TER'})

    # roll up the data: per-day sums for every group
    rolledup_data = data.groupBy(group_by_cols).agg({db_units: 'sum', cr_units: 'sum'})
    rolledup_data = rolledup_data \
        .withColumnRenamed(f"sum({db_units})", day_purchase_units) \
        .withColumnRenamed(f"sum({cr_units})", day_redemption_units)

    # running totals from the first day of each group up to the current day
    running_total = Window.partitionBy(window_partition) \
        .orderBy(batch_close_date) \
        .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    rolledup_data = rolledup_data.withColumn(
        balance_purchase_units, sum(col(day_purchase_units)).over(running_total))
    rolledup_data = rolledup_data.withColumn(
        balance_redemption_units, sum(col(day_redemption_units)).over(running_total))
    rolledup_data = rolledup_data.withColumn(
        balance_units, col(balance_purchase_units) - col(balance_redemption_units))

    # get the latest data: newest row of every group
    latest_data = rolledup_data.filter(col(batch_close_date) <= date_str)
    newest_first = Window.partitionBy(window_partition).orderBy(col(batch_close_date).desc())
    latest_data = latest_data.withColumn("rrn", row_number().over(newest_first)) \
        .where(col("rrn") == 1).drop("rrn")

    # record the date up to which the balances were calculated
    latest_data = latest_data.withColumn(calculated_date, lit(date_str).cast('date'))

    latest_data.write.parquet(f"{table}_latest/data_{groupby_level}_{date_str}.parquet", mode='overwrite')

    print (f'inital file on date {date_str} written')
    return total_count

# Parameters for one fund run: the fund code selects the source tables.
code='116'
name='AXA'
print (code, name)
start = time.time()
# main transactions table and the table linking redemptions to purchases
table = f'm_Trans_{code}'
trans_table = f'Trans_{code}'
groupby_level='SPFTBTer'
# cut-off date for the initial snapshot, and the date range defined for later cells
init_date = '2020-08-31'
start_date = '2020-04-01'
end_date = '2020-09-02'
#         mcr_month_date = '2020-05-01'

# Uncomment to (re)build the initial snapshot; this reads the full table.
# records = initialize(init_date, table=table,direct_db='BankRecon', groupby_level=groupby_level, trans_table=trans_table)

116 AXA
CPU times: user 593 µs, sys: 0 ns, total: 593 µs
Wall time: 511 µs


#### daily job

In [8]:
%%time
import time
def dialy_job(date_str, table='trans116', database='funds', date_column='BatchCloseDate', tenant_id='karvy',
              transaction_status='Active', purred = 'Purred', transaction_type = 'TransactionType',
              folio = 'Folio', purchase_units = 'DB_Units', redemption_units = 'Cr_Units',scheme = 'SchemeCode',
               plan = 'PlanCode',groupby_level='SP', direct_db=None, fn_fromdt = 'fn_fromdt',fn_fromdt_format = 'dd/MM/yyyy',
              fn_scheme = 'fn_scheme',fn_plan = 'fn_plan', fn_nav = 'fn_nav', nav_table='nav_master', 
              scheme_table='scheme_master',scheme_code = 'scheme_code', 
              plan_code = 'plan_code', category = 'SebiSchemeCategory',
              subcategory = 'SebiSchemeSubCategory',nature = 'nature', newmcrid='NewMCRId', ter_flag='TerFlag',
              broker_column = 'BrokerARN',transaction_no = 'TransactionNo',purchase_transaction_no = 'PurchaseTransactionNo',
              trans_table='Trans_116'
             ):
    """Dialy run this and store the latest data and aum data too"""
    
    # inflow outflow
    inflow_db_trtypes = ['NEW', 'ADD', 'IPO', 'SIN', 'LTIN', 'LTIA', 'STPN', 'STPA', 'STPI','DIR', 'DSPI', 'SWIN','SWIA']
    inflow_cr_trtypes = ['NEWR', 'ADDR', 'IPOR', 'SINR', 'LTINR', 'LTIAR', 'STPNR', 
                         'STPAR', 'STPIR','DIRR', 'DSPIR', 'SWINR','SWIAR']
    
    outflow_db_trtypes = ['FULR', 'REDR', 'LTOFR', 'LTOPR', 'STPOR', 'SWDR', 'TRGR', 'SWOPR', 'SWOFR']
    outflow_cr_trtypes = ['FUL', 'RED', 'SWD','TRG','LTOF', 'LTOP','STPO', 'SWOP', 'SWOF']
    
    
    
    # configurations we use
    batch_close_date = 'batch_close_date'
    db_units = 'purchase_units'
    cr_units = 'redemption_units'
    balance_units = 'balance_units'
    day_purchase_units = 'day_pu'
    day_redemption_units = 'day_ru'
    balance_purchase_units = 'balance_pu'
    balance_redemption_units = 'balance_ru'
    calculated_date = 'calculated_date'
    today_pu = 'today_pu'
    today_ru = 'today_ru'
    effective_nav = 'effective_nav'
    aum = 'aum'
    aaum = 'aaum'
    inflow = 'inflow'
    outflow = 'outflow'
    inflow_db_units = 'inflow_purchase_units'
    inflow_cr_units = 'inflow_redemption_units'
    outflow_db_units = 'outflow_purchase_units'
    outflow_cr_units = 'outflow_redemption_units'
    inflow_units = 'inflow_units'
    outflow_units = 'outflow_units'
    
    # get the latest data from the previously stored file
    date_obj = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    day_num = date_obj.day
    previous_day = date_obj - datetime.timedelta(1)
    previous_day_str = previous_day.strftime('%Y-%m-%d')
    latest_data = spark.read.parquet(f"{table}_latest/data_{groupby_level}_{previous_day_str}.parquet")

    # debug_df(latest_data)
    
    # get  the todays data
    database = direct_db or (f'{tenant_id or default_tenant_id}_{database}')
    query = f"SELECT * from {table} where CAST({date_column} AS DATE)='{date_str}'"
    data  = spark.read.format("jdbc") \
            .option("url", f"jdbc:sqlserver://{server}:{port};databaseName={database};") \
            .option("query", query) \
            .option("user", user) \
            .option("password", password) \
            .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
            .load()
#     data = data.filter(col(scheme) == 'TF')
    
    data = data.cache()
    
#     data = data.filter((col(scheme)=='IC'))
    # latest_data = latest_data.filter((col(scheme) == 'OV') & (col(plan) == 'RG'))
    # data = data.filter((col(scheme) == 'OV') & (col(plan) == 'RG'))
    # print (data.count())
    # debug_df(data)
    
    # calculate all the steps as in initialization
    # some preprocessings in the data, additional trimmings etc
    data = data.withColumn(transaction_status, upper(trim(col(transaction_status))))
    data = data.withColumn(purred, upper(trim(col(purred))))
    data = data.withColumn(folio, upper(trim(col(folio))))
    data = data.withColumn(scheme, upper(trim(col(scheme))))
    data = data.withColumn(plan, upper(trim(col(plan))))
    data = data.withColumn(transaction_type, upper(trim(col(transaction_type))))
    
    
    day_count = data.count()
#     data = data.filter((col(scheme).isin('IC','HC')))
    

    # cast the date column into dates, as we are concerned only with dates now
    data = data.withColumn(date_column, col(date_column).cast('date'))
    
   
    # filter the date till the batch_close_date (inclusive)
    # data = data.filter(col(date_column) <= date_str)
    
        
    # filter the data according to rules
    data = data.filter((col(date_column).isNotNull()) )
    # data = data.filter((col(date_column) != '') ) # this will not work for few types
    data = data.filter( ~trim(col(date_column)).cast("string").eqNullSafe(''))
    
    # do must be rules
    data = data.filter( (trim(upper((col(transaction_status))))) == "Y")
    data = data.filter( (trim(upper(col(purred))) == "P") | (trim(upper(col(purred))) == "R") )
    data = data.withColumn(transaction_type, upper(trim(col(transaction_type))))
    
    # our configurations
    data = data.withColumn(batch_close_date, data[date_column])
    

    if 'B' in groupby_level:
    #     print (data.count())
        broker = 'broker_code'
    #     data.groupby(folio, scheme, plan).agg(count(broker_column), countDistinct(broker_column)).show()
        data = data.fillna({broker_column:'EMPTY_BROKER'})

        # folios having single broker code will remain same
        multi_broker_folio = data.groupby(folio, scheme, plan).agg(countDistinct(broker_column)).filter(col(f'count({broker_column})') > 1)

        single_broker_folio_data = data.join(multi_broker_folio.select(folio, scheme, plan), on=[folio, scheme, plan], how='left_anti')
        single_broker_data = single_broker_folio_data.withColumn(broker, col(broker_column))

        multi_broker_folio_data = data.join(multi_broker_folio.select(folio, scheme, plan), on=[folio, scheme, plan], how='left_semi')
        # bring in correct broker codes
        # for purchase p they will be same
        data_p = multi_broker_folio_data.filter(col(purred) == 'P')
        data_p = data_p.withColumn(broker, col(broker_column))

        # for redemptions 
        data_r = multi_broker_folio_data.filter(col(purred) == 'R')
        # bring in the redemptions 
        query = f"SELECT * from {trans_table}"
        trans_data  = spark.read.format("jdbc") \
                .option("url", f"jdbc:sqlserver://{server}:{port};databaseName={database};") \
                .option("query", query) \
                .option("user", user) \
                .option("password", password) \
                .option("driver", "com.microsoft.sqlserver.jdbc.SQLServerDriver") \
                .load()

        # get the redemptions
        trans_data = trans_data.join(data_r, on=[scheme, plan, folio, transaction_no], how='left_semi')

        grouped = data_p.groupby([transaction_no, broker_column]).count()
        trans_data = trans_data.drop(date_column)


        trans_data = trans_data.select(scheme, plan, folio, transaction_no, purchase_transaction_no).join(data_r,
                                     on=[scheme, plan, folio, transaction_no]).drop(transaction_no)

        trans_data = trans_data.join(data_p.select(transaction_no, broker), trans_data[purchase_transaction_no] == data_p[transaction_no], how='left').drop(purchase_transaction_no)

        data = single_broker_data.union(data_p.select(single_broker_data.columns)).union(trans_data.select(single_broker_data.columns))
    
    
    
#     print ('pur, redem')
    # bring in purchase and redemption units
    data = data.withColumn(db_units, when((col(purred) == "P"), col(purchase_units)).otherwise(0))
    data = data.withColumn(cr_units, when((col(purred) == "R"), col(redemption_units)).otherwise(0))
#     data = data.withColumn(db_units, populate_db(col(purred), col(purchase_units)))
#     data = data.withColumn(cr_units, populate_cr(col(purred), col(redemption_units)))

    
    
#     print ('1')
    # get the group by and window partitions based on partitions
    group_by_cols = []
    window_partition = []
    if groupby_level == 'SP':
        window_partition = [scheme, plan]
        group_by_cols = [scheme, plan, batch_close_date]
        # scheme_plan wise we might need to filter out some transaction types
        ignored_tr_types = ['CNI', 'CNO', 'TRMI', 
                            'TRMO', 'TRFI', 'TRFO', 'PLDO',
                            'UPLO', 'DMT', 'RMT', 'CNIR', 'CNOR', 'TRMIR', 'TRMOR',
                            'TRFIR', 'TRFOR', 'PLDOR', 'UPLOR', 'DMTR', 'RMTR']
        data = data.filter( ~(col(transaction_type).isin(ignored_tr_types)) )
    elif groupby_level == 'SPT':
        window_partition = [scheme, plan, transaction_type]
        group_by_cols  = [scheme, plan, transaction_type, batch_close_date]
    elif groupby_level == 'SPF':
        window_partition = [scheme, plan, folio]
        group_by_cols  = [scheme, plan, folio, batch_close_date]
    elif groupby_level == 'SPFB':
            window_partition = [scheme, plan, folio, broker]
            group_by_cols  = [scheme, plan, folio, broker, batch_close_date]
    elif groupby_level == 'SPFT':
        window_partition = [scheme, plan, folio, transaction_type]
        group_by_cols = [scheme, plan, folio, transaction_type, batch_close_date]
    elif groupby_level == 'SPFTTer':
        data = data.fillna({ter_flag:'EMPTY_TER'})
        window_partition = [scheme, plan, folio, transaction_type, ter_flag]
        group_by_cols = [scheme, plan, folio, transaction_type, ter_flag, batch_close_date]
    elif groupby_level == 'SPFTBTer':
        window_partition = [scheme, plan, folio, transaction_type, ter_flag, broker]
        group_by_cols = [scheme, plan, folio, transaction_type, ter_flag, broker, batch_close_date]
    elif groupby_level == 'SPFTB':
        window_partition = [scheme, plan, folio, transaction_type, broker]
        group_by_cols = [scheme, plan, folio, transaction_type, broker, batch_close_date]
   
    
    
    # roll up the data
    rolledup_data = data.groupBy(group_by_cols)
    rolledup_data = rolledup_data.agg({db_units:'sum', cr_units:'sum'})
        
    rolledup_data = rolledup_data.withColumnRenamed(f"sum({db_units})", day_purchase_units).withColumnRenamed(f"sum({cr_units})", day_redemption_units)

    # inflow outflow units
    
    
    latest_data = latest_data.drop(day_purchase_units, day_redemption_units, batch_close_date, balance_units)
    latest_data = latest_data.withColumnRenamed(balance_purchase_units, day_purchase_units)
    latest_data = latest_data.withColumnRenamed(balance_redemption_units, day_redemption_units)
    latest_data = latest_data.withColumnRenamed(calculated_date, batch_close_date)
    
    # debug_df(rolledup_data)
    # debug_df(latest_data)
    combined_data = latest_data.union(rolledup_data.select(latest_data.columns))
    combined_data = combined_data.cache()
    combined_count = combined_data.count()
    # debug_df(rolledup_data)
    # debug_df(latest_data)
    # debug_df(combined_data)
    combined_data = combined_data.withColumn(balance_purchase_units, sum(col(day_purchase_units)).over(Window.partitionBy(window_partition).orderBy(batch_close_date).rowsBetween(-sys.maxsize, 0)))
    combined_data = combined_data.withColumn(balance_redemption_units, sum(col(day_redemption_units)).over(Window.partitionBy(window_partition).orderBy(batch_close_date).rowsBetween(-sys.maxsize, 0)))
    combined_data = combined_data.withColumn(balance_units, (col(balance_purchase_units) - col(balance_redemption_units)))
    # debug_df(combined_data)
    
    
    # store the latest day data again
    # get the latest data
    combined_data = combined_data.filter(col(batch_close_date) <= date_str)
    w = Window.partitionBy(window_partition).orderBy(col(batch_close_date).desc())
    combined_data = combined_data.withColumn("rrn", row_number().over(w)).where(col("rrn") == 1).drop("rrn")
    
    
    # maintained the calculated date (the latest data upto the calculated date)
    combined_data = combined_data.withColumn(calculated_date, lit(date_str).cast('date'))
    
    # debug_df(combined_data)
    # store in parquet file for optimization of space and only one file and multi partitions
    # but write now store in csv and maintain date wise and colesce one
    combined_data = combined_data.cache()
    # combined_data.coalesce(1).write.csv(f"{table}_latest/data_{groupby_level}_{date_str}.csv",header=True, mode='overwrite')
    combined_data.write.parquet(f"{table}_latest/data_{groupby_level}_{date_str}.parquet", mode='overwrite')
    
    
    # join the nav, scheme_master data
    nav_data = read_df(nav_table, '*', database)
    nav_data = nav_data.withColumnRenamed(fn_scheme, scheme)
    nav_data = nav_data.withColumnRenamed(fn_plan, plan)
    nav_data = nav_data.withColumn(fn_fromdt, col(fn_fromdt).cast('date'))

    scheme_master = read_df(scheme_table, [scheme_code, plan_code, nature, category, subcategory, newmcrid], database)
    scheme_master = scheme_master.withColumnRenamed(scheme_code, scheme)
    scheme_master = scheme_master.withColumnRenamed(plan_code, plan)

    scheme_master_ = scheme_master.dropDuplicates([scheme, plan])
    nav_scheme = nav_data.join(scheme_master_, on=[scheme, plan], how='left')
    nav_data = nav_scheme
    
    # debug_df(nav_data)
    
    # calculate the aum 
    combined_data = combined_data.withColumn(effective_nav, date_sub(col(calculated_date), 1))
    combined_data = combined_data.withColumn(today_pu, when((col(calculated_date) == col(batch_close_date)), col(day_purchase_units)).otherwise(0))
    combined_data = combined_data.withColumn(today_ru, when((col(calculated_date) == col(batch_close_date)), col(day_redemption_units)).otherwise(0))
    
    # inflow outflow addition
    inflow_db_condition = col(transaction_type).isin(inflow_db_trtypes)
    inflow_cr_condition = col(transaction_type).isin(inflow_cr_trtypes)
    combined_data = combined_data.withColumn(inflow_db_units, when(inflow_db_condition, col(today_pu)).otherwise(0))
    combined_data = combined_data.withColumn(inflow_cr_units, when(inflow_cr_condition, col(today_ru)).otherwise(0))
    
    outflow_db_condition = col(transaction_type).isin(outflow_db_trtypes)
    outflow_cr_condition = col(transaction_type).isin(outflow_cr_trtypes)
    combined_data = combined_data.withColumn(outflow_db_units, when(outflow_db_condition, col(today_pu)).otherwise(0))
    combined_data = combined_data.withColumn(outflow_cr_units, when(outflow_cr_condition, col(today_ru)).otherwise(0))
    
    combined_data = combined_data.withColumn(inflow_units, col(inflow_db_units) - col(inflow_cr_units))
    combined_data = combined_data.withColumn(outflow_units, col(outflow_cr_units) - col(outflow_db_units))
    
    # debug_df(combined_data)
    
    nav_filteredFT = nav_data.filter(col(fn_fromdt) < date_str)
    navw = Window.partitionBy([scheme, plan]).orderBy(col(fn_fromdt).desc())
    nav_populate = nav_filteredFT.withColumn("rrn", row_number().over(navw)).where(col("rrn") == 1).drop("rrn")
    nav_populate = nav_populate.withColumn(calculated_date, lit(date_str))
    nav_populate = nav_populate.select([scheme, plan, fn_fromdt, fn_nav, calculated_date, category, subcategory, nature, newmcrid])
    
#     debug_df(nav_populate)
    
    joined = combined_data.join(nav_populate, on=[scheme, plan, calculated_date], how='left')
    joined = joined.withColumn(aum, col(fn_nav) * col(balance_units))
    joined = joined.withColumn(inflow, col(fn_nav) * col(inflow_units))
    joined = joined.withColumn(outflow, col(fn_nav) * col(outflow_units))
    
#     aum_dummy = f'{aum}_d'
#     final_joined = joined.withColumn(aum_dummy, col(aum))
#     final_joined = final_joined.fillna({aum_dummy: 0})
    
#     # moving average logic
#     print (day_num)
#     if day_num == 1:
#         final_joined = final_joined.withColumn(f'pre_{aum_dummy}', lit(0))
#         final_joined = final_joined.withColumn(aaum, col(aum_dummy))
#     elif day_num == 2:
#         final_joined = final_joined.withColumn(f'pre_{aum_dummy}', lit(0))
#         final_joined = final_joined.withColumn(aaum, col(aum_dummy))
#     else:
#         previous_day_aum = spark.read.parquet(f"{table}_dialy/aaum_data_{groupby_level}_{previous_day_str}.parquet")
#         previous_day_aum.withColumnRenamed(aum_dummy, f'pre_{aum_dummy}')
#         final_joined = final_joined.join(previous_day_aum.select(window_partition + [f'pre_{aum_dummy}']), on=window_partition, how='left')
#         final_joined = final_joined.fillna({f'pre_{aum_dummy}': 0})
#         final_joined = final_joined.withColumn(aaum, (col(aum_dummy) + col(f'pre_{aum_dummy}') / day_num))
        
    
#     debug_df(joined)
    #joined = joined.cache()
    # store the data in the files
#     joined.coalesce(1).write.csv(f"{table}_dialy/data_{groupby_level}_{date_str}.csv",header=True, mode='overwrite')
    joined.write.parquet(f"{table}_dialy/data_{groupby_level}_{date_str}.parquet", mode='overwrite')
    
#     final_joined = final_joined.cache()
#     final_joined.coalesce(1).write.csv(f"{table}_dialy/aaum_data_{groupby_level}_{date_str}.csv",header=True, mode='overwrite')
#     final_joined.coalesce(1).write.parquet(f"{table}_dialy/aaum_data_{groupby_level}_{date_str}.parquet", mode='overwrite')
    
    
    # upload the data if needed
    # Done
    print (f'dialy file on date {date_str} generated')
    return day_count, combined_count

# dialy_job('2020-05-01', groupby_level='SPFT', table='Trans_116', direct_db='BankRecon', nav_table='fund_navreg_AXA_29072020',
#         scheme_table='Fund_Master_AXA_29072020', scheme_code='fm_scheme', plan_code='fm_plan', nature='fm_nature',
#          category = 'fm_SebiSchemeCategory',subcategory = 'fm_SebiSchemeSubCategory'
#         )
# dialy_job('2020-06-01', groupby_level='SP', table='Trans_128', direct_db='BankRecon', nav_table='fund_navreg_axismf',
#         scheme_table='fund_master_axismf', scheme_code='fm_scheme', plan_code='fm_plan', nature='fm_nature',
#          category = 'fm_SebiSchemeCategory',subcategory = 'fm_SebiSchemeSubCategory'
#         )
# dialy_job('2020-06-01', groupby_level='SP', table='Trans_120', direct_db='BankRecon', nav_table='fund_navreg_invesco',
#         scheme_table='fund_master_INVESCO', scheme_code='fm_scheme', plan_code='fm_plan', nature='fm_nature',
#          category = 'fm_SebiSchemeCategory',subcategory = 'fm_SebiSchemeSubCategory'
#         )
# dialy_job('2020-06-01', groupby_level='SP', table='m_Trans_116', direct_db='BankRecon', nav_table='fund_navreg_AXA_29072020',
#         scheme_table='Fund_Master_AXA_29072020', scheme_code='fm_scheme', plan_code='fm_plan', nature='fm_nature',
#          category = 'fm_SebiSchemeCategory',subcategory = 'fm_SebiSchemeSubCategory', newmcrid='fm_NewMCRId'
#         )

# dialy_job('2020-06-01', groupby_level='SP', table='m_Trans_117', direct_db='BankRecon', nav_table='fund_navreg_MIRAE',
#         scheme_table='fund_master_MIRAE', scheme_code='fm_scheme', plan_code='fm_plan', nature='fm_nature',
#          category = 'fm_SebiSchemeCategory',subcategory = 'fm_SebiSchemeSubCategory', newmcrid='fm_NewMCRId'
#         )
# print ('done')
# import time
# for i,ele in enumerate(list(daterange('2020-05-01', '2020-06-02'))):
#     s = time.time()
#     dialy_job(ele, groupby_level='SPFT', table='m_Trans_117', direct_db='BankRecon', nav_table='fund_navreg_MIRAE',
#         scheme_table='fund_master_MIRAE', scheme_code='fm_scheme', plan_code='fm_plan', nature='fm_nature',
#          category = 'fm_SebiSchemeCategory',subcategory = 'fm_SebiSchemeSubCategory', newmcrid='fm_NewMCRId'
#         )
#     generate_mcr_report(table=f'm_Trans_{code}', groupby_level='SP', start_date = '2020-05-02', end_date = '2020-06-02')

#     print (i, ele, time.time() - s)

# day_records, combined_records = dialy_job('2020-04-02', groupby_level=groupby_level, table=table, direct_db='BankRecon', nav_table=f'fund_navreg_{name}',
#             scheme_table=f'fund_master_{name}', scheme_code='fm_scheme', plan_code='fm_plan', nature='fm_nature',
#              category = 'fm_SebiSchemeCategory',subcategory = 'fm_SebiSchemeSubCategory', newmcrid='fm_NewMCRId'
#             )

CPU times: user 12 µs, sys: 8 µs, total: 20 µs
Wall time: 26.2 µs


#### run script

In [11]:
#
# Table code -> name for the data sets processed by this run.  The code picks
# the transaction tables (Trans_<code> and m_Trans_<code>); the name, together
# with the code, picks the NAV and scheme-master tables handed to dialy_job.
#
# This cell used to rebind table_codes many times in a row, so only the last
# binding ever took effect.  The widest of those earlier selections is kept
# here for reference.
# NOTE(review): name spellings differed between the earlier selections --
# verify them against the real table names before reusing any entry.
#   "116": "AXA", "117": "MIRAE", "107": "BOB", "120": "INVESCO",
#   "RMF": "Reliance", "118": "Edelweiss", "135": "IDBIMF", "125": "IBMF",
#   "128": "AXISMF", "178": "BNPMF", "152": "ITI", "105": "JMMF",
#   "103": "PMF", "166": "Quant", "130": "PeerlessMF", "104": "TAURUS",
#   "108": "UTI", "123": "Quantum", "127": "MOTILAL", "102": "LIC",
#   "176": "SundaramMF", "101": "canrobeco", "129": "DLFPramerica"
table_codes = {"116": "AXA"}

# NOTE(review): looks like a mistyped leftover of table_codes; nothing in this
# cell reads it.  Kept only in case another cell does.
table_codess = {130: "peerless", 120: "INVESCO"}

# Run settings shared by every table in table_codes.
groupby_level = 'SPFTTer'
init_date = '2020-03-31'
start_date = '2020-04-01'
end_date = '2020-09-02'
# mcr_month_date = '2020-05-01'

# exception taurus
for code, name in table_codes.items():
    # One try per table so that a failure in one table does not stop the rest.
    try:
        # Best effort: clear Spark's cache before starting a new table.  A
        # failure here (e.g. no `spark` session defined yet) is reported and
        # the run continues.  An earlier version restarted the whole
        # SparkContext at this point; only the cache clear is active.
        try:
            spark.catalog.clearCache()
        except Exception as cache_error:
            print(f'could not clear spark cache: {cache_error}')

        print(code, name)
        start = time.time()
        table = f'm_Trans_{code}'
        trans_table = f'Trans_{code}'

        records = initialize(init_date, table=table, direct_db='BankRecon',
                             groupby_level=groupby_level, trans_table=trans_table)
        # Measure once, right after initialize() returns, so the printed and
        # the stored figure agree and neither includes metric-write overhead.
        init_elapsed = time.time() - start
        save_metric(init_date, 'records_processed', records, name, groupby_level, table)

        print(f'initialization time {init_elapsed}')
        save_metric(init_date, 'intialization_time', init_elapsed, name, groupby_level, table)
        job_start = time.time()

        for i, ele in enumerate(daterange(start_date, end_date)):
            s = time.time()
            day_records, combined_records = dialy_job(
                ele,
                groupby_level=groupby_level,
                table=table,
                direct_db='BankRecon',
                nav_table=f'fund_navreg_{name}',
                scheme_table=f'fund_master_{name}_{code}',
                scheme_code='fm_scheme',
                plan_code='fm_plan',
                nature='fm_nature',
                category='fm_SebiSchemeCategory',
                subcategory='fm_SebiSchemeSubCategory',
                newmcrid='fm_NewMCRId',
                trans_table=trans_table,
            )
            # Same single-measurement rule as for the initialization time.
            day_elapsed = time.time() - s
            save_metric(ele, 'day_records', day_records, name, groupby_level, table)
            save_metric(ele, 'combined_records', combined_records, name, groupby_level, table)
            save_metric(ele, 'dialy_job_time', day_elapsed, name, groupby_level, table)
            print("    ", i, ele, day_elapsed)

        # MCR report generation and its metrics are switched off for this run:
        # generate_mcr_report(table=table, groupby_level=groupby_level, start_date = '2020-06-02', end_date = '2020-07-02')
        # save_metric(mcr_month_date, 'mcr_generate_time', <elapsed>, name, groupby_level)
        # save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print(f'job time is {time.time() - job_start}')
        print(f'overall time is {time.time() - start}')

        print()
    except Exception as e:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print() (that emitted a stray "None" line).
        traceback.print_exc()
        print(str(e))

116 AXA
inital file on date 2020-03-31 written
initialization time 31.5660400390625
dialy file on date 2020-04-01 generated
     0 2020-04-01 7.679014205932617
dialy file on date 2020-04-02 generated
     1 2020-04-02 7.41456937789917
dialy file on date 2020-04-03 generated
     2 2020-04-03 7.324365139007568
dialy file on date 2020-04-04 generated
     3 2020-04-04 7.335397243499756
dialy file on date 2020-04-05 generated
     4 2020-04-05 6.9578845500946045
dialy file on date 2020-04-06 generated
     5 2020-04-06 7.276837587356567
dialy file on date 2020-04-07 generated
     6 2020-04-07 6.981618165969849
dialy file on date 2020-04-08 generated
     7 2020-04-08 7.206529855728149
dialy file on date 2020-04-09 generated
     8 2020-04-09 6.952902317047119
dialy file on date 2020-04-10 generated
     9 2020-04-10 6.943164825439453
dialy file on date 2020-04-11 generated
     10 2020-04-11 7.116787433624268
dialy file on date 2020-04-12 generated
     11 2020-04-12 7.057420969009399
di

dialy file on date 2020-07-16 generated
     106 2020-07-16 9.930031299591064
dialy file on date 2020-07-17 generated
     107 2020-07-17 10.234426021575928
dialy file on date 2020-07-18 generated
     108 2020-07-18 8.966026067733765
dialy file on date 2020-07-19 generated
     109 2020-07-19 8.781954288482666
dialy file on date 2020-07-20 generated
     110 2020-07-20 11.008086681365967
dialy file on date 2020-07-21 generated
     111 2020-07-21 8.610520839691162
dialy file on date 2020-07-22 generated
     112 2020-07-22 8.047937393188477
dialy file on date 2020-07-23 generated
     113 2020-07-23 7.453668594360352
dialy file on date 2020-07-24 generated
     114 2020-07-24 7.515707492828369
dialy file on date 2020-07-25 generated
     115 2020-07-25 7.173721790313721
dialy file on date 2020-07-26 generated
     116 2020-07-26 7.5057373046875
dialy file on date 2020-07-27 generated
     117 2020-07-27 8.073629140853882
dialy file on date 2020-07-28 generated
     118 2020-07-28 7.99

In [12]:
#
# Table code -> name for the data sets processed by this run.  The code picks
# the transaction tables (Trans_<code> and m_Trans_<code>); the name, together
# with the code, picks the NAV and scheme-master tables handed to dialy_job.
#
# This cell used to rebind table_codes many times in a row (the final mapping
# even twice, verbatim), so only the last binding ever took effect.  Entries
# that appeared in the earlier selections but are not part of this run:
#   "116": "AXA", "120": "INVESCO", "125": "IBMF"
# NOTE(review): name spellings differed between the earlier selections --
# verify them against the real table names before changing any entry.
table_codes = {
    "118": "edelwwise",
    "123": "Quantum",
    "166": "Quant",
    "107": "BOB",
    "178": "BNPMF",
    "101": "canrobeco",
    "102": "LIC",
    "103": "PMF",
    "104": "TARUS",
    "105": "JMMF",
    "117": "MIRAE",
    "127": "MOTILAL",
    "128": "axismf",
    "129": "DLFPramerica",
    "130": "peerless",
    "135": 'IDBIMF',
    "152": "ITI",
    "176": "sundaram",
    "108": "UTI",
    "RMF": "Reliance",
}

# NOTE(review): looks like a mistyped leftover of table_codes; nothing in this
# cell reads it.  Kept only in case another cell does.
table_codess = {130: "peerless", 120: "INVESCO"}

# Run settings shared by every table in table_codes.
groupby_level = 'SPFTTer'
init_date = '2020-03-31'
start_date = '2020-04-01'
end_date = '2020-09-02'
# mcr_month_date = '2020-05-01'

# exception taurus
for code, name in table_codes.items():
    # One try per table so that a failure in one table does not stop the rest.
    try:
        # Best effort: clear Spark's cache before starting a new table.  A
        # failure here (e.g. no `spark` session defined yet) is reported and
        # the run continues.  An earlier version restarted the whole
        # SparkContext at this point; only the cache clear is active.
        try:
            spark.catalog.clearCache()
        except Exception as cache_error:
            print(f'could not clear spark cache: {cache_error}')

        print(code, name)
        start = time.time()
        table = f'm_Trans_{code}'
        trans_table = f'Trans_{code}'

        records = initialize(init_date, table=table, direct_db='BankRecon',
                             groupby_level=groupby_level, trans_table=trans_table)
        # Measure once, right after initialize() returns, so the printed and
        # the stored figure agree and neither includes metric-write overhead.
        init_elapsed = time.time() - start
        save_metric(init_date, 'records_processed', records, name, groupby_level, table)

        print(f'initialization time {init_elapsed}')
        save_metric(init_date, 'intialization_time', init_elapsed, name, groupby_level, table)
        job_start = time.time()

        for i, ele in enumerate(daterange(start_date, end_date)):
            s = time.time()
            day_records, combined_records = dialy_job(
                ele,
                groupby_level=groupby_level,
                table=table,
                direct_db='BankRecon',
                nav_table=f'fund_navreg_{name}',
                scheme_table=f'fund_master_{name}_{code}',
                scheme_code='fm_scheme',
                plan_code='fm_plan',
                nature='fm_nature',
                category='fm_SebiSchemeCategory',
                subcategory='fm_SebiSchemeSubCategory',
                newmcrid='fm_NewMCRId',
                trans_table=trans_table,
            )
            # Same single-measurement rule as for the initialization time.
            day_elapsed = time.time() - s
            save_metric(ele, 'day_records', day_records, name, groupby_level, table)
            save_metric(ele, 'combined_records', combined_records, name, groupby_level, table)
            save_metric(ele, 'dialy_job_time', day_elapsed, name, groupby_level, table)
            print("    ", i, ele, day_elapsed)

        # MCR report generation and its metrics are switched off for this run:
        # generate_mcr_report(table=table, groupby_level=groupby_level, start_date = '2020-06-02', end_date = '2020-07-02')
        # save_metric(mcr_month_date, 'mcr_generate_time', <elapsed>, name, groupby_level)
        # save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print(f'job time is {time.time() - job_start}')
        print(f'overall time is {time.time() - start}')

        print()
    except Exception as e:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print() (that emitted a stray "None" line).
        traceback.print_exc()
        print(str(e))

118 edelwwise
inital file on date 2020-03-31 written
initialization time 58.57779145240784
dialy file on date 2020-04-01 generated
     0 2020-04-01 16.11402726173401
dialy file on date 2020-04-02 generated
     1 2020-04-02 16.577534198760986
dialy file on date 2020-04-03 generated
     2 2020-04-03 17.170456409454346
dialy file on date 2020-04-04 generated
     3 2020-04-04 16.486101150512695
dialy file on date 2020-04-05 generated
     4 2020-04-05 15.979234218597412
dialy file on date 2020-04-06 generated
     5 2020-04-06 15.691267013549805
dialy file on date 2020-04-07 generated
     6 2020-04-07 16.60265803337097
dialy file on date 2020-04-08 generated
     7 2020-04-08 16.073237657546997
dialy file on date 2020-04-09 generated
     8 2020-04-09 16.292750597000122
dialy file on date 2020-04-10 generated
     9 2020-04-10 16.477232456207275
dialy file on date 2020-04-11 generated
     10 2020-04-11 17.039254426956177
dialy file on date 2020-04-12 generated
     11 2020-04-12 16.3

dialy file on date 2020-07-15 generated
     105 2020-07-15 17.454596042633057
dialy file on date 2020-07-16 generated
     106 2020-07-16 16.670833587646484
dialy file on date 2020-07-17 generated
     107 2020-07-17 17.49170422554016
dialy file on date 2020-07-18 generated
     108 2020-07-18 17.002193212509155
dialy file on date 2020-07-19 generated
     109 2020-07-19 17.047162771224976
dialy file on date 2020-07-20 generated
     110 2020-07-20 17.11916756629944
dialy file on date 2020-07-21 generated
     111 2020-07-21 17.60499143600464
dialy file on date 2020-07-22 generated
     112 2020-07-22 19.041799783706665
dialy file on date 2020-07-23 generated
     113 2020-07-23 17.39967918395996
dialy file on date 2020-07-24 generated
     114 2020-07-24 18.197808742523193
dialy file on date 2020-07-25 generated
     115 2020-07-25 17.569475889205933
dialy file on date 2020-07-26 generated
     116 2020-07-26 17.462523937225342
dialy file on date 2020-07-27 generated
     117 2020-07

dialy file on date 2020-05-26 generated
     55 2020-05-26 5.932774066925049
dialy file on date 2020-05-27 generated
     56 2020-05-27 5.927213907241821
dialy file on date 2020-05-28 generated
     57 2020-05-28 6.356530666351318
dialy file on date 2020-05-29 generated
     58 2020-05-29 6.17162299156189
dialy file on date 2020-05-30 generated
     59 2020-05-30 6.159275054931641
dialy file on date 2020-05-31 generated
     60 2020-05-31 6.246100664138794
dialy file on date 2020-06-01 generated
     61 2020-06-01 6.237509727478027
dialy file on date 2020-06-02 generated
     62 2020-06-02 6.033574819564819
dialy file on date 2020-06-03 generated
     63 2020-06-03 6.160930156707764
dialy file on date 2020-06-04 generated
     64 2020-06-04 6.119116544723511
dialy file on date 2020-06-05 generated
     65 2020-06-05 6.140955924987793
dialy file on date 2020-06-06 generated
     66 2020-06-06 6.171853303909302
dialy file on date 2020-06-07 generated
     67 2020-06-07 6.209280252456665


dialy file on date 2020-04-06 generated
     5 2020-04-06 3.4036076068878174
dialy file on date 2020-04-07 generated
     6 2020-04-07 3.7725861072540283
dialy file on date 2020-04-08 generated
     7 2020-04-08 3.7693400382995605
dialy file on date 2020-04-09 generated
     8 2020-04-09 3.6616857051849365
dialy file on date 2020-04-10 generated
     9 2020-04-10 3.5405290126800537
dialy file on date 2020-04-11 generated
     10 2020-04-11 3.586204767227173
dialy file on date 2020-04-12 generated
     11 2020-04-12 3.452443838119507
dialy file on date 2020-04-13 generated
     12 2020-04-13 3.789506673812866
dialy file on date 2020-04-14 generated
     13 2020-04-14 3.4043567180633545
dialy file on date 2020-04-15 generated
     14 2020-04-15 3.5549747943878174
dialy file on date 2020-04-16 generated
     15 2020-04-16 3.8489298820495605
dialy file on date 2020-04-17 generated
     16 2020-04-17 3.7856764793395996
dialy file on date 2020-04-18 generated
     17 2020-04-18 3.46114373207

dialy file on date 2020-07-21 generated
     111 2020-07-21 3.9129865169525146
dialy file on date 2020-07-22 generated
     112 2020-07-22 3.857794761657715
dialy file on date 2020-07-23 generated
     113 2020-07-23 3.6416826248168945
dialy file on date 2020-07-24 generated
     114 2020-07-24 3.756145477294922
dialy file on date 2020-07-25 generated
     115 2020-07-25 3.698029041290283
dialy file on date 2020-07-26 generated
     116 2020-07-26 3.6895675659179688
dialy file on date 2020-07-27 generated
     117 2020-07-27 3.7637572288513184
dialy file on date 2020-07-28 generated
     118 2020-07-28 3.6559014320373535
dialy file on date 2020-07-29 generated
     119 2020-07-29 3.6291112899780273
dialy file on date 2020-07-30 generated
     120 2020-07-30 3.945053815841675
dialy file on date 2020-07-31 generated
     121 2020-07-31 3.612327814102173
dialy file on date 2020-08-01 generated
     122 2020-08-01 3.8151345252990723
dialy file on date 2020-08-02 generated
     123 2020-08-

dialy file on date 2020-06-01 generated
     61 2020-06-01 24.177584648132324
dialy file on date 2020-06-02 generated
     62 2020-06-02 24.724754810333252
dialy file on date 2020-06-03 generated
     63 2020-06-03 24.656277179718018
dialy file on date 2020-06-04 generated
     64 2020-06-04 24.401984453201294
dialy file on date 2020-06-05 generated
     65 2020-06-05 24.527921199798584
dialy file on date 2020-06-06 generated
     66 2020-06-06 23.45023775100708
dialy file on date 2020-06-07 generated
     67 2020-06-07 23.494258403778076
dialy file on date 2020-06-08 generated
     68 2020-06-08 24.110158920288086
dialy file on date 2020-06-09 generated
     69 2020-06-09 23.719205141067505
dialy file on date 2020-06-10 generated
     70 2020-06-10 23.69861602783203
dialy file on date 2020-06-11 generated
     71 2020-06-11 24.501177310943604
dialy file on date 2020-06-12 generated
     72 2020-06-12 23.707650899887085
dialy file on date 2020-06-13 generated
     73 2020-06-13 23.0330

dialy file on date 2020-04-11 generated
     10 2020-04-11 14.766610383987427
dialy file on date 2020-04-12 generated
     11 2020-04-12 15.322805404663086
dialy file on date 2020-04-13 generated
     12 2020-04-13 14.944795846939087
dialy file on date 2020-04-14 generated
     13 2020-04-14 15.737800359725952
dialy file on date 2020-04-15 generated
     14 2020-04-15 15.146265983581543
dialy file on date 2020-04-16 generated
     15 2020-04-16 15.49724268913269
dialy file on date 2020-04-17 generated
     16 2020-04-17 14.934743404388428
dialy file on date 2020-04-18 generated
     17 2020-04-18 15.194290399551392
dialy file on date 2020-04-19 generated
     18 2020-04-19 14.811918258666992
dialy file on date 2020-04-20 generated
     19 2020-04-20 15.662992000579834
dialy file on date 2020-04-21 generated
     20 2020-04-21 14.654221057891846
dialy file on date 2020-04-22 generated
     21 2020-04-22 14.941006183624268
dialy file on date 2020-04-23 generated
     22 2020-04-23 15.044

dialy file on date 2020-07-26 generated
     116 2020-07-26 16.254043340682983
dialy file on date 2020-07-27 generated
     117 2020-07-27 16.91286015510559
dialy file on date 2020-07-28 generated
     118 2020-07-28 17.44371509552002
dialy file on date 2020-07-29 generated
     119 2020-07-29 17.582271814346313
dialy file on date 2020-07-30 generated
     120 2020-07-30 16.642162084579468
dialy file on date 2020-07-31 generated
     121 2020-07-31 17.483134746551514
dialy file on date 2020-08-01 generated
     122 2020-08-01 16.894701719284058
dialy file on date 2020-08-02 generated
     123 2020-08-02 16.786571502685547
dialy file on date 2020-08-03 generated
     124 2020-08-03 16.85802698135376
dialy file on date 2020-08-04 generated
     125 2020-08-04 17.710792541503906
dialy file on date 2020-08-05 generated
     126 2020-08-05 17.82070302963257
dialy file on date 2020-08-06 generated
     127 2020-08-06 17.311411142349243
dialy file on date 2020-08-07 generated
     128 2020-08

dialy file on date 2020-06-06 generated
     66 2020-06-06 49.6543915271759
dialy file on date 2020-06-07 generated
     67 2020-06-07 48.39197492599487
dialy file on date 2020-06-08 generated
     68 2020-06-08 50.62093138694763
dialy file on date 2020-06-09 generated
     69 2020-06-09 53.77212119102478
dialy file on date 2020-06-10 generated
     70 2020-06-10 49.84801197052002
dialy file on date 2020-06-11 generated
     71 2020-06-11 50.47632575035095
dialy file on date 2020-06-12 generated
     72 2020-06-12 48.92024898529053
dialy file on date 2020-06-13 generated
     73 2020-06-13 48.55868983268738
dialy file on date 2020-06-14 generated
     74 2020-06-14 48.44344663619995
dialy file on date 2020-06-15 generated
     75 2020-06-15 60.140472412109375
dialy file on date 2020-06-16 generated
     76 2020-06-16 70.09696841239929
dialy file on date 2020-06-17 generated
     77 2020-06-17 55.933141231536865
dialy file on date 2020-06-18 generated
     78 2020-06-18 50.0357933044433

dialy file on date 2020-04-17 generated
     16 2020-04-17 36.130682706832886
dialy file on date 2020-04-18 generated
     17 2020-04-18 40.00591826438904
dialy file on date 2020-04-19 generated
     18 2020-04-19 41.50200295448303
dialy file on date 2020-04-20 generated
     19 2020-04-20 40.6082820892334
dialy file on date 2020-04-21 generated
     20 2020-04-21 40.383572578430176
dialy file on date 2020-04-22 generated
     21 2020-04-22 37.9630651473999
dialy file on date 2020-04-23 generated
     22 2020-04-23 36.742077112197876
dialy file on date 2020-04-24 generated
     23 2020-04-24 43.84343409538269
dialy file on date 2020-04-25 generated
     24 2020-04-25 37.10824537277222
dialy file on date 2020-04-26 generated
     25 2020-04-26 37.93812441825867
dialy file on date 2020-04-27 generated
     26 2020-04-27 38.07548713684082
dialy file on date 2020-04-28 generated
     27 2020-04-28 36.252429723739624
dialy file on date 2020-04-29 generated
     28 2020-04-29 36.986613273620

dialy file on date 2020-08-01 generated
     122 2020-08-01 40.89841651916504
dialy file on date 2020-08-02 generated
     123 2020-08-02 40.921995639801025
dialy file on date 2020-08-03 generated
     124 2020-08-03 39.14393877983093
dialy file on date 2020-08-04 generated
     125 2020-08-04 40.85584998130798
dialy file on date 2020-08-05 generated
     126 2020-08-05 40.39070391654968
dialy file on date 2020-08-06 generated
     127 2020-08-06 40.27091598510742
dialy file on date 2020-08-07 generated
     128 2020-08-07 40.2489972114563
dialy file on date 2020-08-08 generated
     129 2020-08-08 38.727835178375244
dialy file on date 2020-08-09 generated
     130 2020-08-09 38.76491618156433
dialy file on date 2020-08-10 generated
     131 2020-08-10 39.53615403175354
dialy file on date 2020-08-11 generated
     132 2020-08-11 39.528624296188354
dialy file on date 2020-08-12 generated
     133 2020-08-12 37.666497230529785
dialy file on date 2020-08-13 generated
     134 2020-08-13 3

dialy file on date 2020-06-12 generated
     72 2020-06-12 46.356116771698
dialy file on date 2020-06-13 generated
     73 2020-06-13 46.709291219711304
dialy file on date 2020-06-14 generated
     74 2020-06-14 47.34081506729126
dialy file on date 2020-06-15 generated
     75 2020-06-15 45.70147705078125
dialy file on date 2020-06-16 generated
     76 2020-06-16 50.97766852378845
dialy file on date 2020-06-17 generated
     77 2020-06-17 46.07951521873474
dialy file on date 2020-06-18 generated
     78 2020-06-18 45.90027666091919
dialy file on date 2020-06-19 generated
     79 2020-06-19 49.746397733688354
dialy file on date 2020-06-20 generated
     80 2020-06-20 48.58522844314575
dialy file on date 2020-06-21 generated
     81 2020-06-21 47.21811890602112
dialy file on date 2020-06-22 generated
     82 2020-06-22 48.4951434135437
dialy file on date 2020-06-23 generated
     83 2020-06-23 47.63024878501892
dialy file on date 2020-06-24 generated
     84 2020-06-24 49.49799728393555


dialy file on date 2020-04-23 generated
     22 2020-04-23 10.810330152511597
dialy file on date 2020-04-24 generated
     23 2020-04-24 11.053090572357178
dialy file on date 2020-04-25 generated
     24 2020-04-25 11.561141729354858
dialy file on date 2020-04-26 generated
     25 2020-04-26 12.07591700553894
dialy file on date 2020-04-27 generated
     26 2020-04-27 10.900059700012207
dialy file on date 2020-04-28 generated
     27 2020-04-28 11.147131204605103
dialy file on date 2020-04-29 generated
     28 2020-04-29 10.993346929550171
dialy file on date 2020-04-30 generated
     29 2020-04-30 10.96900486946106
dialy file on date 2020-05-01 generated
     30 2020-05-01 10.798906087875366
dialy file on date 2020-05-02 generated
     31 2020-05-02 10.888997077941895
dialy file on date 2020-05-03 generated
     32 2020-05-03 10.920975923538208
dialy file on date 2020-05-04 generated
     33 2020-05-04 11.042552471160889
dialy file on date 2020-05-05 generated
     34 2020-05-05 10.8365

dialy file on date 2020-08-06 generated
     127 2020-08-06 11.697989702224731
dialy file on date 2020-08-07 generated
     128 2020-08-07 11.298259258270264
dialy file on date 2020-08-08 generated
     129 2020-08-08 11.464832305908203
dialy file on date 2020-08-09 generated
     130 2020-08-09 11.08907437324524
dialy file on date 2020-08-10 generated
     131 2020-08-10 11.388930559158325
dialy file on date 2020-08-11 generated
     132 2020-08-11 11.388759851455688
dialy file on date 2020-08-12 generated
     133 2020-08-12 10.907841444015503
dialy file on date 2020-08-13 generated
     134 2020-08-13 11.08750605583191
dialy file on date 2020-08-14 generated
     135 2020-08-14 11.176470756530762
dialy file on date 2020-08-15 generated
     136 2020-08-15 11.572775602340698
dialy file on date 2020-08-16 generated
     137 2020-08-16 11.4170823097229
dialy file on date 2020-08-17 generated
     138 2020-08-17 11.284567594528198
dialy file on date 2020-08-18 generated
     139 2020-08

dialy file on date 2020-06-17 generated
     77 2020-06-17 20.298523426055908
dialy file on date 2020-06-18 generated
     78 2020-06-18 21.235716342926025
dialy file on date 2020-06-19 generated
     79 2020-06-19 22.783830404281616
dialy file on date 2020-06-20 generated
     80 2020-06-20 20.574815273284912
dialy file on date 2020-06-21 generated
     81 2020-06-21 21.10754704475403
dialy file on date 2020-06-22 generated
     82 2020-06-22 20.298103094100952
dialy file on date 2020-06-23 generated
     83 2020-06-23 20.655550479888916
dialy file on date 2020-06-24 generated
     84 2020-06-24 21.429747819900513
dialy file on date 2020-06-25 generated
     85 2020-06-25 21.86226749420166
dialy file on date 2020-06-26 generated
     86 2020-06-26 20.4074764251709
dialy file on date 2020-06-27 generated
     87 2020-06-27 20.16528558731079
dialy file on date 2020-06-28 generated
     88 2020-06-28 30.689152479171753
dialy file on date 2020-06-29 generated
     89 2020-06-29 24.6199469

dialy file on date 2020-04-28 generated
     27 2020-04-28 51.94450092315674
dialy file on date 2020-04-29 generated
     28 2020-04-29 51.161044120788574
dialy file on date 2020-04-30 generated
     29 2020-04-30 53.56552529335022
dialy file on date 2020-05-01 generated
     30 2020-05-01 53.09965109825134
dialy file on date 2020-05-02 generated
     31 2020-05-02 53.886170387268066
dialy file on date 2020-05-03 generated
     32 2020-05-03 54.44037485122681
dialy file on date 2020-05-04 generated
     33 2020-05-04 52.71370720863342
dialy file on date 2020-05-05 generated
     34 2020-05-05 56.7148072719574
dialy file on date 2020-05-06 generated
     35 2020-05-06 54.604223012924194
dialy file on date 2020-05-07 generated
     36 2020-05-07 55.50386452674866
dialy file on date 2020-05-08 generated
     37 2020-05-08 55.521838426589966
dialy file on date 2020-05-09 generated
     38 2020-05-09 55.234758377075195
dialy file on date 2020-05-10 generated
     39 2020-05-10 56.2783322334

dialy file on date 2020-08-12 generated
     133 2020-08-12 59.28875780105591
dialy file on date 2020-08-13 generated
     134 2020-08-13 58.36991477012634
dialy file on date 2020-08-14 generated
     135 2020-08-14 58.33308005332947
dialy file on date 2020-08-15 generated
     136 2020-08-15 57.93738627433777
dialy file on date 2020-08-16 generated
     137 2020-08-16 57.17828869819641
dialy file on date 2020-08-17 generated
     138 2020-08-17 60.9385507106781
dialy file on date 2020-08-18 generated
     139 2020-08-18 58.95348262786865
dialy file on date 2020-08-19 generated
     140 2020-08-19 57.74036002159119
dialy file on date 2020-08-20 generated
     141 2020-08-20 59.1501362323761
dialy file on date 2020-08-21 generated
     142 2020-08-21 59.97798442840576
dialy file on date 2020-08-22 generated
     143 2020-08-22 58.732592821121216
dialy file on date 2020-08-23 generated
     144 2020-08-23 57.956770181655884
dialy file on date 2020-08-24 generated
     145 2020-08-24 59.1

dialy file on date 2020-06-23 generated
     83 2020-06-23 36.162426233291626
dialy file on date 2020-06-24 generated
     84 2020-06-24 35.813870906829834
dialy file on date 2020-06-25 generated
     85 2020-06-25 35.48742389678955
dialy file on date 2020-06-26 generated
     86 2020-06-26 36.71299958229065
dialy file on date 2020-06-27 generated
     87 2020-06-27 35.635557889938354
dialy file on date 2020-06-28 generated
     88 2020-06-28 36.07337713241577
dialy file on date 2020-06-29 generated
     89 2020-06-29 35.6765673160553
dialy file on date 2020-06-30 generated
     90 2020-06-30 37.96899104118347
dialy file on date 2020-07-01 generated
     91 2020-07-01 38.11522555351257
dialy file on date 2020-07-02 generated
     92 2020-07-02 36.58059597015381
dialy file on date 2020-07-03 generated
     93 2020-07-03 36.07649755477905
dialy file on date 2020-07-04 generated
     94 2020-07-04 37.13877272605896
dialy file on date 2020-07-05 generated
     95 2020-07-05 36.532114982604

dialy file on date 2020-05-04 generated
     33 2020-05-04 134.0414969921112
dialy file on date 2020-05-05 generated
     34 2020-05-05 143.4903209209442
dialy file on date 2020-05-06 generated
     35 2020-05-06 140.58675384521484
dialy file on date 2020-05-07 generated
     36 2020-05-07 137.86493587493896
dialy file on date 2020-05-08 generated
     37 2020-05-08 138.8653917312622
dialy file on date 2020-05-09 generated
     38 2020-05-09 136.52324509620667
dialy file on date 2020-05-10 generated
     39 2020-05-10 136.81586718559265
dialy file on date 2020-05-11 generated
     40 2020-05-11 140.78224158287048
dialy file on date 2020-05-12 generated
     41 2020-05-12 146.45783948898315
dialy file on date 2020-05-13 generated
     42 2020-05-13 140.79679775238037
dialy file on date 2020-05-14 generated
     43 2020-05-14 137.8601839542389
dialy file on date 2020-05-15 generated
     44 2020-05-15 140.24291229248047
dialy file on date 2020-05-16 generated
     45 2020-05-16 138.87130

dialy file on date 2020-08-17 generated
     138 2020-08-17 152.2723090648651
dialy file on date 2020-08-18 generated
     139 2020-08-18 153.53703117370605
dialy file on date 2020-08-19 generated
     140 2020-08-19 155.931086063385
dialy file on date 2020-08-20 generated
     141 2020-08-20 153.91746139526367
dialy file on date 2020-08-21 generated
     142 2020-08-21 156.65658140182495
dialy file on date 2020-08-22 generated
     143 2020-08-22 148.21158361434937
dialy file on date 2020-08-23 generated
     144 2020-08-23 149.9366238117218
dialy file on date 2020-08-24 generated
     145 2020-08-24 153.41924786567688
dialy file on date 2020-08-25 generated
     146 2020-08-25 153.73771381378174
dialy file on date 2020-08-26 generated
     147 2020-08-26 153.45827794075012
dialy file on date 2020-08-27 generated
     148 2020-08-27 152.69021463394165
dialy file on date 2020-08-28 generated
     149 2020-08-28 153.34597373008728
dialy file on date 2020-08-29 generated
     150 2020-08

dialy file on date 2020-06-28 generated
     88 2020-06-28 12.206747770309448
dialy file on date 2020-06-29 generated
     89 2020-06-29 12.331880807876587
dialy file on date 2020-06-30 generated
     90 2020-06-30 12.174222946166992
dialy file on date 2020-07-01 generated
     91 2020-07-01 12.685553789138794
dialy file on date 2020-07-02 generated
     92 2020-07-02 11.800331354141235
dialy file on date 2020-07-03 generated
     93 2020-07-03 12.680219173431396
dialy file on date 2020-07-04 generated
     94 2020-07-04 11.915649175643921
dialy file on date 2020-07-05 generated
     95 2020-07-05 12.280686140060425
dialy file on date 2020-07-06 generated
     96 2020-07-06 11.951626777648926
dialy file on date 2020-07-07 generated
     97 2020-07-07 12.307072162628174
dialy file on date 2020-07-08 generated
     98 2020-07-08 11.916674375534058
dialy file on date 2020-07-09 generated
     99 2020-07-09 12.251845836639404
dialy file on date 2020-07-10 generated
     100 2020-07-10 12.1

Traceback (most recent call last):
  File "<ipython-input-12-d31d41b31207>", line 116, in <module>
    day_records, combined_records = dialy_job(ele, groupby_level=groupby_level, table=table, direct_db='BankRecon', nav_table=f'fund_navreg_{name}',
  File "<timed exec>", line 254, in dialy_job
  File "<ipython-input-2-9f27faa36581>", line 17, in read_df
    data = spark.read.format("jdbc") \
  File "/usr/local/spark/python/pyspark/sql/readwriter.py", line 184, in load
    return self._df(self._jreader.load())
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
    return_value = get_return_value(
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 131, in deco
    return f(*a, **kw)
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o849066.load.
: com.microsoft.sqlserver.jdbc.SQLServerExcept

None
An error occurred while calling o849066.load.
: com.microsoft.sqlserver.jdbc.SQLServerException: Invalid object name 'fund_navreg_peerless'.
	at com.microsoft.sqlserver.jdbc.SQLServerException.makeFromDatabaseError(SQLServerException.java:262)
	at com.microsoft.sqlserver.jdbc.SQLServerStatement.getNextResult(SQLServerStatement.java:1621)
	at com.microsoft.sqlserver.jdbc.SQLServerPreparedStatement.doExecutePreparedStatement(SQLServerPreparedStatement.java:592)
	at com.microsoft.sqlserver.jdbc.SQLServerPreparedStatement$PrepStmtExecCmd.doExecute(SQLServerPreparedStatement.java:522)
	at com.microsoft.sqlserver.jdbc.TDSCommand.execute(IOBuffer.java:7194)
	at com.microsoft.sqlserver.jdbc.SQLServerConnection.executeCommand(SQLServerConnection.java:2935)
	at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeCommand(SQLServerStatement.java:248)
	at com.microsoft.sqlserver.jdbc.SQLServerStatement.executeStatement(SQLServerStatement.java:223)
	at com.microsoft.sqlserver.jdbc.SQLServerP

dialy file on date 2020-06-13 generated
     73 2020-06-13 12.869708061218262
dialy file on date 2020-06-14 generated
     74 2020-06-14 12.30665373802185
dialy file on date 2020-06-15 generated
     75 2020-06-15 12.988982677459717
dialy file on date 2020-06-16 generated
     76 2020-06-16 12.75277590751648
dialy file on date 2020-06-17 generated
     77 2020-06-17 13.054303884506226
dialy file on date 2020-06-18 generated
     78 2020-06-18 12.302404403686523
dialy file on date 2020-06-19 generated
     79 2020-06-19 12.78466010093689
dialy file on date 2020-06-20 generated
     80 2020-06-20 12.187839269638062
dialy file on date 2020-06-21 generated
     81 2020-06-21 12.219683647155762
dialy file on date 2020-06-22 generated
     82 2020-06-22 13.093921422958374
dialy file on date 2020-06-23 generated
     83 2020-06-23 11.877617597579956
dialy file on date 2020-06-24 generated
     84 2020-06-24 12.275954008102417
dialy file on date 2020-06-25 generated
     85 2020-06-25 12.66310

dialy file on date 2020-04-23 generated
     22 2020-04-23 2.326270341873169
dialy file on date 2020-04-24 generated
     23 2020-04-24 2.330885410308838
dialy file on date 2020-04-25 generated
     24 2020-04-25 2.4462459087371826
dialy file on date 2020-04-26 generated
     25 2020-04-26 2.288780689239502
dialy file on date 2020-04-27 generated
     26 2020-04-27 2.606597661972046
dialy file on date 2020-04-28 generated
     27 2020-04-28 2.3768701553344727
dialy file on date 2020-04-29 generated
     28 2020-04-29 2.3302600383758545
dialy file on date 2020-04-30 generated
     29 2020-04-30 2.4481751918792725
dialy file on date 2020-05-01 generated
     30 2020-05-01 2.4501521587371826
dialy file on date 2020-05-02 generated
     31 2020-05-02 2.265615463256836
dialy file on date 2020-05-03 generated
     32 2020-05-03 2.532522678375244
dialy file on date 2020-05-04 generated
     33 2020-05-04 2.44598126411438
dialy file on date 2020-05-05 generated
     34 2020-05-05 2.47834372520

dialy file on date 2020-08-07 generated
     128 2020-08-07 2.7034575939178467
dialy file on date 2020-08-08 generated
     129 2020-08-08 2.5183801651000977
dialy file on date 2020-08-09 generated
     130 2020-08-09 2.582014799118042
dialy file on date 2020-08-10 generated
     131 2020-08-10 2.503967523574829
dialy file on date 2020-08-11 generated
     132 2020-08-11 2.500678539276123
dialy file on date 2020-08-12 generated
     133 2020-08-12 2.5124526023864746
dialy file on date 2020-08-13 generated
     134 2020-08-13 2.5841174125671387
dialy file on date 2020-08-14 generated
     135 2020-08-14 2.6135175228118896
dialy file on date 2020-08-15 generated
     136 2020-08-15 2.7059383392333984
dialy file on date 2020-08-16 generated
     137 2020-08-16 2.517534017562866
dialy file on date 2020-08-17 generated
     138 2020-08-17 2.5291476249694824
dialy file on date 2020-08-18 generated
     139 2020-08-18 2.8722076416015625
dialy file on date 2020-08-19 generated
     140 2020-08

dialy file on date 2020-06-18 generated
     78 2020-06-18 107.61250019073486
dialy file on date 2020-06-19 generated
     79 2020-06-19 107.95628213882446
dialy file on date 2020-06-20 generated
     80 2020-06-20 109.24843621253967
dialy file on date 2020-06-21 generated
     81 2020-06-21 106.86705994606018
dialy file on date 2020-06-22 generated
     82 2020-06-22 104.5609347820282
dialy file on date 2020-06-23 generated
     83 2020-06-23 107.2444396018982
dialy file on date 2020-06-24 generated
     84 2020-06-24 121.85166144371033
dialy file on date 2020-06-25 generated
     85 2020-06-25 108.45218920707703
dialy file on date 2020-06-26 generated
     86 2020-06-26 107.95366263389587
dialy file on date 2020-06-27 generated
     87 2020-06-27 109.15008354187012
dialy file on date 2020-06-28 generated
     88 2020-06-28 105.66043663024902
dialy file on date 2020-06-29 generated
     89 2020-06-29 112.88473701477051
dialy file on date 2020-06-30 generated
     90 2020-06-30 154.107

dialy file on date 2020-04-28 generated
     27 2020-04-28 407.4757056236267
dialy file on date 2020-04-29 generated
     28 2020-04-29 404.0962610244751
dialy file on date 2020-04-30 generated
     29 2020-04-30 445.4690887928009
dialy file on date 2020-05-01 generated
     30 2020-05-01 414.1233205795288
dialy file on date 2020-05-02 generated
     31 2020-05-02 405.7663142681122
dialy file on date 2020-05-03 generated
     32 2020-05-03 407.6781768798828
dialy file on date 2020-05-04 generated
     33 2020-05-04 405.1052703857422
dialy file on date 2020-05-05 generated
     34 2020-05-05 412.67921900749207
dialy file on date 2020-05-06 generated
     35 2020-05-06 410.7419514656067
dialy file on date 2020-05-07 generated
     36 2020-05-07 406.0498433113098
dialy file on date 2020-05-08 generated
     37 2020-05-08 405.4929111003876
dialy file on date 2020-05-09 generated
     38 2020-05-09 408.5446755886078
dialy file on date 2020-05-10 generated
     39 2020-05-10 406.068838119506

Traceback (most recent call last):
  File "<ipython-input-12-d31d41b31207>", line 116, in <module>
    day_records, combined_records = dialy_job(ele, groupby_level=groupby_level, table=table, direct_db='BankRecon', nav_table=f'fund_navreg_{name}',
  File "<timed exec>", line 327, in dialy_job
  File "/usr/local/spark/python/pyspark/sql/readwriter.py", line 936, in parquet
    self._jwrite.parquet(path)
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
    return_value = get_return_value(
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 131, in deco
    return f(*a, **kw)
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o1047007.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:226)
	at

None
An error occurred while calling o1047007.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:226)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:178)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:108)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:106)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:131)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:175)
	at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:213)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:210)
	at org.apache.spa

RMF Reliance
inital file on date 2020-03-31 written
initialization time 12870.234615802765
dialy file on date 2020-04-01 generated
     0 2020-04-01 538.5814113616943
dialy file on date 2020-04-02 generated
     1 2020-04-02 529.9933423995972
dialy file on date 2020-04-03 generated
     2 2020-04-03 563.3022892475128
dialy file on date 2020-04-04 generated
     3 2020-04-04 547.118485212326
dialy file on date 2020-04-05 generated
     4 2020-04-05 546.2764575481415
dialy file on date 2020-04-06 generated
     5 2020-04-06 589.289694070816
dialy file on date 2020-04-07 generated
     6 2020-04-07 589.4057207107544
dialy file on date 2020-04-08 generated
     7 2020-04-08 568.8850433826447
dialy file on date 2020-04-09 generated
     8 2020-04-09 1185.0641913414001
dialy file on date 2020-04-10 generated
     9 2020-04-10 552.7119889259338
dialy file on date 2020-04-11 generated
     10 2020-04-11 553.01766705513
dialy file on date 2020-04-12 generated
     11 2020-04-12 548.653573989868

Traceback (most recent call last):
  File "<ipython-input-12-d31d41b31207>", line 116, in <module>
    day_records, combined_records = dialy_job(ele, groupby_level=groupby_level, table=table, direct_db='BankRecon', nav_table=f'fund_navreg_{name}',
  File "<timed exec>", line 250, in dialy_job
  File "/usr/local/spark/python/pyspark/sql/readwriter.py", line 936, in parquet
    self._jwrite.parquet(path)
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py", line 1304, in __call__
    return_value = get_return_value(
  File "/usr/local/spark/python/pyspark/sql/utils.py", line 131, in deco
    return f(*a, **kw)
  File "/usr/local/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py", line 326, in get_return_value
    raise Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o1061275.parquet.
: org.apache.spark.SparkException: Job aborted.
	at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:226)
	at

In [None]:
#
# Batch driver: for every fund in `table_codes`, run `dialy_job` once per day
# in [start_date, end_date] and store per-day metrics via `save_metric`.
#
# Each `table_codes` assignment below replaces the previous one, so only the
# LAST uncommented assignment drives the loop. The earlier ones are presumably
# kept as a record of previous batches.
table_codes = {117: 'MIRAE'}

table_codes = {"116": "AXA","117": "MIRAE","107": "BOB","120": "INVESCO","RMF": "Reliance",
"118": "Edelweiss","135": "IDBIMF","125": "IBMF","128": "AXISMF","178": "BNPMF","152": "ITI",
"105": "JMMF","103": "PMF","166": "Quant","130": "PeerlessMF","104": "TAURUS","108": "UTI",
"123": "Quantum","127": "MOTILAL","102": "LIC","176": "SundaramMF","101": "canrobeco","129": "DLFPramerica"}

table_codes = {"117": "MIRAE","120": "INVESCO","RMF": "Reliance",
"118": "Edelweiss","135": "IDBIMF","125": "IBMF","128": "AXISMF","178": "BNPMF","152": "ITI",
"105": "JMMF","103": "PMF","166": "Quant","130": "PeerlessMF","104": "TAURUS","108": "UTI",
"123": "Quantum","127": "MOTILAL","102": "LIC","176": "SundaramMF","101": "canrobeco","129": "DLFPramerica"}

# NOTE(review): `table_codess` (double "s") is a separate name that this cell
# never reads -- looks like a typo for `table_codes`; confirm before removing.
table_codess ={130:"peerless", 120:"INVESCO", }

table_codes = {125: 'IBMF', 152: "ITI", 123: "Quantum",}

table_codes = {  107: "BOB",135: "IDBIMF", 178: "BNPMF", 103: "PMF"}

table_codes = { 103:"peerless", 118:"edelwwise"}
table_codes = {"129": "DLFPramerica","120": "INVESCO"}
table_codes = {"RMF": "Reliance" }
# table_codes = {"116": "AXA"}
# exception taurus


def _restart_spark(memory):
    """Stop any running Spark context and start a fresh local session.

    Rebinds the notebook-level names ``sc`` and ``spark`` so that code in
    other cells that reads those globals sees the new session.

    Args:
        memory (str): Value written to the ``spark.driver.memory`` and
            ``spark.executor.memory`` system properties (e.g. ``"60g"``).
    """
    global sc, spark

    # Best-effort teardown: on the very first run `spark` / `sc` may not
    # exist yet (NameError) or may already be stopped. `except Exception`
    # (rather than a bare `except`) keeps Ctrl-C / kernel interrupts working.
    try:
        spark.catalog.clearCache()
    except Exception:
        pass
    try:
        sc.stop()
    except Exception:
        print ("error no sc")

    # intialize spark again
    conf = SparkConf()
    conf.setMaster("local[*]").setAppName("My app").set("spark.sql.shuffle.partitions", 16)

    #Create spark context and sparksession
    sc = SparkContext.getOrCreate(conf=conf)
    # NOTE(review): PySpark documents that setSystemProperty must be invoked
    # before instantiating SparkContext. Here it runs after getOrCreate, so
    # presumably these values can only reach a context created by a later
    # restart -- confirm whether they have the intended effect at all.
    SparkContext.setSystemProperty("spark.driver.memory", memory)
    SparkContext.setSystemProperty("spark.executor.memory", memory)
    SparkContext.setSystemProperty("spark.executor.offHeap.enabled", "true")
    SparkContext.setSystemProperty("spark.executor.offHeap.size", "100g")
    spark = SparkSession(sc)


for code,name in (table_codes.items()):
    try:
        # Fresh session per fund.
        _restart_spark("60g")

        print (code, name)
        start = time.time()
        table = f'm_Trans_{code}'
        groupby_level='SPFTTer'
        init_date = '2020-03-31'
        start_date = '2020-04-07'
        end_date = '2020-09-02'
#         mcr_month_date = '2020-05-01'

#         records = initialize(init_date, table=table,direct_db='BankRecon', groupby_level=groupby_level)
#         save_metric(init_date, 'records_processed', records, name, groupby_level, table)

        # initialize() is disabled above, so this only times the handful of
        # statements executed since `start`.
        print (f'initialization time {time.time() - start}')
#         save_metric(init_date, 'intialization_time', time.time() - start, name, groupby_level, table)
        job_start = time.time()


        for i,ele in enumerate(list(daterange(start_date, end_date))):
            # Fresh session per day as well, with the larger memory setting.
            _restart_spark("120g")

            s = time.time()
            day_records, combined_records = dialy_job(ele, groupby_level=groupby_level, table=table, direct_db='BankRecon', nav_table=f'fund_navreg_{name}',
            scheme_table=f'fund_master_{name}_{code}', scheme_code='fm_scheme', plan_code='fm_plan', nature='fm_nature',
             category = 'fm_SebiSchemeCategory',subcategory = 'fm_SebiSchemeSubCategory', newmcrid='fm_NewMCRId'
            )
            save_metric(ele, 'day_records', day_records, name, groupby_level, table)
            save_metric(ele, 'combined_records', combined_records, name, groupby_level, table)
            save_metric(ele, 'dialy_job_time', time.time() - s, name, groupby_level, table)
            print ("    ",i, ele, time.time() - s)

        # Only consumed by the disabled MCR-report timing below.
        s = time.time() 
#         generate_mcr_report(table=table, groupby_level=groupby_level, start_date = '2020-06-02', end_date = '2020-07-02')
#         save_metric(mcr_month_date, 'mcr_generate_time', time.time() - s, name, groupby_level)

        print (f'job time is {time.time() - job_start}')
        print (f'overall time is {time.time() - start}')
#         save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print ()
    except Exception as e:
        # A failure on any day aborts the remaining days of this fund and
        # moves on to the next fund. print_exc() writes the traceback itself
        # and returns None, so it must not be wrapped in print() (that emitted
        # a stray "None" line in the output).
        traceback.print_exc()
        print (str(e))

RMF Reliance
initialization time 4.76837158203125e-06


In [None]:
#
# Fund selection. Each assignment overwrites the previous one, so only the
# final `table_codes` value drives the loop below.
table_codes = {117: 'MIRAE'}

table_codes = {"116": "AXA","117": "MIRAE","107": "BOB","120": "INVESCO","RMF": "Reliance",
"118": "Edelweiss","135": "IDBIMF","125": "IBMF","128": "AXISMF","178": "BNPMF","152": "ITI",
"105": "JMMF","103": "PMF","166": "Quant","130": "PeerlessMF","104": "TAURUS","108": "UTI",
"123": "Quantum","127": "MOTILAL","102": "LIC","176": "SundaramMF","101": "canrobeco","129": "DLFPramerica"}

table_codes = {"117": "MIRAE","120": "INVESCO","RMF": "Reliance",
"118": "Edelweiss","135": "IDBIMF","125": "IBMF","128": "AXISMF","178": "BNPMF","152": "ITI",
"105": "JMMF","103": "PMF","166": "Quant","130": "PeerlessMF","104": "TAURUS","108": "UTI",
"123": "Quantum","127": "MOTILAL","102": "LIC","176": "SundaramMF","101": "canrobeco","129": "DLFPramerica"}

# NOTE(review): `table_codess` (double "s") is a different name from
# `table_codes` and is not read anywhere in this cell.
table_codess ={130:"peerless", 120:"INVESCO", }

table_codes = {125: 'IBMF', 152: "ITI", 123: "Quantum",}

table_codes = {  107: "BOB",135: "IDBIMF", 178: "BNPMF", 103: "PMF"}

table_codes = { 103:"peerless", 118:"edelwwise"}
table_codes = {"129": "DLFPramerica","120": "INVESCO"}
table_codes = {"RMF": "Reliance" }
table_codes = {"116": "AXA"}

# For every selected fund: run the initial load once, then one daily job per
# date in the range, printing timings along the way. A failure in one fund is
# reported and the loop continues with the next fund.
# NOTE: this cell uses the existing Spark session as-is; it does not restart it.
# NOTE(review): the original comment here read "exception taurus" -
# presumably TAURUS needs special handling; confirm.
for code, name in table_codes.items():
    try:
        print (code, name)
        start = time.time()
        table = f'm_Trans_{code}'
        groupby_level = 'SPFTTer'
        init_date = '2020-03-31'
        start_date = '2020-04-01'
        end_date = '2020-09-02'
#         mcr_month_date = '2020-05-01'

        records = initialize(init_date, table=table, direct_db='BankRecon', groupby_level=groupby_level)
        #save_metric(init_date, 'records_processed', records, name, groupby_level, table)

        print (f'initialization time {time.time() - start}')
#         save_metric(init_date, 'intialization_time', time.time() - start, name, groupby_level)
        job_start = time.time()

        for i, ele in enumerate(list(daterange(start_date, end_date))):
            s = time.time()
            day_records, combined_records = dialy_job(
                ele,
                groupby_level=groupby_level,
                table=table,
                direct_db='BankRecon',
                nav_table=f'fund_navreg_{name}',
                scheme_table=f'fund_master_{name}_{code}',
                scheme_code='fm_scheme',
                plan_code='fm_plan',
                nature='fm_nature',
                category='fm_SebiSchemeCategory',
                subcategory='fm_SebiSchemeSubCategory',
                newmcrid='fm_NewMCRId',
            )
            #save_metric(ele, 'day_records', day_records, name, groupby_level, table)
            #save_metric(ele, 'combined_records', combined_records, name, groupby_level, table)
            #save_metric(ele, 'dialy_job_time', time.time() - s, name, groupby_level, table)
            print ("    ", i, ele, time.time() - s)

        # Timer for the (currently disabled) MCR report step below.
        s = time.time()
#         generate_mcr_report(table=table, groupby_level=groupby_level, start_date = '2020-06-02', end_date = '2020-07-02')
#         save_metric(mcr_month_date, 'mcr_generate_time', time.time() - s, name, groupby_level)

        print (f'job time is {time.time() - job_start}')
        print (f'overall time is {time.time() - start}')
#         save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print ()
    except Exception as e:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print() (that printed a stray "None").
        traceback.print_exc()
        print (str(e))

In [None]:
#
# Fund selection. Each assignment overwrites the previous one, so only the
# final `table_codes` value drives the loop below.
table_codes = {117: 'MIRAE'}

table_codes = {"116": "AXA","117": "MIRAE","107": "BOB","120": "INVESCO","RMF": "Reliance",
"118": "Edelweiss","135": "IDBIMF","125": "IBMF","128": "AXISMF","178": "BNPMF","152": "ITI",
"105": "JMMF","103": "PMF","166": "Quant","130": "PeerlessMF","104": "TAURUS","108": "UTI",
"123": "Quantum","127": "MOTILAL","102": "LIC","176": "SundaramMF","101": "canrobeco","129": "DLFPramerica"}

table_codes = {"117": "MIRAE","120": "INVESCO","RMF": "Reliance",
"118": "Edelweiss","135": "IDBIMF","125": "IBMF","128": "AXISMF","178": "BNPMF","152": "ITI",
"105": "JMMF","103": "PMF","166": "Quant","130": "PeerlessMF","104": "TAURUS","108": "UTI",
"123": "Quantum","127": "MOTILAL","102": "LIC","176": "SundaramMF","101": "canrobeco","129": "DLFPramerica"}

table_codes = {125: 'IBMF', 152: "ITI", 123: "Quantum",}
table_codes = {125:'IBMF', 104:'TARUS', 103:"peerless", 123:"Quantum", 118:"edelwwise"}

table_codes = {"RMF": "Reliance" }

# For every selected fund, run one daily job per date in the range and record
# per-day metrics. The Spark session is torn down and rebuilt once per fund
# and again before every daily job. A failure in one fund is reported and the
# loop continues with the next fund.
# NOTE(review): the original comment here read "exception taurus" -
# presumably TAURUS needs special handling; confirm.
for code, name in table_codes.items():
    try:

        # Best-effort teardown of any previous session. `except Exception`
        # (rather than a bare `except`) still covers a missing `spark`/`sc`
        # name but lets KeyboardInterrupt / SystemExit through.
        try:
            spark.catalog.clearCache()
        except Exception:
            pass
        try:
            sc.stop()
        except Exception:
            print ("error no sc")

        # Initialize Spark again.
        conf = SparkConf()
        conf.setMaster("local[*]").setAppName("My app").set("spark.sql.shuffle.partitions", 16)

        # System properties are set before the context is created.
        SparkContext.setSystemProperty("spark.driver.memory", "60g")
        SparkContext.setSystemProperty("spark.executor.memory", "60g")
        SparkContext.setSystemProperty("spark.executor.offHeap.enabled", "true")
        SparkContext.setSystemProperty("spark.executor.offHeap.size", "200g")
        sc = SparkContext.getOrCreate(conf=conf)
        spark = SparkSession(sc)

        print (code, name)
        start = time.time()
        table = f'm_Trans_{code}'
        groupby_level = 'SPFT'
        init_date = '2020-03-31'
        start_date = '2020-06-05'
        end_date = '2020-08-02'
#         mcr_month_date = '2020-05-01'

        # The initial load is disabled in this cell, so the "initialization
        # time" printed below only covers the assignments above.
        #records = initialize(init_date, table=table,direct_db='BankRecon', groupby_level=groupby_level)
        #save_metric(init_date, 'records_processed', records, name, groupby_level, table)

        print (f'initialization time {time.time() - start}')
#         save_metric(init_date, 'intialization_time', time.time() - start, name, groupby_level)
        job_start = time.time()

        for i, ele in enumerate(list(daterange(start_date, end_date))):
            # Same best-effort teardown / rebuild as above, once per day.
            try:
                spark.catalog.clearCache()
            except Exception:
                pass
            try:
                sc.stop()
            except Exception:
                print ("error no sc")

            conf = SparkConf()
            conf.setMaster("local[*]").setAppName("My app").set("spark.sql.shuffle.partitions", 16)

            SparkContext.setSystemProperty("spark.driver.memory", "60g")
            SparkContext.setSystemProperty("spark.executor.memory", "60g")
            SparkContext.setSystemProperty("spark.executor.offHeap.enabled", "true")
            SparkContext.setSystemProperty("spark.executor.offHeap.size", "200g")
            sc = SparkContext.getOrCreate(conf=conf)
            spark = SparkSession(sc)

            s = time.time()
            day_records, combined_records = dialy_job(
                ele,
                groupby_level=groupby_level,
                table=table,
                direct_db='BankRecon',
                nav_table=f'fund_navreg_{name}',
                scheme_table=f'fund_master_{name}_{code}',
                scheme_code='fm_scheme',
                plan_code='fm_plan',
                nature='fm_nature',
                category='fm_SebiSchemeCategory',
                subcategory='fm_SebiSchemeSubCategory',
                newmcrid='fm_NewMCRId',
            )
            save_metric(ele, 'day_records', day_records, name, groupby_level, table)
            save_metric(ele, 'combined_records', combined_records, name, groupby_level, table)
            save_metric(ele, 'dialy_job_time', time.time() - s, name, groupby_level, table)
            print ("    ", i, ele, time.time() - s)

        # Timer for the (currently disabled) MCR report step below.
        s = time.time()
#         generate_mcr_report(table=table, groupby_level=groupby_level, start_date = '2020-06-02', end_date = '2020-07-02')
#         save_metric(mcr_month_date, 'mcr_generate_time', time.time() - s, name, groupby_level)

        print (f'job time is {time.time() - job_start}')
        print (f'overall time is {time.time() - start}')
#         save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print ()
    except Exception as e:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print() (that printed a stray "None").
        traceback.print_exc()
        print (str(e))

In [None]:
#
# Small end-to-end run against the `kfintech_funds` database: one initial load
# for 2020-04-01, then the daily job for each date in 2020-04-02 .. 2020-04-03.
init_date = '2020-04-01'
groupby_level = 'SPFT'
table = 'trans116'
direct_db = 'kfintech_funds'
nav_table = 'nav_master'
scheme_table = 'scheme_master'

initialize(init_date, table=table, direct_db=direct_db, groupby_level=groupby_level)
# init_date = '2020-04-30'
# initialize(init_date, table=table,direct_db=direct_db, groupby_level=groupby_level)

# Keyword arguments shared by every daily job in this cell.
daily_job_kwargs = dict(
    groupby_level=groupby_level,
    table=table,
    direct_db=direct_db,
    nav_table=nav_table,
    scheme_table=scheme_table,
    scheme_code='fm_scheme',
    plan_code='fm_plan',
    nature='fm_nature',
    category='fm_SebiSchemeCategory',
    subcategory='fm_SebiSchemeSubCategory',
    newmcrid='fm_NewMCRId',
)

for ele in daterange('2020-04-02', '2020-04-03'):
    dialy_job(ele, **daily_job_kwargs)

In [7]:
# For each day in the range: load that day's SPFTTer snapshot of m_Trans_116,
# dump it to a single-partition CSV, and print the AUM total per MCR id.
# NOTE(review): `sum` is presumably pyspark.sql.functions.sum imported in an
# earlier cell (the builtin would not accept a column name) - confirm.
for ele in daterange('2020-04-02', '2020-09-02'):
    day = str(ele)
    snapshot_path = f"m_Trans_116_dialy/data_SPFTTer_{day}.parquet"
    latest_data = spark.read.parquet(snapshot_path)

    single_part = latest_data.coalesce(1)
    single_part.write.csv(f'axa_interm_{day}.csv', header=True, mode='overwrite')

    newmcrid = 'fm_NewMCRId'
    aum_per_mcr = latest_data.groupby(newmcrid).agg(sum('aum'))
    aum_per_mcr.show()

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.090663479991921E8|
|     AII_25|2.9306776153193027E8|
|     AII_26|3.2976802943876057E9|
|      AI_12| 9.150751967743661E8|
|     AII_21| 4.221463189625897E8|
|       AI_1| 5.297272265669848E8|
|    AIII_31|3.1419857077378345E8|
|    AIII_28|    2.605810246407E9|
|       AI_3|2.5172344301652713E9|
|       null|                null|
|     AII_19|1.3122280863362782E9|
|       AI_2| 2.739935609478163E9|
|       AI_6|3.9159494123020905E8|
|    AIII_27| 9.267321143614126E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.090531097947762E8|
|     AII_25|2.9314765675284016E8|
|     AII_26|3.3007451288131957E9|
|      AI_12| 9.150751967743661E8|
|     AII_21|4.2380349142398983E8|
|       AI_1| 4.292916404816728E8|
|    AIII_31|3.1419907077100194E8|
|    AIII_28|2.6066

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.412604608697269E8|
|     AII_25| 3.127282364496515E8|
|     AII_26| 3.578127772006825E9|
|      AI_12| 8.294595134203612E8|
|     AII_21| 4.611453638075306E8|
|       AI_1| 5.396626303238904E8|
|    AIII_31| 3.849530386710377E8|
|    AIII_28| 2.793890570502157E9|
|       AI_3|2.6428914324685116E9|
|       null|                null|
|     AII_19|1.4411038486050186E9|
|       AI_2|  3.13260948370187E9|
|       AI_6|3.8775552149885446E8|
|    AIII_27| 9.360968150270883E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.367163810252122E8|
|     AII_25|3.1545055350082034E8|
|     AII_26|3.5693255708932323E9|
|      AI_12| 8.279889415546759E8|
|     AII_21|4.6011547904192144E8|
|       AI_1| 5.054416774532205E8|
|    AIII_31|4.0512922124151534E8|
|    AIII_28| 2.789

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.595597458764623E8|
|     AII_25| 3.266777164042611E8|
|     AII_26| 3.720417687029728E9|
|      AI_12|3.5816538528733295E8|
|     AII_21| 4.994416748929802E8|
|       AI_1| 7.872456856156824E8|
|    AIII_31| 3.962552063046683E8|
|    AIII_28|2.8399007106664042E9|
|       AI_3|2.3228871538520894E9|
|       null|                null|
|     AII_19|1.4905419746190305E9|
|       AI_2|3.0507891815070305E9|
|       AI_6|3.4711822315607876E8|
|    AIII_27| 9.344746590479003E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29|  9.64319059411785E8|
|     AII_25| 3.274121914640812E8|
|     AII_26|3.7358308807385545E9|
|      AI_12| 3.547545224455096E8|
|     AII_21| 5.056742449153013E8|
|       AI_1| 9.427116787222402E8|
|    AIII_31|3.9582532874351406E8|
|    AIII_28| 2.853

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.540479101333413E8|
|     AII_25|3.2282671684125876E8|
|     AII_26| 3.673267077989673E9|
|      AI_12|3.5445227253854614E8|
|     AII_21| 5.016248264929994E8|
|       AI_1| 7.016928454958446E8|
|    AIII_31|3.4409825812687075E8|
|    AIII_28|2.8233274775108476E9|
|       AI_3| 2.058047589659751E9|
|       null|                null|
|     AII_19|1.4683543237429402E9|
|       AI_2|3.3541303645901613E9|
|       AI_6| 3.486448051460925E8|
|    AIII_27| 9.314920081675315E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.515574775151187E8|
|     AII_25| 3.237141032301297E8|
|     AII_26|3.6719349782983584E9|
|      AI_12| 3.546044958912556E8|
|     AII_21| 5.026208172454294E8|
|       AI_1| 6.383009012880161E8|
|    AIII_31| 3.341052467608452E8|
|    AIII_28| 2.824

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.365498454846438E8|
|     AII_25| 3.187424732297195E8|
|     AII_26|3.5811578929242544E9|
|      AI_12|3.5352309933206916E8|
|     AII_21| 5.058825734445494E8|
|       AI_1|4.9035837285950464E8|
|    AIII_31|3.1131726324493706E8|
|    AIII_28| 2.773774252412489E9|
|       AI_3|2.0612920848556767E9|
|       null|                null|
|     AII_19|1.4459783114469688E9|
|       AI_2| 2.967399983285642E9|
|       AI_6| 3.456097813141617E8|
|    AIII_27| 9.281274575016661E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.365498454846438E8|
|     AII_25| 3.187424732297195E8|
|     AII_26|3.5811578929242544E9|
|      AI_12|3.5352309933206916E8|
|     AII_21| 5.058825734445494E8|
|       AI_1| 4.903956498391596E8|
|    AIII_31|3.1131726324493706E8|
|    AIII_28| 2.773

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.792980415501541E8|
|     AII_25| 3.499224694617106E8|
|     AII_26|3.8497217875817213E9|
|      AI_12| 3.526412037833805E8|
|     AII_21| 5.497668593225596E8|
|       AI_1| 7.920269528924615E8|
|    AIII_31|3.0857215092259663E8|
|    AIII_28|2.9570051407172365E9|
|       AI_3|2.1275370075978422E9|
|       null|                null|
|     AII_19|1.5680178130155342E9|
|       AI_2|3.2308500951933947E9|
|       AI_6| 3.537674455212252E8|
|    AIII_27| 9.323556549018711E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.792980415501541E8|
|     AII_25| 3.499224694617106E8|
|     AII_26|3.8497217875817213E9|
|      AI_12| 3.526412037833805E8|
|     AII_21| 5.497668593225596E8|
|       AI_1| 7.920951566931348E8|
|    AIII_31|3.0857215092259663E8|
|    AIII_28|2.9570

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.472945568304744E8|
|     AII_25| 3.470717572567286E8|
|     AII_26|3.8357626821938076E9|
|      AI_12|3.5393917368130714E8|
|     AII_21| 5.522323146024201E8|
|       AI_1| 2.160403347924065E8|
|    AIII_31| 2.676039865181468E8|
|    AIII_28|2.8952615015786147E9|
|       AI_3|2.0247556889014711E9|
|       null|                null|
|     AII_19| 1.546926413779743E9|
|       AI_2| 3.621784291894884E9|
|       AI_6| 3.447604025663303E8|
|    AIII_27| 8.979382314938042E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.535882380018387E8|
|     AII_25| 3.520064146509508E8|
|     AII_26| 3.874714000619565E9|
|      AI_12|3.5657160188865733E8|
|     AII_21| 5.597448469641314E8|
|       AI_1| 2.145710367382182E8|
|    AIII_31|2.5992615621541065E8|
|    AIII_28| 2.915

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.432062338267301E8|
|     AII_25| 3.552536985803396E8|
|     AII_26|3.9171574931067195E9|
|      AI_12| 3.572638016274923E8|
|     AII_21|  5.59880950418448E8|
|       AI_1| 1.762647312830936E8|
|    AIII_31| 2.447908569355703E8|
|    AIII_28|  2.91853447909393E9|
|       AI_3|2.0477990010939293E9|
|       null|                null|
|     AII_19|1.5884467918827398E9|
|     AII_17|3.2628620323279965E8|
|       AI_2|3.6002283607558317E9|
|       AI_6|3.4041578503505814E8|
|    AIII_27| 8.819988536387525E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29|  9.45705378567845E8|
|     AII_25|  3.58016397644298E8|
|     AII_26| 3.941655747438157E9|
|      AI_12| 3.573884006138134E8|
|     AII_21| 5.645096751587497E8|
|       AI_1|   1.9130043127154E8|
|    AIII_31| 2.448

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29|  9.38947530733058E8|
|     AII_25|3.5911217759760886E8|
|     AII_26|4.0160877543277855E9|
|      AI_12|3.6167493800527686E8|
|     AII_21|   5.8267653950144E8|
|       AI_1| 3.826322510619863E8|
|    AIII_31| 1.903861666778705E8|
|    AIII_28| 2.973213236455973E9|
|       AI_3|2.2438213595891457E9|
|       null|                null|
|     AII_19|1.6296602388530295E9|
|     AII_17| 3.385783096408798E8|
|       AI_2|3.0905660521808014E9|
|       AI_6| 3.404161271919608E8|
|    AIII_27| 8.773305092218428E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.367241063364998E8|
|     AII_25| 3.581024542583478E8|
|     AII_26|4.0104968492630644E9|
|      AI_12| 3.622756237274641E8|
|     AII_21| 5.849446279012591E8|
|       AI_1|  2.00540948525144E8|
|    AIII_31|1.9038

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.388314154580823E8|
|     AII_25|3.6241209219520867E8|
|     AII_26|4.1779509354911675E9|
|      AI_12| 3.616664184968568E8|
|     AII_21| 6.015234826861795E8|
|       AI_1| 2.429395581143463E8|
|    AIII_31|1.4672308582948762E8|
|    AIII_28| 3.012860527028137E9|
|       AI_3|2.3702391500300817E9|
|       null|                null|
|     AII_19|1.6697503574993317E9|
|     AII_17|3.7053605820998037E8|
|       AI_2|2.7181286033065305E9|
|       AI_6| 3.367017071710635E8|
|    AIII_27| 8.789582710823071E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29|  9.41814019180051E8|
|     AII_25|3.6379196616628945E8|
|     AII_26|4.1989339642884684E9|
|      AI_12| 3.616567973218938E8|
|     AII_21| 6.030140304393102E8|
|       AI_1| 3.794956656754185E8|
|    AIII_31| 1.706

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.390392682588977E8|
|     AII_25|3.8132221720654875E8|
|     AII_26| 4.396107839018476E9|
|      AI_12| 3.732179380799523E8|
|     AII_21| 6.459857168278614E8|
|       AI_1| 9.490970241718771E8|
|    AIII_31| 2.969084319095681E8|
|    AIII_28| 3.144247081230616E9|
|       AI_3| 2.432336757361619E9|
|       null|                null|
|     AII_19|1.7169093352120767E9|
|     AII_17| 3.837758447123002E8|
|       AI_2| 2.524160120298331E9|
|       AI_6| 3.341926701105955E8|
|    AIII_27|  8.91736456577643E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.387256814817151E8|
|     AII_25| 3.864829495368996E8|
|     AII_26| 4.464154161919717E9|
|      AI_12| 3.734213950944874E8|
|     AII_21| 6.548703614468405E8|
|       AI_1| 9.473643010272952E8|
|    AIII_31| 2.985

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.353497376763109E8|
|     AII_25|4.0208821194482875E8|
|     AII_26| 4.536099564526437E9|
|      AI_12|3.7347615453367716E8|
|     AII_21| 7.127737983183995E8|
|       AI_1| 7.284072033389264E8|
|    AIII_31|2.9796514757897675E8|
|    AIII_28|3.2616052685663357E9|
|       AI_3|2.2311021879040856E9|
|       null|                null|
|     AII_19|1.7420109330360372E9|
|     AII_17| 4.018989169149303E8|
|       AI_2| 2.400968794281738E9|
|       AI_6| 3.315809035070261E8|
|    AIII_27| 8.794665542068099E8|
+-----------+--------------------+

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.353497376763109E8|
|     AII_25|4.0208821194482875E8|
|     AII_26| 4.536099564526437E9|
|      AI_12|3.7347615453367716E8|
|     AII_21| 7.127737983183995E8|
|       AI_1| 5.582467291911533E8|
|    AIII_31|2.9796

In [55]:
# AUM total per MCR id for the SPFTTer snapshot of 2020-09-01.
# `newmcrid` is not defined in this cell; it must already exist in the session.
llatest_data = spark.read.parquet("m_Trans_116_dialy/data_SPFTTer_2020-09-01.parquet")
llatest_aum = llatest_data.groupby(newmcrid).agg(sum('aum'))
llatest_aum.show()

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 9.175777190211585E8|
|     AII_25|3.7345910768308026E8|
|     AII_26| 4.441361186929758E9|
|      AI_12| 3.734928158315652E8|
|     AII_21| 7.246340465773715E8|
|       AI_1| 6.771047297224059E8|
|    AIII_31| 3.005910526386833E8|
|    AIII_28| 3.176874591605646E9|
|       AI_3| 2.529782207578232E9|
|       null|                null|
|     AII_19| 1.711511444015332E9|
|     AII_17|   4.0515296491536E8|
|       AI_2| 2.331079486276221E9|
|       AI_6|3.2928791216110605E8|
|    AIII_27| 8.683341086366962E8|
+-----------+--------------------+



In [56]:
# AUM total per MCR id for the SPFTBTer snapshot of 2020-09-01.
# `newmcrid` is not defined in this cell; it must already exist in the session.
blatest_data = spark.read.parquet("m_Trans_116_dialy/data_SPFTBTer_2020-09-01.parquet")
blatest_aum = blatest_data.groupby(newmcrid).agg(sum('aum'))
blatest_aum.show()

+-----------+--------------------+
|fm_NewMCRId|            sum(aum)|
+-----------+--------------------+
|    AIII_29| 7.691624059313657E8|
|     AII_25| -1.23559474364506E9|
|     AII_26|4.1307369858156133E9|
|      AI_12|-1.98018786825585...|
|     AII_21| 7.586508808656338E8|
|       AI_1| 6.771047297224052E8|
|    AIII_31|2.8746542855506495E7|
|    AIII_28| 3.038452141457967E9|
|       AI_3|-1.17023161735102...|
|       null|                null|
|     AII_19|-3.06165551048722...|
|     AII_17|4.0515296491536033E8|
|       AI_2|-7.02685621425754...|
|       AI_6|-1.24080189235766...|
|    AIII_27| 6.237194375081455E8|
+-----------+--------------------+



In [None]:
def get_data(start_date, end_date, groupby_level='SPT', table='m_Trans_116'):
    """Union the daily parquet snapshots for every date in a range.

    Args:
        start_date: Range start, passed straight to ``daterange``.
        end_date: Range end, passed straight to ``daterange``.
        groupby_level: Level tag that is part of each snapshot file name.
        table: Table name; snapshots are read from ``{table}_dialy/``.

    Returns:
        The union of all snapshots read, or ``None`` when ``daterange``
        yields no dates.
    """
    final_data = None

    for date in daterange(start_date, end_date):
        latest_data = spark.read.parquet(f"{table}_dialy/data_{groupby_level}_{date}.parquet")
        # Compare against None explicitly instead of relying on the
        # truthiness of a DataFrame object.
        if final_data is None:
            final_data = latest_data
        else:
            final_data = final_data.union(latest_data)
    return final_data

def generate_mcr_report(table='m_Trans_116',
                        ignored_tr_types = ['CNI', 'CNO', 'TRMI', 'TRMO',
                                            'TRFI', 'TRFO', 'PLDO', 'UPLO', 'DMT',
                                            'RMT', 'CNIR', 'CNOR', 'TRMIR', 'TRMOR',
                                            'TRFIR', 'TRFOR', 'PLDOR', 'UPLOR', 'DMTR', 'RMTR'],
                        liquid_fund_tr_types = ['NEW', 'ADD', 'IPO', 'SIN', 'NEWR', 'ADDR', 'IPOR', 'SINR'],
                        start_date = '2020-05-02', end_date = '2020-06-02', groupby_level='SPT',
                        transaction_type='TransactionType', folio='Folio',
                        folio_ignore_types = ['PLDO', 'UPLO', 'DMT', 'RMT', 'PLDOR', 'UPLOR', 'DMTR', 'RMTR'],
                        fn_nav = 'fn_nav', newmcrid='fm_NewMCRId', today_pu = 'today_pu', today_ru = 'today_ru',
                        scheme='SchemeCode', aum='aum', plan='PlanCode'):
    """Build the MCR summary for a date range, show it and write it as CSV.

    One parquet snapshot per date is read from ``{table}_dialy/``. The result
    has one row per ``newmcrid`` value with: distinct scheme and plan counts,
    the final day's summed AUM, the average AUM over the range, a distinct
    folio count, and summed inflow / outflow. It is written to
    ``{table}_mcr/mcr_{groupby_level}_{final_day}.csv``.

    The "final day" is the last-but-one date of the range plus one day.

    Args:
        table: Table name used to build the snapshot and output paths.
        ignored_tr_types: Transaction types filtered out of the range data.
            The list defaults are only read, never mutated.
        liquid_fund_tr_types: Transaction types that take part in the
            final-day liquid-fund adjustment.
        start_date: Range start, passed to ``daterange``.
        end_date: Range end, passed to ``daterange`` and ``get_data``.
        groupby_level: Level tag that is part of each snapshot file name.
        transaction_type: Name of the transaction-type column.
        folio: Name of the folio column.
        folio_ignore_types: Accepted but not used by this function.
        fn_nav: Name of the NAV column.
        newmcrid: Name of the MCR id column (the grouping key).
        today_pu: Name of the column subtracted from ``balance_pu`` in the
            liquid-fund adjustment.
        today_ru: Name of the column subtracted from ``balance_ru`` in the
            liquid-fund adjustment.
        scheme: Name of the scheme column.
        aum: Name of the AUM column.
        plan: Name of the plan column.

    Returns:
        Tuple ``(mcr, avg_data, sp_inf_ouf_data, all_data)``:
        the report; the range data with the liquid-fund adjustment applied to
        the final day; the range data before that adjustment; and the data
        used for the inflow / outflow totals.

    Raises:
        ValueError: If the date range contains fewer than two dates.
    """
    # NOTE(review): `col`, `when`, `sum` and `countDistinct` are presumably the
    # pyspark.sql.functions versions imported elsewhere - confirm.
    inflow = 'inflow'
    outflow = 'outflow'
    calculated_date = 'calculated_date'
    batch_close_date = 'batch_close_date'
    balance_pu = 'balance_pu'
    balance_ru = 'balance_ru'
    balance_units = 'balance_units'

    dates_list = list(daterange(start_date, end_date))
    if len(dates_list) < 2:
        # With fewer than two dates the loop below never runs, so there is no
        # last-but-one day to derive the final day from.
        raise ValueError(
            f'generate_mcr_report needs at least two dates, got {len(dates_list)} '
            f'for range {start_date} .. {end_date}'
        )

    # Every day except the last one of the range.
    till_but_one_day_data = None
    for date in dates_list[:-1]:
        latest_data = spark.read.parquet(f"{table}_dialy/data_{groupby_level}_{date}.parquet")
        if till_but_one_day_data is None:
            till_but_one_day_data = latest_data
        else:
            till_but_one_day_data = till_but_one_day_data.union(latest_data)

    # `date` is now the last-but-one day; the final day is the day after it.
    date_obj = datetime.datetime.strptime(date, '%Y-%m-%d')
    final_day = date_obj + datetime.timedelta(1)
    final_day_str = final_day.strftime('%Y-%m-%d')
    final_day_data = spark.read.parquet(f"{table}_dialy/data_{groupby_level}_{final_day_str}.parquet")

    not_ignored = ~(col(transaction_type).isin(ignored_tr_types))
    null_defaults = {today_pu: 0, today_ru: 0, aum: 0, newmcrid: 'Others'}
    till_but_one_day_data = till_but_one_day_data.filter(not_ignored).fillna(null_defaults)
    final_day_data = final_day_data.filter(not_ignored).fillna(null_defaults)

    # Snapshot of the range data before the liquid-fund adjustment (returned).
    sp_inf_ouf_data = till_but_one_day_data.union(final_day_data)

    # NOTE(review): the id literal is 'A1b', while the ids in the sample
    # output look like 'AI_1' / 'AII_25' - confirm the literal is correct.
    liquid_condition = (
        (col(newmcrid) == 'A1b')
        & (col(calculated_date) == final_day_str)
        & (col(batch_close_date) == final_day_str)
        & (col(transaction_type).isin(liquid_fund_tr_types))
    )

    # Final-day adjustment for matching rows. The order matters: units are
    # recomputed from the adjusted balances, and AUM from the adjusted units.
    final_day_data = final_day_data.withColumn(balance_pu,    when(liquid_condition, col(balance_pu) - col(today_pu)).otherwise(col(balance_pu)))
    final_day_data = final_day_data.withColumn(balance_ru,    when(liquid_condition, col(balance_ru) - col(today_ru)).otherwise(col(balance_ru)))
    final_day_data = final_day_data.withColumn(balance_units, when(liquid_condition, col(balance_pu) - col(balance_ru)).otherwise(col(balance_units)))
    final_day_data = final_day_data.withColumn(aum,           when(liquid_condition, col(balance_units) * col(fn_nav)).otherwise(col(aum)))

    net_aum = final_day_data.groupby([newmcrid]).agg(sum(aum))
    avg_data = till_but_one_day_data.union(final_day_data)

    # Distinct scheme / plan counts per MCR id.
    sp = avg_data.groupby([newmcrid]).agg(countDistinct(scheme), countDistinct(plan))

    # Folios counted only where the folio/scheme/plan AUM total exceeds 0.1.
    folio_count = (
        avg_data.groupby(folio, scheme, plan, newmcrid)
        .agg(sum(aum))
        .filter(col(f'sum({aum})') > 0.1)
        .groupby(newmcrid)
        .agg(countDistinct(folio))
    )

    # Average AUM = summed AUM over the range / number of dates in the range.
    avg_aum = avg_data.groupby([newmcrid]).agg(sum(aum))
    avg_aum = avg_aum.withColumn('avg_aum', col(f'sum({aum})') / len(dates_list)).drop(f'sum({aum})')

    mcr = sp.join(net_aum, on=[newmcrid], how='left')
    mcr = mcr.join(avg_aum, on=[newmcrid], how='left')
    mcr = mcr.join(folio_count, on=[newmcrid], how='left')

    # Inflow / outflow totals.
    # NOTE(review): the start date is hard-coded to '2020-04-02' regardless of
    # `start_date` - confirm this is intended.
    all_data = get_data('2020-04-02', end_date, groupby_level=groupby_level, table=table)
    all_data = all_data.filter(~liquid_condition)
    all_data = all_data.fillna({today_pu: 0, today_ru: 0, aum: 0, newmcrid: 'Others', inflow: 0, outflow: 0})
    inout = all_data.groupby([newmcrid]).agg(sum(inflow), sum(outflow))
    mcr = mcr.join(inout, on=[newmcrid], how='left')

    mcr.show(1000)

    mcr.coalesce(1).write.csv(f"{table}_mcr/mcr_{groupby_level}_{final_day_str}.csv", header=True, mode='overwrite')

    return mcr, avg_data, sp_inf_ouf_data, all_data
 
    
# Generate the SPFT-level MCR report for m_Trans_116 over 2020-04-02 .. 2020-05-02.
# Only the report itself is kept; the other three return values are discarded.
table = 'm_Trans_116'
groupby_level = 'SPFT'
start_date, end_date = '2020-04-02', '2020-05-02'
mcr, _, _, _ = generate_mcr_report(
    start_date=start_date,
    end_date=end_date,
    groupby_level=groupby_level,
    table=table,
)




In [None]:
table_codes = {117: 'MIRAE'}

# For every selected fund: rebuild the Spark session, run the initial load,
# then one daily job per date in 2020-07-01 .. 2020-08-02, printing timings.
# A failure in one fund is reported and the loop continues with the next fund.
# NOTE(review): the original comment here read "exception taurus" -
# presumably TAURUS needs special handling; confirm.
for code, name in table_codes.items():
    try:

        # Best-effort teardown of any previous session. `except Exception`
        # (rather than a bare `except`) still covers a missing `spark`/`sc`
        # name but lets KeyboardInterrupt / SystemExit through.
        try:
            spark.catalog.clearCache()
        except Exception:
            pass
        try:
            sc.stop()
        except Exception:
            print ("error no sc")

        # Initialize Spark again.
        conf = SparkConf()
        conf.setMaster("local[*]").setAppName("My app")

        # The system property is set BEFORE the context is created: PySpark
        # documents that setSystemProperty must be invoked before
        # instantiating SparkContext.
        SparkContext.setSystemProperty("spark.driver.memory", "40g")
        sc = SparkContext.getOrCreate(conf=conf)
        spark = SparkSession(sc)

        print (code, name)
        start = time.time()
        table = f'm_Trans_{code}'
        groupby_level = 'SPT'
        init_date = '2020-06-30'
        mcr_month_date = '2020-05-01'

        records = initialize(init_date, table=table, direct_db='BankRecon', groupby_level=groupby_level)
#         save_metric(init_date, 'records_processed', records, name, groupby_level)

        print (f'initialization time {time.time() - start}')
#         save_metric(init_date, 'intialization_time', time.time() - start, name, groupby_level)
        job_start = time.time()

        for i, ele in enumerate(list(daterange('2020-07-01', '2020-08-02'))):
            s = time.time()
            day_records, combined_records = dialy_job(
                ele,
                groupby_level=groupby_level,
                table=table,
                direct_db='BankRecon',
                nav_table=f'fund_navreg_{name}',
                scheme_table=f'fund_master_{name}',
                scheme_code='fm_scheme',
                plan_code='fm_plan',
                nature='fm_nature',
                category='fm_SebiSchemeCategory',
                subcategory='fm_SebiSchemeSubCategory',
                newmcrid='fm_NewMCRId',
            )
#             save_metric(ele, 'day_records', day_records, name, groupby_level)
#             save_metric(ele, 'combined_records', combined_records, name, groupby_level)
#             save_metric(ele, 'dialy_job_time', time.time() - s, name, groupby_level)
            print ("    ", i, ele, time.time() - s)

        # Timer for the (currently disabled) MCR report step below.
        s = time.time()
#         generate_mcr_report(table=table, groupby_level=groupby_level, start_date = '2020-06-02', end_date = '2020-07-02')
#         save_metric(mcr_month_date, 'mcr_generate_time', time.time() - s, name, groupby_level)

        print (f'job time is {time.time() - job_start}')
        print (f'overall time is {time.time() - start}')
#         save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print ()
    except Exception as e:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print() (that printed a stray "None").
        traceback.print_exc()
        print (str(e))

In [None]:
table_codes = {'RMF': 'Reliance'}

# For every selected fund: rebuild the Spark session, run the initial load,
# then one daily job per date in 2020-07-01 .. 2020-08-02, printing timings.
# A failure in one fund is reported and the loop continues with the next fund.
for code, name in table_codes.items():
    try:

        # Best-effort teardown of any previous session. `except Exception`
        # (rather than a bare `except`) still covers a missing `spark`/`sc`
        # name but lets KeyboardInterrupt / SystemExit through.
        try:
            spark.catalog.clearCache()
        except Exception:
            pass
        try:
            sc.stop()
        except Exception:
            print ("error no sc")

        # Initialize Spark again.
        conf = SparkConf()
        conf.setMaster("local[*]").setAppName("My app")

        # The system property is set BEFORE the context is created: PySpark
        # documents that setSystemProperty must be invoked before
        # instantiating SparkContext.
        SparkContext.setSystemProperty("spark.driver.memory", "40g")
        sc = SparkContext.getOrCreate(conf=conf)
        spark = SparkSession(sc)

        print (code, name)
        start = time.time()
        table = f'm_Trans_{code}'
        groupby_level = 'SPT'
        init_date = '2020-06-30'
        mcr_month_date = '2020-05-01'

        records = initialize(init_date, table=table, direct_db='BankRecon', groupby_level=groupby_level)
#         save_metric(init_date, 'records_processed', records, name, groupby_level)

        print (f'initialization time {time.time() - start}')
#         save_metric(init_date, 'intialization_time', time.time() - start, name, groupby_level)
        job_start = time.time()

        for i, ele in enumerate(list(daterange('2020-07-01', '2020-08-02'))):
            s = time.time()
            day_records, combined_records = dialy_job(
                ele,
                groupby_level=groupby_level,
                table=table,
                direct_db='BankRecon',
                nav_table=f'fund_navreg_{name}',
                scheme_table=f'fund_master_{name}',
                scheme_code='fm_scheme',
                plan_code='fm_plan',
                nature='fm_nature',
                category='fm_SebiSchemeCategory',
                subcategory='fm_SebiSchemeSubCategory',
                newmcrid='fm_NewMCRId',
            )
#             save_metric(ele, 'day_records', day_records, name, groupby_level)
#             save_metric(ele, 'combined_records', combined_records, name, groupby_level)
#             save_metric(ele, 'dialy_job_time', time.time() - s, name, groupby_level)
            print ("    ", i, ele, time.time() - s)

        # Timer for the (currently disabled) MCR report step below.
        s = time.time()
#         generate_mcr_report(table=table, groupby_level=groupby_level, start_date = '2020-06-02', end_date = '2020-07-02')
#         save_metric(mcr_month_date, 'mcr_generate_time', time.time() - s, name, groupby_level)

        print (f'job time is {time.time() - job_start}')
        print (f'overall time is {time.time() - start}')
#         save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print ()
    except Exception as e:
        # print_exc() writes the traceback itself and returns None, so it must
        # not be wrapped in print() (that printed a stray "None").
        traceback.print_exc()
        print (str(e))

In [None]:
import datetime

# Scratch check: parse a sample 'YYYY-MM-DD' string, then render 2 April of the
# same calendar year in the same format.
date_obj = datetime.datetime.strptime('2020-08-05', '%Y-%m-%d')
date_obj.year  # bare expression; only the cell's last expression is displayed

# The parsed value has a midnight time component, so swapping month/day on it is
# the same as building a fresh datetime for (year, 4, 2).
date_obj.replace(month=4, day=2).strftime('%Y-%m-%d')

In [None]:
# Reference listing of code -> name pairs (presumably fund code -> fund name,
# matching the `code, name` pairs iterated by the job cells). This is a bare
# expression: the notebook displays it, but nothing is assigned.
# NOTE(review): every key here is a string, whereas the `table_codes` dicts used
# by the job cells mix int and str keys and spell several names differently
# (e.g. "TAURUS" here vs 'tarus' there) -- confirm which spelling is authoritative.
{"116": "AXA",
"117": "MIRAE",
"107": "BOB",
"120": "INVESCO",
"RMF": "Reliance",
"118": "Edelweiss",
"135": "IDBIMF",
"125": "IBMF",
"128": "AXISMF",
"178": "BNPMF",
"152": "ITI",
"105": "JMMF",
"103": "PMF",
"166": "Quant",
"130": "PeerlessMF",
"104": "TAURUS",
"108": "UTI",
"123": "Quantum",
"127": "MOTILAL",
"102": "LIC",
"176": "SundaramMF",
"101": "canrobeco",
"129": "DLFPramerica"}

In [None]:
#### Run initialisation + daily jobs for the selected funds.
#
# For every (code, name) pair in `table_codes` this cell:
#   1. tears down any Spark session left over from a previous iteration/cell,
#   2. creates a fresh local SparkContext / SparkSession,
#   3. calls `initialize` for `init_date`,
#   4. calls `dialy_job` for each date yielded by `daterange` and stores three
#      metrics per day through `save_metric`.
# A failure for one fund is printed and the loop moves on to the next fund.

# Full code -> name map, kept for reference. The previous literal repeated the
# keys 118 and 130; they are collapsed here to the values that actually took
# effect ('edelweiss' and 'peerlessMF').
# NOTE(review): 129 (int) and '129' (str) are distinct keys but both format to
# the same table name `m_Trans_129` -- confirm this is intended.
all_table_codes = {
    101: 'canrobeco', 102: 'LIC', 103: 'pmf', 104: 'tarus', 105: 'JMMF',
    107: 'BOB', 108: 'uti', 116: 'AXA', 117: 'mirae', 118: 'edelweiss',
    120: 'invesco', 123: 'quantum', 125: 'IBMF', 127: 'motilal', 128: 'axismf',
    129: 'pgim', 130: 'peerlessMF', 135: 'IDBIMF', 152: 'ITI', 166: 'quant',
    178: 'BNPMF', 'RMF': 'reliance', '129': 'dlfpramerica', '176': 'sundaramMF',
}

# Funds processed by this run. Earlier reassignments of `table_codes` were dead
# (overwritten before use) and have been dropped; use `all_table_codes` here to
# process every fund.
table_codes = {116: 'AXA'}

# exception taurus
for code, name in table_codes.items():
    try:
        # Best-effort teardown: on a fresh kernel `spark` / `sc` do not exist
        # yet, so any failure here is deliberately ignored. `Exception` (not a
        # bare except) keeps Ctrl-C / kernel interrupt working.
        try:
            spark.catalog.clearCache()
        except Exception:
            pass
        try:
            sc.stop()
        except Exception:
            print("error no sc")

        # Initialise Spark again. The PySpark docs require setSystemProperty to
        # be invoked before a SparkContext is instantiated, so it now comes
        # first (the previous context was stopped just above).
        # NOTE(review): if the JVM is already running, the driver heap may still
        # not change -- confirm the 40g setting is actually applied.
        SparkContext.setSystemProperty("spark.driver.memory", "40g")
        conf = SparkConf()
        conf.setMaster("local[*]").setAppName("My app")

        # Create spark context and sparksession
        sc = SparkContext.getOrCreate(conf=conf)
        spark = SparkSession(sc)

        print(code, name)
        start = time.time()
        table = f'm_Trans_{code}'
        groupby_level = 'SPFT'
        init_date = '2020-04-30'
        mcr_month_date = '2020-05-01'  # only referenced by the disabled metric lines

        records = initialize(init_date, table=table, direct_db='BankRecon', groupby_level=groupby_level)
        # Disabled metric:
        # save_metric(init_date, 'records_processed', records, name, groupby_level)

        print(f'initialization time {time.time() - start}')
        # Disabled metric:
        # save_metric(init_date, 'intialization_time', time.time() - start, name, groupby_level)
        job_start = time.time()

        for i, ele in enumerate(list(daterange('2020-05-01', '2020-06-02'))):
            s = time.time()
            day_records, combined_records = dialy_job(
                ele,
                groupby_level=groupby_level,
                table=table,
                direct_db='BankRecon',
                nav_table=f'fund_navreg_{name}',
                scheme_table=f'fund_master_{name}',
                scheme_code='fm_scheme',
                plan_code='fm_plan',
                nature='fm_nature',
                category='fm_SebiSchemeCategory',
                subcategory='fm_SebiSchemeSubCategory',
                newmcrid='fm_NewMCRId',
            )
            save_metric(ele, 'day_records', day_records, name, groupby_level)
            save_metric(ele, 'combined_records', combined_records, name, groupby_level)
            save_metric(ele, 'dialy_job_time', time.time() - s, name, groupby_level)
            print("    ", i, ele, time.time() - s)

        # Disabled: report generation and its timing metric.
        # s = time.time()
        # generate_mcr_report(table=table, groupby_level=groupby_level, start_date = '2020-06-02', end_date = '2020-07-02')
        # save_metric(mcr_month_date, 'mcr_generate_time', time.time() - s, name, groupby_level)

        print(f'job time is {time.time() - job_start}')
        print(f'overall time is {time.time() - start}')
        # Disabled metric:
        # save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print()
    except Exception as e:
        # print_exc() writes the traceback itself and returns None; wrapping it
        # in print() used to emit a stray "None" line.
        traceback.print_exc()
        print(str(e))


In [None]:
#### Generate the MCR report for every fund (window 2020-07-02 to 2020-08-02).
#
# For every (code, name) pair in `table_codes` this cell restarts Spark and
# calls `generate_mcr_report` on the fund's `m_Trans_<code>` table. The
# initialise / daily-job steps are not run here. A failure for one fund is
# printed and the loop moves on to the next fund.

# The previous literal repeated the keys 118 and 130; they are collapsed here to
# the values that actually took effect ('edelweiss' and 'peerlessMF'). Key order
# is unchanged, so funds are processed in the same order as before.
# NOTE(review): 129 (int) and '129' (str) are distinct keys but both format to
# the same table name `m_Trans_129`, so that table is processed twice -- confirm.
table_codes = {
    101: 'canrobeco', 102: 'LIC', 103: 'pmf', 104: 'tarus', 105: 'JMMF',
    107: 'BOB', 108: 'uti', 116: 'AXA', 117: 'mirae', 118: 'edelweiss',
    120: 'invesco', 123: 'quantum', 125: 'IBMF', 127: 'motilal', 128: 'axismf',
    129: 'pgim', 130: 'peerlessMF', 135: 'IDBIMF', 152: 'ITI', 166: 'quant',
    178: 'BNPMF', 'RMF': 'reliance', '129': 'dlfpramerica', '176': 'sundaramMF',
}

# exception taurus
for code, name in table_codes.items():
    try:
        # Best-effort teardown: on a fresh kernel `spark` / `sc` do not exist
        # yet, so any failure here is deliberately ignored. `Exception` (not a
        # bare except) keeps Ctrl-C / kernel interrupt working.
        try:
            spark.catalog.clearCache()
        except Exception:
            pass
        try:
            sc.stop()
        except Exception:
            print("error no sc")

        # Initialise Spark again. The PySpark docs require setSystemProperty to
        # be invoked before a SparkContext is instantiated, so it now comes
        # first (the previous context was stopped just above).
        # NOTE(review): if the JVM is already running, the driver heap may still
        # not change -- confirm the 40g setting is actually applied.
        SparkContext.setSystemProperty("spark.driver.memory", "40g")
        conf = SparkConf()
        conf.setMaster("local[*]").setAppName("My app")

        # Create spark context and sparksession
        sc = SparkContext.getOrCreate(conf=conf)
        spark = SparkSession(sc)

        print(code, name)
        # Not read by the active code below; kept because the disabled metric
        # lines (and possibly other cells) refer to them.
        start = time.time()
        table = f'm_Trans_{code}'
        groupby_level = 'SPT'
        init_date = '2020-07-01'
        mcr_month_date = '2020-07-01'

        # Disabled: timing of the report generation.
        # s = time.time()
        generate_mcr_report(table=table, groupby_level=groupby_level, start_date='2020-07-02', end_date='2020-08-02')
        # save_metric(mcr_month_date, 'mcr_generate_time', time.time() - s, name, groupby_level)
        # save_metric(mcr_month_date, 'overall_time', time.time() - start, name, groupby_level)

        print()
    except Exception as e:
        # print_exc() writes the traceback itself and returns None; wrapping it
        # in print() used to emit a stray "None" line.
        traceback.print_exc()
        print(str(e))
