In [1]:
import os
import sys
# Name of the current OS user, taken from the environment (None when USER is unset).
curruser = os.environ.get('USER')

# Directory named by LABDATA_PYSPARK: it is put first on sys.path and becomes the
# working directory. Fail early with a readable message instead of the opaque
# TypeError that os.chdir(None) would raise when the variable is missing.
_labdata = os.environ.get("LABDATA_PYSPARK")
if not _labdata:
    raise RuntimeError("Environment variable LABDATA_PYSPARK is not set")
sys.path.insert(0, _labdata)
os.chdir(_labdata)

# Prefer the per-user workspace under /opt/workspace when it exists, otherwise fall
# back to the /home layout. Testing the user's directory directly (rather than listing
# /opt/workspace/) keeps the fallback reachable on hosts without /opt/workspace.
_workspace_home = '/opt/workspace/{user}'.format(user=curruser)
if curruser and os.path.isdir(_workspace_home):
    sys.path.insert(0, '/opt/workspace/{user}/notebooks/support_library/'.format(user=curruser))
    sys.path.insert(0, '/opt/workspace/{user}/libs/python3.5/site-packages/'.format(user=curruser))
    # sys.path.insert(0, '/opt/workspace/{user}/notebooks/labdata_v1.2/lib/'.format(user=curruser))
else:
    sys.path.insert(0, '/home/{}/notebooks/support_library/'.format(curruser))
    sys.path.insert(0, '/home/{}/python35-libs/lib/python3.5/site-packages/'.format(curruser))
    # sys.path.insert(0, '/home/{}/notebooks/labdata/lib/'.format(curruser))

#import tendo.singleton
import warnings
warnings.filterwarnings('ignore')

import joblib
import json
from joblib import Parallel, delayed

from time import sleep
from itertools import islice
from multiprocessing import Pool, Process, JoinableQueue
from multiprocessing.pool import ThreadPool
from functools import partial
import subprocess
from threading import Thread
import time
from datetime import datetime as dt

from transliterate import translit

from lib.spark_connector import SparkConnector
from lib.sparkdb_loader import *
from lib.connector import OracleDB
import pyspark
from pyspark import SparkContext, SparkConf, HiveContext
from pyspark.sql.window import Window
from pyspark.sql.functions import *
import pyspark.sql.functions as f
from pyspark.sql.types import *
from pyspark.sql.dataframe import DataFrame
from pyspark.sql.utils import AnalysisException

import re
import pandas as pd
import numpy as np
from tqdm._tqdm_notebook import tqdm_notebook
from pathlib import Path
import shutil
import loader as load
from collections import ChainMap

from lib.config import *
from lib.tools import *

# sing = tendo.singleton.SingleInstance()

# os.chdir('/opt/workspace/ektov/notebooks/Clickstream_Analytics/AutoUpdate/')
# os.chdir('/opt/workspace/{}/notebooks/clickstream/AutoUpdate/'.format(curruser))

def show(self, n=10, truncate=True, vertical=False):
    """Return the first ``n`` rows of a Spark DataFrame as a pandas DataFrame.

    This function is installed as a replacement for ``DataFrame.show``, so it
    accepts the ``truncate`` and ``vertical`` arguments of the standard
    signature. Both are ignored; they exist only so that calls such as
    ``df.show(5, False)`` or ``df.show(truncate=False)`` keep working instead
    of raising ``TypeError``.

    Parameters
    ----------
    n : int
        Maximum number of rows to fetch (default 10).
    truncate, vertical
        Accepted for signature compatibility; not used.

    Returns
    -------
    The result of ``self.limit(n).toPandas()``.
    """
    return self.limit(n).toPandas()

def typed_udf(return_type):
    """Build a decorator that turns a plain function into a UDF of ``return_type``.

    The returned decorator passes the decorated function and ``return_type``
    to ``f.udf`` and hands back whatever ``f.udf`` returns.
    """

    def decorate(python_callable):
        wrapped_udf = f.udf(python_callable, return_type)
        return wrapped_udf

    return decorate

# Monkey-patch: every Spark DataFrame's `.show` attribute now refers to the `show`
# helper defined in this cell instead of the method shipped with pyspark.
pyspark.sql.dataframe.DataFrame.show = show

def print_and_log(message: str):
    """Write ``message`` to stdout, then pass it to ``logger.info``; returns None."""
    print(message)
    logger.info(message)


## Generate a new Kerberos ticket from the pre-saved user password

In [3]:
# Read the saved password from /home/<user>/pass/userpswrd (carriage returns stripped) and feed it to kinit.
# `kdestroy;` (not `&&`): kinit must still run when there is no existing cache to destroy.
# printf with a quoted "$PASS" passes the password through unchanged (no word splitting, globbing or echo escapes).
!PASS=$(cat /home/$(whoami)/pass/userpswrd | sed 's/\r//g'); kdestroy; printf '%s\n' "$PASS" | kinit

Password for ektov1-av_ca-sbrf-ru@DF.SBRF.RU: 


In [4]:
# Start the Spark session through the `spark(...)` wrapper
# (not defined in this notebook; presumably provided by one of the `lib.*` star imports — confirm).
# NOTE(review): CONN_SCHEMA is assigned in a later cell of this notebook; on a fresh kernel
# it has to come from elsewhere (e.g. lib.config) or this call raises NameError — confirm.
sp = spark(schema=CONN_SCHEMA,
               dynamic_alloc=False,
               numofinstances=5,
               numofcores=8,
               executor_memory='15g',
               driver_memory='15g',
               kerberos_auth=False,
               process_label="_CHECK_HDFS_"
               )

# `hive` is the handle later cells use for both `setConf(...)` and `sql(...)`.
hive = sp.sql
# Print the Spark version reported by `sp.sc`.
print(sp.sc.version)

# __init__ : begin
2.4.0.cloudera2


In [8]:
# Schema whose tables are listed and measured by the cells below.
CONN_SCHEMA = 'sbx_t_team_cvm'  # alternative value kept for reference: 'sbx_team_digitcamp'

In [6]:
# Session-level Hive settings, applied one by one in the order listed.
hive_session_conf = (
    ("hive.exec.dynamic.partition", "true"),
    ("hive.exec.dynamic.partition.mode", "nonstrict"),
    ("hive.enforce.bucketing", "false"),
    ("hive.enforce.sorting", "false"),
    # Disabled overrides, kept for reference:
    # ("hive.exec.stagingdir", "/tmp/{}/".format(curruser)),
    # ("hive.exec.scratchdir", "/tmp/{}/".format(curruser)),
    ("hive.load.dynamic.partitions.thread", 1),
)

for conf_key, conf_value in hive_session_conf:
    hive.setConf(conf_key, conf_value)

## The following command runs `hdfs dfs -du -s -h` on every table directory, sorts the result by the human-readable size column (largest first), and appends it to a log file

In [None]:
!hdfs dfs -du -s -h hdfs://clsklsbx/user/team/team_cvm/hive/* | awk '{print $1$2, $3$4, $5}' | sort -h -r -k2 >> ~/hdfs_consump_log

## Generate statistics by running `describe formatted` on every table in the schema

In [9]:
def table_size(hdfs_path):
    """Return the disk usage of ``hdfs_path`` as reported by ``hdfs dfs -du -s``.

    The command is expected to print a single line whose first two
    whitespace-separated fields are numeric sizes in bytes, followed by the
    path. Both numbers are divided by 1e9 and formatted with
    ``"{:10.4f} Gb"`` (left-padded to width 10; callers strip the padding).

    Parameters
    ----------
    hdfs_path : str
        Path passed verbatim to ``hdfs dfs -du -s``.

    Returns
    -------
    tuple
        ``(size, size_fr)`` as formatted strings, or ``(None, None)`` when the
        output is empty, spans several lines, or does not start with two numbers.
    """
    # stdin is detached: the command takes no input, so no pipe is needed.
    completed = subprocess.run(['hdfs', 'dfs', '-du', '-s', hdfs_path],
                               stdout=subprocess.PIPE, stdin=subprocess.DEVNULL)
    report_lines = [line for line in completed.stdout.decode('utf-8').splitlines() if line.strip()]
    try:
        if len(report_lines) != 1:
            raise ValueError("expected exactly one summary line")
        # Split off only the two leading numeric fields so that a path
        # containing spaces cannot disturb the parsing.
        fields = report_lines[0].split(None, 2)
        size, size_fr = ["{:10.4f} Gb".format(float(raw) / 1.0e9) for raw in (fields[0], fields[1])]
    except (IndexError, ValueError):
        size, size_fr = [None]*2
    return size, size_fr

In [None]:
# Accumulator for the report: one entry per table, column-wise.
hdfs_dct = {'OWNER': [],
            'HDFS_PATH': [],
            'SIZE': [],
            'SIZE_FR': []}


def _describe_value(descr_rows, field):
    """Return `data_type` of the first `describe formatted` row whose `col_name` equals `field`, else None."""
    for row in descr_rows:
        row_dict = row.asDict()
        if row_dict['col_name'] == field:
            return row_dict.get('data_type', None)
    return None


dbs_row = hive.sql("show tables in {}".format(CONN_SCHEMA)).collect()
dbs_lst = [item['tableName'] for item in dbs_row]

for dbs_name in dbs_lst:
    try:
        descr = hive.sql("describe formatted {}.{}".format(CONN_SCHEMA, dbs_name)).collect()
    except AnalysisException as ex:
        # Tables that cannot be described are reported and skipped.
        print(str(ex))
        continue

    hdfs_path = _describe_value(descr, 'Location')
    owner = _describe_value(descr, 'Owner')
    # Keep only the part before '@'. A missing Owner row must not abort the
    # whole scan, so None is passed through instead of calling .split on it.
    if owner is not None:
        owner = owner.split('@')[0]

    if hdfs_path is None:
        continue

    # Replace the last component of the reported location with the table name.
    hdfs_path_to_tbl = os.path.join(hdfs_path.rsplit('/', 1)[0], dbs_name)
    print(hdfs_path_to_tbl, owner)

    size, size_fr = table_size(hdfs_path_to_tbl)
    print("Table: {} # HDFS Consumed Space --> {}".format(dbs_name, size))
    hdfs_dct['OWNER'].append(owner)
    hdfs_dct['HDFS_PATH'].append(hdfs_path_to_tbl)
    hdfs_dct['SIZE'].append(size.strip() if size is not None else None)
    hdfs_dct['SIZE_FR'].append(size_fr.strip() if size_fr is not None else None)

In [None]:
# Turn the collected lists into a table and order it by SIZE_FR, largest first.
df = pd.DataFrame(hdfs_dct)

# Numeric sort key: the number in front of 'Gb'; missing sizes stay None.
size_fr_gb = df['SIZE_FR'].apply(lambda cell: cell if cell is None else float(cell.split('Gb')[0]))
ranked = df.assign(f=size_fr_gb).sort_values(by='f', ascending=False)
new = ranked.drop('f', axis=1).reset_index(drop=True)

new.head(20)

In [12]:
new.iloc[0:7].values

array([['hdfs://clsklsbx/user/team/team_cvm/hive/txn_2020_12',
        'tumanov1-ga_ca-sbrf-ru', '1462.8132 Gb', '2925.6264 Gb'],
       ['hdfs://clsklsbx/user/team/team_cvm/hive/mon_ai_fl_embedding_v3',
        'zhernokleev-ga_ca-sbrf-ru', '1035.6479 Gb', '2071.2959 Gb'],
       ['hdfs://clsklsbx/user/team/team_cvm/hive/clients_products_extract',
        'kovaleva4-oa_ca-sbrf-ru', '507.7636 Gb', '1024.8576 Gb'],
       ['hdfs://clsklsbx/user/team/team_cvm/hive/ok_smz_dataset',
        'kovaleva4-oa_ca-sbrf-ru', '276.9249 Gb', '553.8499 Gb'],
       ['hdfs://clsklsbx/user/team/team_cvm/hive/mmb_offer_proc_coe',
        'ektov1-av_ca-sbrf-ru', '165.4934 Gb', '330.9868 Gb'],
       ['hdfs://clsklsbx/user/team/team_cvm/hive/ma_mmb_offer_nontop',
        'ektov1-av_ca-sbrf-ru', '164.1438 Gb', '328.2877 Gb'],
       ['hdfs://clsklsbx/user/team/team_cvm/hive/churn_seasonal_features_v2_1_pilot_19_11',
        'geraskin1-iv_ca-sbrf-ru', '86.4890 Gb', '259.4670 Gb']],
      dtype=object)

In [15]:
# Expand '~' explicitly so the file lands in the home directory regardless of
# whether the installed pandas/Excel writer expands it on its own.
new.to_excel(os.path.expanduser("~/hdfs_team_cvm_disk_usage.xlsx"))

## Make list of tables for purging

In [None]:
## tbls =[]

## Drop tables recursively

In [None]:
# DESTRUCTIVE: drops every table named in `tbls` and removes its directory, bypassing the trash.
# `tbls` is not defined by any active cell of this notebook; fill it in by hand before running.
for table_name in tbls:
    # The name is spliced into a recursive delete path, so accept only plain
    # identifiers: an empty or path-like value must never reach `hdfs dfs -rm`.
    if not re.match(r'^\w+$', str(table_name)):
        print("Skipping suspicious table name: {!r}".format(table_name))
        continue

    hive.sql("drop table if exists {schema}.{tbl} purge".format(schema=CONN_SCHEMA, tbl=table_name))
    # -r: recursive (the former '-R -r' repeated the same option);
    # -f: no error when the directory is already gone after the drop;
    # -skipTrash: delete immediately.
    rm_status = subprocess.call(['hdfs', 'dfs', '-rm', '-r', '-f', '-skipTrash',
                                 "hdfs://clsklsbx/user/team/team_cvm/hive/{}".format(table_name)])
    if rm_status != 0:
        print("hdfs rm failed for table {} (exit code {})".format(table_name, rm_status))

## Calculate HDFS disk consumption

In [13]:
!hdfs dfs -du hdfs://clsklsbx/user/team/team_cvm/hive/ | awk '{s+=$2}END{print s/1000000000}'

17083.5
