In [0]:
%python
from pyspark.sql import SparkSession
#Return Spark session
from pyspark.sql.session import SparkSession
def get_spark_session(app_name="spark_app_1"):
    """Return the active Spark session, building one when none is available.

    Args:
        app_name: Application name handed to ``SparkSession.builder.appName``
            when no active session is found.

    Returns:
        The session reported by ``SparkSession.getActiveSession()`` when it is
        truthy, otherwise the result of
        ``SparkSession.builder.appName(app_name).getOrCreate()``.
    """
    try:
        active_session = SparkSession.getActiveSession()
    except Exception:
        # Best-effort lookup: any ordinary failure falls through to the
        # builder below. Only ``Exception`` is caught so that
        # KeyboardInterrupt / SystemExit are no longer silently swallowed.
        active_session = None

    if active_session:
        return active_session

    return SparkSession.builder.appName(app_name).getOrCreate()



In [0]:
%python
def read_file(
    file_type: str = 'csv',
    path: str = None,
    header: bool = True,
    inferschema: bool = True,
    sep: str = None,
    linesep=None,
    schema: str = None,
    recursivefilelookup: bool = True,
    pathglobalfilter: str = None,
    modifiedbefore=None,
    modifiedafter=None,
    mode: str = 'PERMISSIVE',
    dateformat: str = 'yyyy-MM-dd',
    timestampformat: str = 'yyyy-MM-dd HH:mm:ss',
    mergeschema: bool = True,
    multiline: bool = False,
    table: str = None,
    is_malformed=False,
):
    """Read a DataFrame through the notebook-global ``spark`` session.

    Args:
        file_type: One of 'csv', 'json', 'orc', 'parquet', 'delta' or 'table'.
        path: Location passed to the reader for the file-based types.
        header: CSV ``header`` option.
        inferschema: CSV ``inferSchema`` option.
        sep: CSV ``sep`` option.
        linesep: CSV ``lineSep`` option.
        schema: Explicit schema for CSV/JSON reads, applied with
            ``DataFrameReader.schema``. Not used for the other types.
        recursivefilelookup: ``recursiveFileLookup`` option (file-based types).
        pathglobalfilter: ``pathGlobFilter`` option (file-based types).
        modifiedbefore: ``modifiedBefore`` option (file-based types).
        modifiedafter: ``modifiedAfter`` option (file-based types).
        mode: ``mode`` option for CSV/JSON.
        dateformat: ``dateFormat`` option for CSV/JSON.
        timestampformat: ``timestampFormat`` option for CSV/JSON.
        mergeschema: ``mergeSchema`` option for orc/parquet/delta.
        multiline: ``multiLine`` option for JSON.
        table: Table name used when ``file_type == 'table'``.
        is_malformed: Accepted for backward compatibility; currently unused.

    Returns:
        The loaded DataFrame, or ``None`` (after printing a message) when
        ``file_type`` is not supported.
    """

    def _reader_with(**options):
        # Options left as None are not forwarded, so Spark's own defaults
        # apply instead of a null option value being set on the reader.
        supplied = {key: value for key, value in options.items() if value is not None}
        return spark.read.options(**supplied)

    # Options shared by every file-based source.
    file_lookup = dict(
        recursiveFileLookup=recursivefilelookup,
        pathGlobFilter=pathglobalfilter,
        modifiedBefore=modifiedbefore,
        modifiedAfter=modifiedafter,
    )
    # Parsing options shared by the text formats (CSV and JSON).
    text_parsing = dict(
        mode=mode,
        dateFormat=dateformat,
        timestampFormat=timestampformat,
    )

    if file_type == 'csv':
        reader = _reader_with(
            header=header,
            inferSchema=inferschema,
            sep=sep,
            lineSep=linesep,
            **file_lookup,
            **text_parsing,
        )
        if schema is not None:
            # A schema is not a reader option; it has to go through .schema().
            reader = reader.schema(schema)
        return reader.csv(path)
    elif file_type == 'json':
        reader = _reader_with(multiLine=multiline, **file_lookup, **text_parsing)
        if schema is not None:
            reader = reader.schema(schema)
        return reader.json(path)
    elif file_type == 'orc' or file_type == 'parquet' or file_type == 'delta':
        reader = _reader_with(mergeSchema=mergeschema, **file_lookup)
        return reader.format(file_type).load(path)
    elif file_type == 'table':
        return spark.read.table(table)
    else:
        print('file type not supported')
        return None

In [0]:
%python
def write_file(df,file_type:str='delta',path:str=None,mode:str='overwrite',table:str=None):
    """Persist ``df`` either to a path in a given format or as a table.

    Args:
        df: Object exposing a ``write`` attribute (the DataFrame to persist).
        file_type: 'csv', 'json', 'orc', 'parquet' or 'delta' to save to
            ``path``; 'table' to save as ``table``.
        path: Destination passed to ``save`` for the path-based formats.
        mode: Save mode passed to ``write.mode``.
        table: Table name passed to ``saveAsTable`` when ``file_type`` is 'table'.

    Returns:
        Whatever the underlying ``save`` / ``saveAsTable`` call returns, or
        ``None`` (after printing a message) for an unsupported ``file_type``.
    """
    path_based_formats = ('csv', 'json', 'orc', 'parquet', 'delta')
    saves_to_path = file_type in path_based_formats

    # Reject anything that is neither a path-based format nor a table write.
    if not saves_to_path and file_type != 'table':
        print('file type not supported')
        return None

    writer = df.write.mode(mode)
    if saves_to_path:
        return writer.format(file_type).save(path)
    return writer.saveAsTable(table)

In [0]:
%python
def merge_df(df1,df2,allowmissingcolumns=True):
    """Union two DataFrames by column name.

    Args:
        df1: DataFrame whose ``unionByName`` is invoked.
        df2: DataFrame appended to ``df1``.
        allowmissingcolumns: Forwarded as ``allowMissingColumns``.

    Returns:
        The result of ``df1.unionByName(df2, allowMissingColumns=...)``.
    """
    combined = df1.unionByName(df2, allowMissingColumns=allowmissingcolumns)
    return combined

In [0]:
%python
from pyspark.sql.functions import lit,col
def add_column_with_default(df,column_name,default_value):
    """Attach a constant-valued column to ``df`` via ``withColumn``.

    Args:
        df: DataFrame to extend.
        column_name: Name of the column; it is formatted into a string first.
        default_value: Value for the column; it is formatted into a string
            before being wrapped in ``lit``.

    Returns:
        The result of ``df.withColumn(<name>, lit(<text>))``.
    """
    target_name = f'{column_name}'
    # NOTE(review): the value goes through an f-string, so the literal is
    # always text (e.g. 5 becomes '5', None becomes 'None') - confirm intended.
    literal_text = f'{default_value}'
    return df.withColumn(target_name, lit(literal_text))

In [0]:
%python
def cleansing_func(df,duplicatedatacolumns:list=None,nulldropcolumns:list=[],nullstatergy='any'):
    """De-duplicate ``df`` and drop rows with nulls in the selected columns.

    Args:
        df: DataFrame to clean.
        duplicatedatacolumns: Columns that define a duplicate; forwarded to
            ``dropDuplicates`` (``None`` is passed through unchanged).
        nulldropcolumns: Columns forwarded as ``subset`` to ``dropna``. An
            empty list/tuple (the default) means there is nothing to check,
            so the null-dropping step is skipped. ``None`` is passed through
            to ``dropna`` unchanged.
        nullstatergy: Forwarded as ``how`` to ``dropna``.

    Returns:
        The cleaned DataFrame.
    """
    # dropDuplicates already removes fully identical rows, so a separate
    # distinct() pass beforehand is redundant work.
    deduplicated = df.dropDuplicates(duplicatedatacolumns)

    # With zero columns to inspect there is nothing that can be null. Skipping
    # dropna avoids relying on how an empty subset interacts with ``how``
    # (an empty subset with how='all' would otherwise discard every row).
    if isinstance(nulldropcolumns, (list, tuple)) and len(nulldropcolumns) == 0:
        return deduplicated

    return deduplicated.dropna(how=nullstatergy, subset=nulldropcolumns)

In [0]:
%python
%pip install word2number
from pyspark.sql.functions import udf, col
from pyspark.sql.types import IntegerType
from word2number import w2n

def word_to_num(value):
    """Convert a numeric value or a number written in words to a number.

    Args:
        value: Anything ``int()`` accepts, or a string of number words that
            ``w2n.word_to_num`` understands (it is lower-cased first).

    Returns:
        ``int(value)`` when that succeeds, otherwise the result of
        ``w2n.word_to_num(value.lower())``, or ``None`` when neither
        conversion works (including when ``value`` is ``None``).
    """
    if value is None:
        return None

    try:
        # If already numeric
        return int(value)
    except Exception:
        # Not directly convertible; fall through to the word parser.
        pass

    try:
        return w2n.word_to_num(value.lower())
    except Exception:
        # Best effort: unparseable input maps to None. Only ``Exception`` is
        # caught so KeyboardInterrupt / SystemExit are not swallowed.
        return None

# UDF wrapper around word_to_num with an IntegerType return type.
word_to_num_udf = udf(f=word_to_num, returnType=IntegerType())



In [0]:
def create_temp_view(df,tempviewname):
    """Register ``df`` under ``tempviewname`` via ``createOrReplaceTempView``.

    Args:
        df: DataFrame to register.
        tempviewname: Name of the temporary view.

    Returns:
        Whatever ``df.createOrReplaceTempView`` returns.
    """
    outcome = df.createOrReplaceTempView(tempviewname)
    return outcome

def df_from_temp_view(tempviewname):
    """Load every column of a view through the notebook-global ``spark`` session.

    Args:
        tempviewname: View name. It is interpolated into the SQL text as-is
            (no quoting or escaping), so it must be a trusted identifier.

    Returns:
        The result of ``spark.sql`` for the generated SELECT statement.
    """
    query = f'select * from {tempviewname}'
    return spark.sql(query)
   