# ANOVOS - Geo Auto Detection
**The following notebook demonstrates the functions of the "geo_auto_detection" module provided in the ANOVOS package**
- [ll_gh_cols](#ll_gh_cols)

**Setting Spark Session**

In [1]:
# Set the run type variable. The Spark session setup that follows branches on
# whether this equals "ak8s"; every other value takes the shared-session path.
run_type = "local" # "local", "emr", "databricks", "ak8s"

In [3]:
#For run_type Azure Kubernetes, run the following block 
import os
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

if run_type != "ak8s":
    # Every run type other than Azure Kubernetes imports everything exposed by
    # anovos.shared.spark (NOTE(review): presumably this is where `spark` comes
    # from for these run types - confirm against the module).
    from anovos.shared.spark import *
    auth_key = "NA"
else:
    # Azure Kubernetes: replace each placeholder below with a real value before running.
    fs_path = "<insert conf spark.hadoop.fs master url here> ex: spark.hadoop.fs.azure.sas.<container>.<account_name>.blob.core.windows.net"
    auth_key = "<insert value of sas_token here>"
    master_url = "<insert kubernetes master url path here> ex: k8s://"
    docker_image = "<insert name docker image here>"
    kubernetes_namespace = "<insert kubernetes namespace here>"

    # Spark configuration for the Kubernetes-based cluster manager.
    sparkConf = SparkConf().setMaster(master_url).setAppName("Anovos_pipeline")

    # Remaining settings, applied in the order listed.
    for conf_key, conf_value in [
        ("spark.submit.deployMode", "client"),
        ("spark.kubernetes.container.image", docker_image),
        ("spark.kubernetes.namespace", kubernetes_namespace),
        ("spark.executor.instances", "4"),
        ("spark.executor.cores", "4"),
        ("spark.executor.memory", "16g"),
        ("spark.kubernetes.pyspark.pythonVersion", "3"),
        ("spark.kubernetes.authenticate.driver.serviceAccountName", "spark"),
        # Storage access: the key is the fs_path setting, the value is the SAS token.
        (fs_path, auth_key),
        ("spark.kubernetes.authenticate.serviceAccountName", "spark"),
        ("spark.jars.packages", "org.apache.hadoop:hadoop-azure:3.2.0,com.microsoft.azure:azure-storage:8.6.3,io.github.histogrammar:histogrammar_2.12:1.0.20,io.github.histogrammar:histogrammar-sparksql_2.12:1.0.20,org.apache.spark:spark-avro_2.12:3.2.1"),
    ]:
        sparkConf.set(conf_key, conf_value)

    # Creating the session initializes the Spark cluster; this is the step
    # that actually generates the worker nodes.
    spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
    sc = spark.sparkContext

**Input/Output Path**

In [4]:
# Location of the input CSV data; passed to read_dataset as file_path.
inputPath = "../data/income_dataset/csv"

In [5]:
from anovos.data_ingest.data_ingest import read_dataset
from anovos.shared.utils import ends_with

In [6]:
# Read the CSV input: the first row is the header, fields are comma-delimited,
# and column types are inferred.
df = read_dataset(
    spark,
    file_path=inputPath,
    file_type="csv",
    file_configs={"header": "True", "delimiter": ",", "inferSchema": "True"},
)
# Remove the dt_1 and dt_2 columns before running the geo-detection examples.
df = df.drop("dt_1", "dt_2")
# Preview five rows. Limiting on the Spark side means only those rows are
# converted to pandas, instead of collecting the whole dataset to the driver
# and then discarding all but five.
df.limit(5).toPandas()

                                                                                

Unnamed: 0,ifa,age,workclass,fnlwgt,logfnl,empty,education,education-num,marital-status,occupation,...,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,latitude,longitude,geohash
0,1a,,State-gov,77516.0,4.889391,,Bachelors,13.0,Never-married,Adm-clerical,...,White,Male,2174.0,0.0,40.0,UnitedStates,<=50K,-38.624096,177.982468,rb68np99
1,2a,,Self-emp-not-inc,83311.0,4.920702,,Bachelors,13.0,Married-civ-spouse,Exec-managerial,...,White,Male,0.0,0.0,13.0,UnitedStates,<=50K,-40.880497,174.992142,rckjypw0
2,3a,38.0,Private,215646.0,5.333741,,HS-grad,9.0,Divorced,Handlers-cleaners,...,White,Male,0.0,0.0,40.0,UnitedStates,<=50K,-37.73563,176.164047,rckm712q
3,4a,53.0,Private,234721.0,5.370552,,11th,7.0,Married-civ-spouse,Handlers-cleaners,...,Black,Male,0.0,0.0,40.0,UnitedStates,<=50K,-39.536491,176.832321,rckndgte
4,5a,,Private,338409.0,5.529442,,Bachelors,13.0,Married-civ-spouse,Prof-specialty,...,Black,Female,0.0,0.0,40.0,Cuba,<=50K,-41.128094,175.033722,rckq4596


# ll_gh_cols

In [7]:
from anovos.data_ingest.geo_auto_detection import ll_gh_cols

In [8]:
# Passed as the second argument to every ll_gh_cols call in this notebook.
# NOTE(review): presumably an upper bound on the number of records examined
# during detection - confirm against the ll_gh_cols documentation.
max_records = 100000

In [9]:
# Example 1 - only the longitude column is in the input data
# (geohash and latitude are dropped before detection)
lat_cols, long_cols, gh_cols = ll_gh_cols(df.drop("geohash", "latitude"), max_records)
print(lat_cols, long_cols, gh_cols)

                                                                                

[] [] []


In [10]:
# Example 2 - only the latitude column is in the input data
# (geohash and longitude are dropped before detection)
lat_cols, long_cols, gh_cols = ll_gh_cols(df.drop("geohash", "longitude"), max_records)
print(lat_cols, long_cols, gh_cols)

[] [] []


`ll_gh_cols` detects latitude & longitude columns only when they appear together in the dataset.

In [11]:
# Example 3 - geohash is removed, so latitude and longitude are the only geo columns left
df_without_geohash = df.drop("geohash")
lat_cols, long_cols, gh_cols = ll_gh_cols(df_without_geohash, max_records)
print(lat_cols, long_cols, gh_cols)

['latitude'] ['longitude'] []


In [12]:
# Merge the detected column names into one list and display those columns via pandas.
geo_cols = [*lat_cols, *long_cols, *gh_cols]
df.select(*geo_cols).toPandas()

Unnamed: 0,latitude,longitude
0,-38.624096,177.982468
1,-40.880497,174.992142
2,-37.735630,176.164047
3,-39.536491,176.832321
4,-41.128094,175.033722
...,...,...
32556,-41.293278,174.783737
32557,-45.855858,170.513382
32558,-37.743980,175.225586
32559,-37.750027,175.278122


In [13]:
# Example 4 - latitude and longitude are removed, so geohash is the only geo column left
df_geohash_only = df.drop("latitude", "longitude")
lat_cols, long_cols, gh_cols = ll_gh_cols(df_geohash_only, max_records)
print(lat_cols, long_cols, gh_cols)

[Stage 357:>                                                        (0 + 1) / 1]

[] [] ['geohash']


                                                                                

In [14]:
# Merge the detected column names into one list and display those columns via pandas.
geo_cols = [*lat_cols, *long_cols, *gh_cols]
df.select(*geo_cols).toPandas()

Unnamed: 0,geohash
0,rb68np99
1,rckjypw0
2,rckm712q
3,rckndgte
4,rckq4596
...,...
32556,rcm32hdg
32557,rb6b82me
32558,rckqh5tv
32559,rckkughm


In [15]:
# Example 5 - nothing is dropped: latitude, longitude and geohash are all present
detected_cols = ll_gh_cols(df, max_records)
lat_cols, long_cols, gh_cols = detected_cols
print(lat_cols, long_cols, gh_cols)

['latitude'] ['longitude'] ['geohash']


In [16]:
# Merge the detected column names into one list and display those columns via pandas.
geo_cols = [*lat_cols, *long_cols, *gh_cols]
df.select(*geo_cols).toPandas()

Unnamed: 0,latitude,longitude,geohash
0,-38.624096,177.982468,rb68np99
1,-40.880497,174.992142,rckjypw0
2,-37.735630,176.164047,rckm712q
3,-39.536491,176.832321,rckndgte
4,-41.128094,175.033722,rckq4596
...,...,...,...
32556,-41.293278,174.783737,rcm32hdg
32557,-45.855858,170.513382,rb6b82me
32558,-37.743980,175.225586,rckqh5tv
32559,-37.750027,175.278122,rckkughm
