In [1]:
from pyspark.sql import SparkSession

# Create Spark Session

In [4]:
import os

# Tell PySpark which Python interpreter to launch for its workers.
# A raw string is used so every backslash is taken literally; the previous
# literal depended on unrecognised escapes (e.g. "\h", "\.") being kept as-is,
# which newer Python versions report as invalid escape sequences.
# NOTE(review): this is a machine-specific absolute path — consider deriving it
# from the running interpreter or an external setting before sharing the notebook.
os.environ['PYSPARK_PYTHON'] = r"C:\Users\hemch\git\ml-yard\.venv\Scripts\python.exe"

# Build the session against the local master, or reuse one that already exists.
spark = SparkSession.builder.appName('hashed_feature').master('local').getOrCreate()

In [5]:
# Grab the SparkContext behind the session and read the address of its web UI.
sc = spark.sparkContext
spark_ui_url = sc.uiWebUrl

# Print the address so the running application can be opened in a browser.
print("Spark UI:", spark_ui_url)

Spark UI: http://LAPTOP-SRAMQVR5:4040


# Load data

In [6]:
# Load the Bengaluru housing CSV, treating its first line as the header row.
bangalore_housing_data = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('../data/Bengaluru_House_Data.csv')
)

In [17]:
# Preview the first ten rows, converted to pandas for rich notebook rendering.
display(
    bangalore_housing_data
    .limit(10)
    .toPandas()
)

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2,1.0,51.0
5,Super built-up Area,Ready To Move,Whitefield,2 BHK,DuenaTa,1170,2,1.0,38.0
6,Super built-up Area,18-May,Old Airport Road,4 BHK,Jaades,2732,4,,204.0
7,Super built-up Area,Ready To Move,Rajaji Nagar,4 BHK,Brway G,3300,4,,600.0
8,Super built-up Area,Ready To Move,Marathahalli,3 BHK,,1310,3,1.0,63.25
9,Plot Area,Ready To Move,Gandhi Bazar,6 Bedroom,,1020,6,,370.0


# Count the number of rows for each value in the `location` column

In [18]:
# Row count per distinct location: aggregated in Spark, capped at 10000 groups,
# then brought into pandas and ordered from most to least frequent.
display(
    bangalore_housing_data.groupBy('location').count().limit(10000).toPandas()
    .sort_values(by='count', ascending=False)
)

Unnamed: 0,location,count
969,Whitefield,540
452,Sarjapur Road,399
353,Electronic City,302
36,Kanakpura Road,273
834,Thanisandra,234
...,...,...
542,BEML Layout 5th Stage,1
543,Abshot Layout,1
883,T.C. Palya,1
544,Chikkaballapur,1


In [19]:
# Total number of rows in the loaded dataset.
total_rows = bangalore_housing_data.count()
print(total_rows)

13320


# Hash location feature

In [10]:
import farmhash

from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

def hashed(input, bucket_size):
    """Assign a string to a hash bucket using seeded FarmHash64.

    Args:
        input: Text to hash (here, a location name). ``None`` is passed
            through unchanged so that a null column value yields a null
            bucket instead of raising inside the hash call.
        bucket_size: Number of buckets; the 64-bit hash is reduced modulo
            this value.

    Returns:
        ``abs(farmhash.hash64withseed(input, 12345) % bucket_size)``, or
        ``None`` when ``input`` is ``None``.

    Raises:
        ZeroDivisionError: If ``bucket_size`` is 0.
    """
    # Null values reach a Python UDF as None; skip hashing rather than
    # letting the hash call fail on a non-string argument.
    if input is None:
        return None
    # The seed is a fixed constant, so a given string always gets the same bucket.
    return abs(farmhash.hash64withseed(input, 12345) % bucket_size)


# Wrap `hashed` as a Spark UDF whose result column is an integer.
hashed_udf = F.udf(f=hashed, returnType=IntegerType())

# Quick check of the plain Python function on a single value.
print(hashed('Bangalore', 3))

1


In [14]:
# Keep the location column and add one hashed-bucket column per bucket count,
# all built in a single projection.
df_with_hash = bangalore_housing_data.select(
    'location',
    *[
        hashed_udf(F.col("location"), F.lit(buckets)).alias(column_name)
        for column_name, buckets in (
            ('hashed_3', 3),
            ('hashed_10', 10),
            ('hashed_100', 100),
        )
    ]
)

df_with_hash.show()

+--------------------+--------+---------+----------+
|            location|hashed_3|hashed_10|hashed_100|
+--------------------+--------+---------+----------+
|Electronic City P...|       0|        4|        84|
|    Chikka Tirupathi|       0|        3|        93|
|         Uttarahalli|       1|        6|        66|
|  Lingadheeranahalli|       0|        8|        68|
|            Kothanur|       1|        3|        93|
|          Whitefield|       0|        5|        35|
|    Old Airport Road|       0|        8|        18|
|        Rajaji Nagar|       0|        3|        13|
|        Marathahalli|       0|        5|        85|
|        Gandhi Bazar|       1|        7|        87|
|          Whitefield|       0|        5|        35|
|          Whitefield|       0|        5|        35|
|  7th Phase JP Nagar|       0|        8|        18|
|           Gottigere|       1|        2|        32|
|            Sarjapur|       0|        5|         5|
|         Mysore Road|       1|        5|     