# Lexicon Creation

In [14]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split
# Build the lexicon word counts from the MapReduce word-count output.
# The session is created outside the try block so that the finally clause
# only ever stops a session that was actually obtained.
spark = (
    SparkSession.builder
    .appName("DE-prj")
    .getOrCreate()
)
try:
    file_path = "DE-prj/MR_WC_Result/part-00000"

    # Read the file as plain text: one row per input line, in a single
    # string column named 'value'.
    df = spark.read.text(file_path)

    # Split each line on the tab character into an array column
    # ('word_count') holding [word, count].
    df_split = df.select(split(df['value'], '\t').alias('word_count'))

    # Project the array into two typed columns: 'word' (string) and
    # 'count' (integer).
    df_final = df_split.select(
        df_split['word_count'].getItem(0).alias('word'),
        df_split['word_count'].getItem(1).cast('int').alias('count'),
    )

    # Show the first rows to verify the parse.
    df_final.show(truncate=False)

    # Total count per word. GroupedData.sum("count") names its result column
    # "sum(count)"; DataFrame.alias() would only alias the DataFrame itself
    # (useful for joins) and leave that column name untouched, so the column
    # is renamed explicitly to 'total_count'.
    word_counts = (
        df_final
        .groupBy("word")
        .sum("count")
        .withColumnRenamed("sum(count)", "total_count")
    )

    # Grand total over every row; the single aggregated cell is pulled back
    # to the driver. NOTE(review): this is presumably None when no row has a
    # parsable count - confirm if empty inputs are possible.
    total_word_count = df_final.agg({'count': 'sum'}).collect()[0][0]

    # Show the per-word totals and the grand total.
    word_counts.show(truncate=False)
    print(total_word_count)
finally:
    # Always release the Spark session, even if a step above failed.
    spark.stop()

24/12/15 00:10:36 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/12/15 00:10:36 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


+-----------+-----+
|word       |count|
+-----------+-----+
|aasad      |1    |
|ab         |7    |
|abad       |2    |
|abang      |17   |
|abbasi     |1    |
|abd        |8    |
|abdalrahman|1    |
|abdllah    |1    |
|abdul      |26   |
|abdullah   |15   |
|abiden     |1    |
|abu        |8    |
|abun       |1    |
|acara      |6    |
|acis       |1    |
|ackht      |2    |
|acp        |2    |
|act        |1    |
|acuan      |1    |
|ada        |139  |
+-----------+-----+
only showing top 20 rows

+--------------+----------+
|word          |sum(count)|
+--------------+----------+
|art           |1         |
|bersepadu     |7         |
|cerakah       |1         |
|chor          |1         |
|dianugerahkan |1         |
|direka        |1         |
|drgs          |4         |
|generasi      |4         |
|generik       |4         |
|gua           |29        |
|hantaran      |15        |
|hingga        |62        |
|input         |1         |
|jahat         |2         |
|kanak-kanak   |15