In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.master('local[*]').appName('Hamshahri_1').getOrCreate()

text_df_with_filename = spark.read.text("./Hamshahri/Hamshahri").withColumn("filename", substring_index(input_file_name(), "/", -1))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/15 17:00:13 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/15 17:00:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/03/15 17:00:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
24/03/15 17:00:14 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.


In [2]:
def remove_stop_words(words, stop_words):
    return [word for word in words if word not in stop_words]

In [3]:
import re
import pandas as pd

def tokenize_2gram_text(text):
    cleaned_text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
    cleaned_text = re.sub(r'\؟', ' ', cleaned_text)
    cleaned_text = re.sub(r'\،', ' ', cleaned_text)
    words = cleaned_text.split()
    df = pd.DataFrame(words, columns=['word'])
    # print(len(words))
    with open('./Hamshahri/stopwords-fa.txt', 'r', encoding='utf-8') as file:
        stopWords = file.read().split("\n")
        stopWords.extend(['text', 'title'])
        stopWords.extend(['كه', 'يا', 'اين'])
        words = remove_stop_words(words, stopWords)
        df['new_words'] = df['word'].apply(lambda x: None if x in stopWords else x)
    # if 'title' in words:
    #     words.remove('title')
    # if 'text' in words:
    #     words.remove('text')
    df.to_csv("output_text_file.txt", sep=' ')
    ngrams = []
    for i in range(len(words) - 2):
        ngram = words[i] + " " + words[i + 1] + " " + words[i + 2]
        ngrams.append(ngram)
    return ngrams

In [4]:
text = "hello from isfahan Entekhab"
a = tokenize_2gram_text(text)
print(a)

['hello from isfahan', 'from isfahan Entekhab']


In [5]:
tokenize_2gram_text_udf = udf(tokenize_2gram_text, ArrayType(StringType()))

In [18]:

specific_item = text_df_with_filename.filter(col("filename") == 'HAM2-750403-045.txt').select(col("value")).first()

if specific_item is not None:
    text_of_specific_item = specific_item['value']
    a = tokenize_2gram_text(text_of_specific_item)
    # print('Token: ', a)

In [6]:
tokenized_df = text_df_with_filename.withColumn("word", explode(tokenize_2gram_text_udf(col("value"))))


In [7]:
tokenized_df.show()

                                                                                

+--------------------+-------------------+--------------------+
|               value|           filename|                word|
+--------------------+-------------------+--------------------+
|{"title":"نام نوي...|HAM2-750405-016.txt|      نام نويسي كدام|
|{"title":"نام نوي...|HAM2-750405-016.txt|    نويسي كدام مدرسه|
|{"title":"نام نوي...|HAM2-750405-016.txt|     كدام مدرسه زياد|
|{"title":"نام نوي...|HAM2-750405-016.txt|      مدرسه زياد سخت|
|{"title":"نام نوي...|HAM2-750405-016.txt|     زياد سخت نگيريم|
|{"title":"نام نوي...|HAM2-750405-016.txt|      سخت نگيريم صبح|
|{"title":"نام نوي...|HAM2-750405-016.txt|  نگيريم صبح پنجشنبه|
|{"title":"نام نوي...|HAM2-750405-016.txt|      صبح پنجشنبه 30|
|{"title":"نام نوي...|HAM2-750405-016.txt|    پنجشنبه 30 خرداد|
|{"title":"نام نوي...|HAM2-750405-016.txt|        30 خرداد ماه|
|{"title":"نام نوي...|HAM2-750405-016.txt|     خرداد ماه امسال|
|{"title":"نام نوي...|HAM2-750405-016.txt|       ماه امسال يكي|
|{"title":"نام نوي...|HAM2-750405-016.tx

In [8]:
result_df = tokenized_df.groupBy("word", "filename").agg(
    count("word").cast('int').alias("count")
)

grouped_df = result_df.groupBy('word').agg(collect_list(struct('filename', 'count').alias('Name Repeat Count')))

grouped_df.show(truncate=False)



+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
|word                            |collect_list(struct(filename, count) AS `Name Repeat Count`)                                                                      |
+--------------------------------+----------------------------------------------------------------------------------------------------------------------------------+
|0 درصد معادل                    |[{HAM2-750409-018.txt, 1}]                                                                                                        |
|000 000 180                     |[{HAM2-750409-007.txt, 1}]                                                                                                        |
|000 180 ليتر                    |[{HAM2-750409-007.txt, 1}]                                                                                                        |
|048

                                                                                

In [10]:
result2_df = tokenized_df.groupBy("word").agg(
    count("word").cast('int').alias("count")
)

result2_df.orderBy(result2_df['count'].desc()).show(truncate=False)



+----------------------+-----+
|word                  |count|
+----------------------+-----+
|برگزار مي شود         |50   |
|جمهوري اسلامي ايران   |43   |
|گزارش خبرگزاري فرانسه |41   |
|خبرگزاري جمهوري اسلامي|41   |
|آقاي هاشمي رفسنجاني   |33   |
|مجلس شوراي اسلامي     |33   |
|جام ملتهاي اروپا      |29   |
|رسانه هاي خارجي       |26   |
|واحد رسانه هاي        |24   |
|گزارش روابط عمومي     |23   |
|تيم ملي فوتبال        |23   |
|گزارش خبرگزاري جمهوري |21   |
|سرويس علمي فرهنگي     |20   |
|مي شود سرويس          |20   |
|زندگي مي كنند         |19   |
|گزارش خبرنگار همشهري  |19   |
|كرد گزارش خبرگزاري    |19   |
|جام ملتهاي آسيا       |18   |
|صورت مي گيرد          |18   |
|شهداي هفتم تير        |18   |
+----------------------+-----+
only showing top 20 rows



                                                                                