In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.master('local[*]').appName('Hamshahri_1').getOrCreate()

text_df_with_filename = spark.read.text("./Hamshahri/Hamshahri").withColumn("filename", substring_index(input_file_name(), "/", -1))

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/15 16:53:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/03/15 16:53:12 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
24/03/15 16:53:12 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


In [2]:
def remove_stop_words(words, stop_words):
    return [word for word in words if word not in stop_words]

In [3]:
def clean_words(words):
    return [word.strip() for word in words]

In [16]:
import re
import pandas as pd

def tokenize_2gram_text(text):
    cleaned_text = re.sub(r'[^\w\s\u0600-\u06FF]', ' ', text)
    cleaned_text = re.sub(r'\؟', ' ', cleaned_text)
    cleaned_text = re.sub(r'\،', ' ', cleaned_text)
    words = cleaned_text.split()
    df = pd.DataFrame(words, columns=['word'])
    # print(len(words))
    with open('./Hamshahri/stopwords-fa.txt', 'r', encoding='utf-8') as file:
        stopWords = file.read().split("\n")
        stopWords = clean_words(stopWords)
        stopWords.extend(['text', 'title'])
        stopWords.extend(['كه', 'يا', 'اين'])
        words = remove_stop_words(words, stopWords)
        df['new_words'] = df['word'].apply(lambda x: None if x in stopWords else x)
    # if 'title' in words:
    #     words.remove('title')
    # if 'text' in words:
    #     words.remove('text')
    df.to_csv("output_text_file.txt", sep=' ')
    ngrams = []
    for i in range(len(words) - 1):
        ngram = words[i] + " " + words[i + 1]
        ngrams.append(ngram)
    return ngrams

In [None]:
text = "hello from isfahan Entekhab"
a = tokenize_2gram_text(text)
print(a)

In [17]:
tokenize_2gram_text_udf = udf(tokenize_2gram_text, ArrayType(StringType()))

In [18]:

specific_item = text_df_with_filename.filter(col("filename") == 'HAM2-750403-045.txt').select(col("value")).first()

if specific_item is not None:
    text_of_specific_item = specific_item['value']
    a = tokenize_2gram_text(text_of_specific_item)
    # print('Token: ', a)

In [19]:
tokenized_df = text_df_with_filename.withColumn("word", explode(tokenize_2gram_text_udf(col("value"))))


In [20]:
tokenized_df.show()

+--------------------+-------------------+--------------+
|               value|           filename|          word|
+--------------------+-------------------+--------------+
|{"title":"نام نوي...|HAM2-750405-016.txt|     نام نويسي|
|{"title":"نام نوي...|HAM2-750405-016.txt|    نويسي كدام|
|{"title":"نام نوي...|HAM2-750405-016.txt|    كدام مدرسه|
|{"title":"نام نوي...|HAM2-750405-016.txt|    مدرسه زياد|
|{"title":"نام نوي...|HAM2-750405-016.txt|      زياد سخت|
|{"title":"نام نوي...|HAM2-750405-016.txt|    سخت نگيريم|
|{"title":"نام نوي...|HAM2-750405-016.txt|    نگيريم صبح|
|{"title":"نام نوي...|HAM2-750405-016.txt|   صبح پنجشنبه|
|{"title":"نام نوي...|HAM2-750405-016.txt|    پنجشنبه 30|
|{"title":"نام نوي...|HAM2-750405-016.txt|      30 خرداد|
|{"title":"نام نوي...|HAM2-750405-016.txt|     خرداد ماه|
|{"title":"نام نوي...|HAM2-750405-016.txt|     ماه امسال|
|{"title":"نام نوي...|HAM2-750405-016.txt|     امسال يكي|
|{"title":"نام نوي...|HAM2-750405-016.txt| يكي خيابانهاي|
|{"title":"نام

                                                                                

In [21]:
result_df = tokenized_df.groupBy("word", "filename").agg(
    count("word").cast('int').alias("count")
)

grouped_df = result_df.groupBy('word').agg(collect_list(struct('filename', 'count').alias('Name Repeat Count')))

grouped_df.show(truncate=False)



+----------------+------------------------------------------------------------------------------+
|word            |collect_list(struct(filename, count) AS `Name Repeat Count`)                  |
+----------------+------------------------------------------------------------------------------+
|0 ميانه         |[{HAM2-750407-038.txt, 1}]                                                    |
|072 1987288آلمان|[{HAM2-750404-006.txt, 1}]                                                    |
|1 3             |[{HAM2-750405-016.txt, 1}]                                                    |
|1 6             |[{HAM2-750403-014.txt, 1}]                                                    |
|1 بار           |[{HAM2-750407-024.txt, 1}]                                                    |
|1 بامداد        |[{HAM2-750405-042.txt, 1}]                                                    |
|1 توسعه         |[{HAM2-750403-009.txt, 1}]                                                    |
|1 دلاربراي      |[{

                                                                                

In [22]:
result2_df = tokenized_df.groupBy("word").agg(
    count("word").cast('int').alias("count")
)

result2_df.orderBy(result2_df['count'].desc()).show()



+--------------+-----+
|          word|count|
+--------------+-----+
|        مي شود|  460|
|       شده است|  324|
|        مي كند|  263|
|       مي كنند|  166|
|      كرده است|  128|
|      خواهد شد|  120|
| جمهوري اسلامي|  118|
|   رئيس جمهوري|  107|
|   دانش آموزان|  106|
|       مي گويد|   97|
|        مي دهد|   91|
|    بين المللي|   88|
|خبرنگار همشهري|   87|
|     مي توانند|   81|
|      بوده است|   81|
| شهرداري تهران|   81|
|      هفتم تير|   77|
|       مي توان|   77|
|       مي شوند|   75|
|گزارش خبرگزاري|   75|
+--------------+-----+
only showing top 20 rows



                                                                                