## Libraries

In [25]:
import json
import pandas as pd
import numpy as np
import csv
import re
import json
# from parsivar import Normalizer, Tokenizer, FindStems, POSTagger, FindChunks
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import argparse
import timeit
from persian_wordcloud.wordcloud import PersianWordCloud, add_stop_words
from wordcloud import STOPWORDS as EN_STOPWORDS
import seaborn as sns
sns.set()

## Importing the dataset

In [26]:
# list for loading data
data = []
data1 = []
for line in open('data-30.json', 'r', encoding='utf-8'):
    row = json.loads(line)
    # get body of each news
    data.append(row['Body'])
    data1.append(row)

news = data

In [27]:
# data1

In [28]:
news = data

In [29]:
news = pd.DataFrame(news)
news.rename(columns={0:'body'} , inplace=True)
news.head()

Unnamed: 0,body
0,به گزارش\r\nحوزه قرآن و عترت\r\nگروه فرهنگی با...
1,به گزارش\r\nگروه بین‌الملل باشگاه خبرنگاران جو...
2,به گزارش\r\nگروه بین‌الملل باشگاه خبرنگاران جو...
3,به گزارش\r\n\r\nگروه سیاسی باشگاه خبرنگاران جو...
4,به گزارش خبرنگار\r\nحوزه بهداشت و درمان\r\n\r\...


**Fetch word count for each body

In [30]:
news['word_count'] = news['body'].apply(lambda x: len(str(x).split(" ")))
news[['body','word_count']].head()

Unnamed: 0,body,word_count
0,به گزارش\r\nحوزه قرآن و عترت\r\nگروه فرهنگی با...,184
1,به گزارش\r\nگروه بین‌الملل باشگاه خبرنگاران جو...,345
2,به گزارش\r\nگروه بین‌الملل باشگاه خبرنگاران جو...,224
3,به گزارش\r\n\r\nگروه سیاسی باشگاه خبرنگاران جو...,708
4,به گزارش خبرنگار\r\nحوزه بهداشت و درمان\r\n\r\...,996


In [31]:
##Descriptive statistics of word counts
news.word_count.describe()

count     30.000000
mean     281.333333
std      204.296494
min       16.000000
25%      157.250000
50%      222.500000
75%      329.000000
max      996.000000
Name: word_count, dtype: float64

In [32]:
sum(news['body'].isna())

0

## cleaning data

In [33]:
RE_USELESS = r'[^\w]'  # remove useless characters
RE_DIGIT = r"^\d+\s|\s\d+\s|\s\d+$"  # remove digits
RE_SPACE = r'\s+'  # remove space
RE_EMAILS = r'[\w\.-]+@[\w\.-]+'
RE_URLS = r'http\S+'
RE_WWW = r'www\S+'



def clean_sentence(sentence):
    sentence = re.sub(r'[^\u0621-\u06ff]', ' ', sentence)
    sentence = arToPersianChar(sentence)
    sentence = arToPersianNumb(sentence)
    sentence = faToEnglishNumb(sentence)
    sentence = re.sub(r'[0-9]', ' ', sentence)
    sentence = re.sub(RE_WWW, r' ', sentence)
    sentence = re.sub(RE_URLS, r' ', sentence)
    sentence = re.sub(RE_EMAILS, r' ', sentence)
    sentence = re.sub(RE_USELESS, r' ', sentence)
    sentence = re.sub(RE_DIGIT, r' ', sentence)
    sentence = re.sub(RE_SPACE, r' ', sentence)
    sentence = re.sub(r'[a-zA-Z]', ' ', sentence)
    return sentence


def arToPersianNumb(number):
    dic = {
        '١': '۱',
        '٢': '۲',
        '٣': '۳',
        '٤': '۴',
        '٥': '۵',
        '٦': '۶',
        '٧': '۷',
        '٨': '۸',
        '٩': '۹',
        '٠': '۰',
    }
    return multiple_replace(dic, number)


def arToPersianChar(userInput):
    dic = {
        'ك': 'ک',
        'دِ': 'د',
        'بِ': 'ب',
        'زِ': 'ز',
        'ذِ': 'ذ',
        'شِ': 'ش',
        'سِ': 'س',
        'ى': 'ی',
        'ي': 'ی'
    }
    return multiple_replace(dic, userInput)


def faToEnglishNumb(number):
    dic = {
        '۰': '0',
        '۱': '1',
        '۲': '2',
        '۳': '3',
        '۴': '4',
        '۵': '5',
        '۶': '6',
        '۷': '7',
        '۸': '8',
        '۹': '9',
    }
    return multiple_replace(dic, number)


def multiple_replace(dic, text):
    pattern = "|".join(map(re.escape, dic.keys()))
    return re.sub(pattern, lambda m: dic[m.group()], str(text))

In [34]:
def clean_all(document):
    clean = ''
    for sentence in document:
        sentence = clean_sentence(sentence)
        clean += sentence
    return (clean)

## Remove stop words

In [35]:
import pandas as pd

read_file = pd.read_excel ('STOPWORDS.xlsx')
read_file.to_csv ('STOPWORDS.csv', index = None, header=True)

In [36]:
stopwords = list(pd.read_csv('STOPWORDS.csv',header=None)[0])
# stopwords

**finding_all_unique_words_and_freq

In [37]:
def finding_all_unique_words_and_freq(words):
#     words_unique = []
    word_freq = {}
#     for word in words:
#         if word not in words_unique:
#             words_unique.append(word)
    for word in words:
        word_freq[word] = words.count(word)
    return word_freq

## Preprocessing

In [38]:
import os
# all the unique words in the file is stored in word_freq_in_doc.
import hazm
dict_global = {}
files_with_index = {}
idx = 0
news1 = []
normalizer = hazm.Normalizer()
for review in news['body']:
    sentences = normalizer.normalize(clean_all(review))
    ##Convert to list from string
    itemtokenized = hazm.word_tokenize(sentences)
    lem = hazm.Lemmatizer()
    itemlemmatized = [lem.lemmatize(word) for word in itemtokenized if not word in stopwords] 
#     myStem = FindStems()
#     itemStemed = []
#     temp = []
#     for word in itemtokenized:
#         temp.append(myStem.convert_to_stem(word))
#     itemStemed.append(' '.join(temp))
#     print(len(itemlemmatized))
    text = " ".join(itemlemmatized)
    news1.append(text)
    dict_global.update(finding_all_unique_words_and_freq(itemtokenized))

#     fname = review
    files_with_index[idx] = os.path.basename(review)
    idx = idx + 1
    
unique_words_all = set(dict_global.keys())
len(unique_words_all)
# unique_words_all

2332

In [39]:
# unique_words_all

In [40]:
df = pd.DataFrame(unique_words_all)
df.rename(columns={0:'word'} , inplace=True)
df.to_csv('Words.csv')

## Preprocess finished

In [41]:
# len(dict_global.keys())

In [42]:
# len(dict_global.values())

In [43]:
# dict_global

In [44]:
word_freq_in_doc = dict_global

##  Dictionary

In [45]:
Dictionary = []
for i,doc in enumerate(news1):
    Dictionary += doc.split(" ")
# Dictionary

## PostingList


In [46]:
vocab = []
postings = {}

for i,doc in enumerate(news1):
    token_list = doc.split(' ')
    for word in token_list:
        c = token_list.count(word)
        if word not in postings.keys():
            dic ={i:c} 
            postings[word] = dic
            
        elif i not in postings[word].keys():
            a = postings[word]
            a[i] = c
            
print(len(postings.keys()))

1736


In [47]:
data_items = postings.items()
data_list = list(data_items)

df_postingList = pd.DataFrame(data_list)
df_postingList.rename(columns={0:'word'} , inplace=True)
df_postingList.rename(columns={1:'PostingList'} , inplace=True)
df_postingList

Unnamed: 0,word,PostingList
0,گزارش,"{0: 1, 1: 2, 2: 1, 3: 1, 4: 2, 5: 1, 6: 1, 7: ..."
1,حوزه,"{0: 1, 4: 5, 6: 1, 14: 1, 21: 1}"
2,قرآن,{0: 1}
3,عترت,{0: 1}
4,گروه,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 4, 5: 1, 6: 1, 7: ..."
...,...,...
1731,سراوان,{29: 1}
1732,خاش,{29: 1}
1733,نیکشهر,{29: 1}
1734,راسک,{29: 1}


### saving postingList to file

In [48]:
import numpy as np
df_postingList.to_csv(r'postingList.txt', header=None, index=None, sep=' ', mode='a')