## Libraries

In [1]:
import json
import pandas as pd
import numpy as np
import csv
import re
import json
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import argparse
import timeit
from persian_wordcloud.wordcloud import PersianWordCloud, add_stop_words
from wordcloud import STOPWORDS as EN_STOPWORDS
import seaborn as sns
sns.set()

## Importing the dataset

In [2]:
# list for loading data
data = []
data1 = []
for line in open('data-30.json', 'r', encoding='utf-8'):
    row = json.loads(line)
    # get body of each news
    data.append(row['Body'])


In [3]:
news = data

In [4]:
news = pd.DataFrame(news)
news.rename(columns={0:'body'} , inplace=True)
news.head()

Unnamed: 0,body
0,به گزارش\r\nحوزه قرآن و عترت\r\nگروه فرهنگی با...
1,به گزارش\r\nگروه بین‌الملل باشگاه خبرنگاران جو...
2,به گزارش\r\nگروه بین‌الملل باشگاه خبرنگاران جو...
3,به گزارش\r\n\r\nگروه سیاسی باشگاه خبرنگاران جو...
4,به گزارش خبرنگار\r\nحوزه بهداشت و درمان\r\n\r\...


**Fetch word count for each body

In [5]:
news['word_count'] = news['body'].apply(lambda x: len(str(x).split(" ")))
news[['body','word_count']].head()

Unnamed: 0,body,word_count
0,به گزارش\r\nحوزه قرآن و عترت\r\nگروه فرهنگی با...,184
1,به گزارش\r\nگروه بین‌الملل باشگاه خبرنگاران جو...,345
2,به گزارش\r\nگروه بین‌الملل باشگاه خبرنگاران جو...,224
3,به گزارش\r\n\r\nگروه سیاسی باشگاه خبرنگاران جو...,708
4,به گزارش خبرنگار\r\nحوزه بهداشت و درمان\r\n\r\...,996


In [6]:
##Descriptive statistics of word counts
news.word_count.describe()

count     30.000000
mean     281.333333
std      204.296494
min       16.000000
25%      157.250000
50%      222.500000
75%      329.000000
max      996.000000
Name: word_count, dtype: float64

In [7]:
sum(news['body'].isna())

0

## cleaning data

In [8]:
RE_USELESS = r'[^\w]'  # remove useless characters
RE_DIGIT = r"^\d+\s|\s\d+\s|\s\d+$"  # remove digits
RE_SPACE = r'\s+'  # remove space
RE_EMAILS = r'[\w\.-]+@[\w\.-]+'
RE_URLS = r'http\S+'
RE_WWW = r'www\S+'



def clean_sentence(sentence):
    sentence = re.sub(r'[^\u0621-\u06ff]', ' ', sentence)
    sentence = arToPersianChar(sentence)
    sentence = arToPersianNumb(sentence)
    sentence = faToEnglishNumb(sentence)
    sentence = re.sub(r'[0-9]', ' ', sentence)
    sentence = re.sub(RE_WWW, r' ', sentence)
    sentence = re.sub(RE_URLS, r' ', sentence)
    sentence = re.sub(RE_EMAILS, r' ', sentence)
    sentence = re.sub(RE_USELESS, r' ', sentence)
    sentence = re.sub(RE_DIGIT, r' ', sentence)
    sentence = re.sub(RE_SPACE, r' ', sentence)
    sentence = re.sub(r'[a-zA-Z]', ' ', sentence)
    return sentence


def arToPersianNumb(number):
    dic = {
        '١': '۱',
        '٢': '۲',
        '٣': '۳',
        '٤': '۴',
        '٥': '۵',
        '٦': '۶',
        '٧': '۷',
        '٨': '۸',
        '٩': '۹',
        '٠': '۰',
    }
    return multiple_replace(dic, number)


def arToPersianChar(userInput):
    dic = {
        'ك': 'ک',
        'دِ': 'د',
        'بِ': 'ب',
        'زِ': 'ز',
        'ذِ': 'ذ',
        'شِ': 'ش',
        'سِ': 'س',
        'ى': 'ی',
        'ي': 'ی'
    }
    return multiple_replace(dic, userInput)


def faToEnglishNumb(number):
    dic = {
        '۰': '0',
        '۱': '1',
        '۲': '2',
        '۳': '3',
        '۴': '4',
        '۵': '5',
        '۶': '6',
        '۷': '7',
        '۸': '8',
        '۹': '9',
    }
    return multiple_replace(dic, number)


def multiple_replace(dic, text):
    pattern = "|".join(map(re.escape, dic.keys()))
    return re.sub(pattern, lambda m: dic[m.group()], str(text))

In [9]:
def clean_all(document):
    clean = ''
    for sentence in document:
        sentence = clean_sentence(sentence)
        clean += sentence
    return (clean)

## Remove stop words

In [10]:
import pandas as pd

stopwords = list(pd.read_excel ('STOPWORDS.xlsx', header=None)[0])
# read_file.to_csv ('STOPWORDS.csv', index = None, header=None)
# stopwords

In [11]:
# stopwords = list(pd.read_csv('STOPWORDS.csv',header=None)[0])
# stopwords

**finding_all_unique_words_and_freq

In [12]:
def finding_all_unique_words_and_freq(words):
    words_unique = []
    word_freq = {}
    for word in words:
        if word not in words_unique:
            words_unique.append(word)
    for word in words:
        word_freq[word] = words.count(word)
    return word_freq

## Preprocessing

In [13]:
import os
# all the unique words in the file is stored in word_freq_in_doc.
import hazm
dict_global = {}
files_with_index = {}
idx = 0
news1 = []
normalizer = hazm.Normalizer()
for review in news['body']:
    sentences = normalizer.normalize(clean_all(review))
    ##Convert to list from string
    itemtokenized = hazm.word_tokenize(sentences)
    lem = hazm.Lemmatizer()
    itemlemmatized = []
    for word in itemtokenized:
        if word not in stopwords:
            itemlemmatized.append(lem.lemmatize(word))
#     myStem = FindStems()
#     itemStemed = []
#     temp = []
#     for word in itemtokenized:
#         temp.append(myStem.convert_to_stem(word))
#     itemStemed.append(' '.join(temp))
#     print(len(itemlemmatized))
    text = " ".join(itemlemmatized)
    news1.append(text)
    dict_global.update(finding_all_unique_words_and_freq(itemtokenized))

#     fname = review
    files_with_index[idx] = os.path.basename(review)
    idx = idx + 1
    
unique_words_all = set(dict_global.keys())
len(unique_words_all)
# unique_words_all

2332

In [14]:
# unique_words_all

In [15]:
df = pd.DataFrame(unique_words_all)
df.rename(columns={0:'word'} , inplace=True)
df.to_csv('Words.csv')

## Preprocess finished

In [16]:
# len(dict_global.keys())

In [17]:
# len(dict_global.values())

In [18]:
# dict_global

In [19]:
word_freq_in_doc = dict_global

##  Dictionary

In [20]:
Dictionary = []
for i,doc in enumerate(news1):
    Dictionary += doc.split(" ")
# Dictionary

## PostingList

In [21]:
vocab = []
postings = {}

for i,doc in enumerate(news1):
    token_list = doc.split(' ')
    for word in token_list:
        c = token_list.count(word)
        if word not in postings.keys():
            dic ={i:c} 
            postings[word] = dic
            
        elif i not in postings[word].keys():
            a = postings[word]
            a[i] = c
            
print(len(postings.keys()))

1736


In [22]:
data_items = postings.items()
data_list = list(data_items)

df_postingList = pd.DataFrame(data_list)
df_postingList.rename(columns={0:'word'} , inplace=True)
df_postingList.rename(columns={1:'PostingList'} , inplace=True)
df_postingList

Unnamed: 0,word,PostingList
0,گزارش,"{0: 1, 1: 2, 2: 1, 3: 1, 4: 2, 5: 1, 6: 1, 7: ..."
1,حوزه,"{0: 1, 4: 5, 6: 1, 14: 1, 21: 1}"
2,قرآن,{0: 1}
3,عترت,{0: 1}
4,گروه,"{0: 1, 1: 1, 2: 1, 3: 1, 4: 4, 5: 1, 6: 1, 7: ..."
...,...,...
1731,سراوان,{29: 1}
1732,خاش,{29: 1}
1733,نیکشهر,{29: 1}
1734,راسک,{29: 1}


### saving postingList to file

In [23]:
import numpy as np
df_postingList.to_csv(r'postingList.txt', header=None, index=None, sep=' ', mode='a')

## Boolean_Matrix

In [24]:
import pandas as pd
values = list(set([ x for y in postings.values() for x in y]))
data = {}
for key in sorted(postings.keys()):
    data[key] = [ 1 if value in postings[key] else 0 for value in values ]
boolean_matrix = pd.DataFrame(data, index=values)
boolean_matrix.head()

Unnamed: 0,آئین,آب,آبرو,آتا,آتش,آتی,آثار,آخر,آخرین,آداب,...,یادآور,یادماندنی‌ترین,یارانه,یافت#یاب,ید,یدک,یمن,یون,یونسکو,یکپارچگی
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0


# BooleanModel

In [25]:
# query = input('## Enter your query: ')
# tokenized_query = hazm.word_tokenize(query)
# lemmatized_query = [lem.lemmatize(word) for word in tokenized_query if not word in stopwords] 
# query = lemmatized_query

In [26]:
# CountOfNews = 30
# Finalvector =[1]*CountOfNews
# for word in query:
#     print('word: ',word)
#     QueryVector =[0]*CountOfNews
#     if word in postings.keys():
#         doc_freq = postings[word]
#         for k in doc_freq:
#             QueryVector[k]= 1 
#         print('WordVector', QueryVector)    
#     QueryVector = np.array(QueryVector) 
#     Finalvector *= QueryVector
# Finalvector = list(Finalvector)
# print('FinalVector', Finalvector)
# for  i,elm in enumerate(Finalvector):
#     if elm == 1:
#         print("document: ")
#         print(news1[i])   

In [27]:
# CountOfNews = 30
# Finalvector =[0]*CountOfNews
# for word in query:
#     print('word: ',word)
#     QueryVector =[0]*CountOfNews
#     if word in postings.keys():
#         doc_freq = postings[word]
#         for k in doc_freq:
#             QueryVector[k]= 1 
#         print('WordVector', QueryVector)    
#     QueryVector = np.array(QueryVector) 
#     Finalvector += QueryVector
# Finalvector = list(Finalvector)
# print('FinalVector', Finalvector)
# for  i,elm in enumerate(Finalvector):
#     if elm > 0:
#         print("document: ")
#         print(news1[i])

In [28]:
# CountOfNews = 30
# Finalvector =[1]*CountOfNews
# for word in query:
#     print('word: ',word)
#     QueryVector =[0]*CountOfNews
#     if word in postings.keys():
#         doc_freq = postings[word]
#         for k in doc_freq:
#             QueryVector[k]= 1 
#         print('WordVector', QueryVector)    
#     QueryVector = np.array(QueryVector) 
#     Finalvector += -1 * QueryVector
# Finalvector = list(Finalvector)
# print('FinalVector', Finalvector)
# for  i,elm in enumerate(Finalvector):
#     if elm == 1:
#         print("document: ")
#         print(news1[i])

In [29]:
# data = pd.DataFrame(data)
# data.rename(columns={0:'body'} , inplace=True)
# data.head()
# data['word_count'] = data['body'].apply(lambda x: len(str(x).split(" ")))
# data[['body','word_count']].head()

In [30]:
# news1 = pd.DataFrame(news1)
# news1.rename(columns={0:'body'} , inplace=True)
# news1.head()
# news1['word_count'] = news1['body'].apply(lambda x: len(str(x).split(" ")))
# news1[['body','word_count']].head()

In [31]:
postings['گزارش'][12]

1

# tf-idf Model

In [32]:
values = list(set([ x for y in postings.values() for x in y]))
data = {}
for key in sorted(postings.keys()):
    data[key] = [ postings[key][value] if value in postings[key] else 0 for value in values ]
tfTemp = pd.DataFrame(data, index=values)
tfTemp.head()

Unnamed: 0,آئین,آب,آبرو,آتا,آتش,آتی,آثار,آخر,آخرین,آداب,...,یادآور,یادماندنی‌ترین,یارانه,یافت#یاب,ید,یدک,یمن,یون,یونسکو,یکپارچگی
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,3,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0


In [33]:
cv = CountVectorizer()
cv_fit = cv.fit_transform(news1)

names = cv.get_feature_names()  # This are the entity names (i.e. keywords)
df = pd.DataFrame(data=cv_fit.toarray(), columns=names)
# df.to_csv('outputOfWordFrequency.csv', sep=',', encoding="utf-8")
df.head()

Unnamed: 0,آئین,آب,آبرو,آتا,آتش,آتی,آثار,آخر,آخرین,آداب,...,یادآور,یادماندنی,یارانه,یافت,ید,یدک,یمن,یون,یونسکو,یکپارچگی
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,3,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0


# More ...

## Most common and uncommon words

In [34]:
reviews1 = list(news['body'])

In [35]:
freq = pd.Series(' '.join(news1).split()).value_counts()
freq.to_frame()

Unnamed: 0,0
سال,84
استان,77
خبرنگار,47
کرد#کن,40
جوان,38
...,...
سلف,1
خط,1
نابلس,1
وحشیانه,1


In [36]:
#Identify common words
f = list(pd.DataFrame(freq).index)
f

['سال',
 'استان',
 'خبرنگار',
 'کرد#کن',
 'جوان',
 'گروه',
 'پیام',
 'گزارش',
 'کشور',
 'نوروز',
 'باشگاه',
 'ایران',
 'تولد',
 'انتهای',
 'تاریخ',
 'شد#شو',
 'ماه',
 'نوزاد',
 'رئیس',
 'افزود#افزا',
 'آب',
 'عشایر',
 'گردشگری',
 'برنامه',
 'قرار',
 'همدان',
 'داد#ده',
 'دست',
 'شرکت',
 'اسلام',
 'ترامپ',
 'اقتصاد',
 'سازمان',
 'افزایش',
 'ملت',
 'آمار',
 'شهر',
 'داشت#دار',
 'ادامه',
 'تخصیص',
 'طرح',
 'معاون',
 'بهداشت',
 'ثبت',
 'بیان',
 'کرمان',
 'آن',
 'بود#باش',
 'حدود',
 'استاندار',
 'میلیارد',
 'مدیر',
 'مراسم',
 'تعداد',
 'دشمن',
 'مدرسه',
 'نفر',
 'کره',
 'دانش',
 'همراه',
 'تومان',
 'آمریکا',
 'توجه',
 'تولید',
 'جمهوری',
 'مسئله',
 'اتوبوس',
 'مسافر',
 'پلیس',
 'کرمانشاه',
 'منابع',
 'وزارت',
 'پایگاه',
 'امیدوار',
 'اطلاعات',
 'نیرو',
 'نسبت',
 'کودک',
 'فیلم',
 'برگزار',
 'ع',
 'طبیعت',
 'انقلاب',
 'کار',
 'لاکچری',
 'حضور',
 'سوریه',
 'توسعه',
 'نظر',
 'جشن',
 'شهید',
 'ستاد',
 'دستگاه',
 'همکار',
 'حوزه',
 'درصد',
 'عزیز',
 'دانست#دان',
 'ملی',
 'میلیون',
 'تبریک',
 'مش

In [37]:
#Identify uncommon words
freq1 =  pd.Series(' '.join(news1).split()).value_counts()[-20:]
freq1

مقاوم       1
سازی        1
سومین       1
جلسات       1
شبانگاهی    1
العهد       1
اسماعیل     1
دفتر        1
جنبش        1
حماس        1
بزرگداشت    1
عمر         1
ابولیلی     1
مجری        1
طلبانه      1
سلف         1
خط          1
نابلس       1
وحشیانه     1
سرباز       1
dtype: int64

## top_n_frequent_words

In [38]:
from matplotlib import font_manager as fm, rcParams
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer().fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in      
                   vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], 
                       reverse=True)
    return words_freq[:n]
#Convert most freq words to dataframe for plotting bar plot
top_words = get_top_n_words(news1, n=20)
top_df = pd.DataFrame(top_words)
top_df.columns=["Freq", "Word"]
top_df


Unnamed: 0,Freq,Word
0,سال,84
1,استان,77
2,خبرنگار,47
3,کرد,40
4,کن,40
5,جوان,38
6,گروه,36
7,پیام,36
8,گزارش,35
9,کشور,35


## top_n2_frequent_words

In [39]:
# Most frequently occuring Bi-grams
def get_top_n2_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(2,2),  
            max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     
                  vec1.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], 
                reverse=True)
    return words_freq[:n]
top2_words = get_top_n2_words(news1, n=20)
top2_df = pd.DataFrame(top2_words)
top2_df.columns=["Bi-gram","Freq"]
top2_df

Unnamed: 0,Bi-gram,Freq
0,کرد کن,40
1,باشگاه خبرنگار,33
2,خبرنگار جوان,33
3,انتهای پیام,30
4,شد شو,29
5,افزود افزا,22
6,گزارش گروه,19
7,داد ده,18
8,داشت دار,15
9,بود باش,13


## top_n3_frequent_words

In [40]:
#Most frequently occuring Tri-grams
def get_top_n3_words(corpus, n=None):
    vec1 = CountVectorizer(ngram_range=(3,3), 
           max_features=2000).fit(corpus)
    bag_of_words = vec1.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in     
                  vec1.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], 
                reverse=True)
    return words_freq[:n]
top3_words = get_top_n3_words(news1, n=20)
top3_df = pd.DataFrame(top3_words)
top3_df.columns=["Tri-gram", "Freq"]
top3_df


Unnamed: 0,Tri-gram,Freq
0,باشگاه خبرنگار جوان,33
1,گروه استان باشگاه,12
2,استان باشگاه خبرنگار,12
3,خبرنگار جوان نقل,9
4,خبرنگار گروه استان,7
5,گزارش خبرنگار گروه,6
6,حدود میلیارد تومان,6
7,گزارش گروه الملل,5
8,گروه الملل باشگاه,5
9,الملل باشگاه خبرنگار,5


### stopwords in English

In [41]:
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# print(stopwords.words('english'))

In [42]:
news2 = pd.DataFrame(news1)
news2.rename(columns={0:'body'} , inplace=True)
news2['word_count'] = news2['body'].apply(lambda x: len(str(x).split(" ")))
news2[['body','word_count']].head()

Unnamed: 0,body,word_count
0,گزارش حوزه قرآن عترت گروه فرهنگ باشگاه خبرنگار...,116
1,گزارش گروه الملل باشگاه خبرنگار جوان نقل رویتر...,201
2,گزارش گروه الملل باشگاه خبرنگار جوان نقل پایگا...,129
3,گزارش گروه سیاسی باشگاه خبرنگار جوان حضرت آیت ...,414
4,گزارش خبرنگار حوزه بهداشت درمان گروه علم پزشک ...,591


In [43]:
boolean_matrix.T.sum()

0      85
1     137
2      84
3     251
4     259
5      86
6      76
7     163
8      73
9     123
10     48
11     60
12     45
13     63
14    186
15    116
16     81
17     69
18    121
19    108
20     17
21     79
22    223
23    130
24     72
25     83
26     65
27     92
28     69
29     90
dtype: int64

In [44]:
def tfCalculator(textList, boolean_matrix):
    values = list(set([ x for y in postings.values() for x in y]))
    data = {}
    for key in sorted(postings.keys()):
        data[key] = [ postings[key][value] if value in postings[key] else 0 for value in values ]
    dff = pd.DataFrame(data, index=values)
#     for i in range(len(dff)):
#         dff.iloc[i,:] = dff.iloc[i,:] / boolean_matrix.T.sum()[i]
    return dff
tf = tfCalculator(news1, boolean_matrix)
tf

Unnamed: 0,آئین,آب,آبرو,آتا,آتش,آتی,آثار,آخر,آخرین,آداب,...,یادآور,یادماندنی‌ترین,یارانه,یافت#یاب,ید,یدک,یمن,یون,یونسکو,یکپارچگی
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,3,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,1,1,0,0
5,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
def tf_wtCalculator(tf):
    
    return (1 + np.log10(tf).replace(-np.inf, -1))
tf_wt = tf_wtCalculator(tf)
tf_wt

Unnamed: 0,آئین,آب,آبرو,آتا,آتش,آتی,آثار,آخر,آخرین,آداب,...,یادآور,یادماندنی‌ترین,یارانه,یافت#یاب,ید,یدک,یمن,یون,یونسکو,یکپارچگی
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.778151,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.477121,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.30103,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
def lenDocs(wt):
    return np.sqrt(np.square(wt).T.sum())

In [60]:
def NormWT(wt, lenList):
    return wt / lenList

In [61]:
# # calculating tf-idf
# tfIdfVectorizer = TfidfVectorizer()
# responseTfIdf = tfIdfVectorizer.fit_transform(news1)
# # writerTfIdf.writerows(responseTfIdf)
# # print(responseTfIdf)
# df1 = pd.DataFrame(data=responseTfIdf.toarray(), columns=names)
# df1.head()

In [62]:
# query = input('## Enter your query: ')
# tokenized_query = hazm.word_tokenize(query)
# lemmatized_query = [lem.lemmatize(word) for word in tokenized_query if not word in stopwords] 
# query = lemmatized_query

In [63]:
# queryTF  = dfCalculator([query])
# # queryTF 

In [65]:
def dfTextCalculator(postings):
    DF = []
    for  word in sorted(postings.keys()):
        DF.append(len(postings[word]))
    
    return pd.DataFrame(np.reshape(DF, (1,-1)), columns=list(sorted(postings.keys())))

df = dfTextCalculator(postings)
df

Unnamed: 0,آئین,آب,آبرو,آتا,آتش,آتی,آثار,آخر,آخرین,آداب,...,یادآور,یادماندنی‌ترین,یارانه,یافت#یاب,ید,یدک,یمن,یون,یونسکو,یکپارچگی
0,1,5,1,1,1,1,2,1,3,1,...,2,1,1,1,1,1,1,1,1,1


In [67]:
def idfCalculator(dfMatrix, lenNews):
#     for col in dfMatrix.columns:
    return (1 + np.log10(lenNews) / dfMatrix).replace(np.inf, np.nan).dropna(axis=1)
newsidf = idfCalculator(dfTextCalculator(postings), len(news1))
newsidf

Unnamed: 0,آئین,آب,آبرو,آتا,آتش,آتی,آثار,آخر,آخرین,آداب,...,یادآور,یادماندنی‌ترین,یارانه,یافت#یاب,ید,یدک,یمن,یون,یونسکو,یکپارچگی
0,2.477121,1.295424,2.477121,2.477121,2.477121,2.477121,1.738561,2.477121,1.492374,2.477121,...,1.738561,2.477121,2.477121,2.477121,2.477121,2.477121,2.477121,2.477121,2.477121,2.477121


In [68]:
def tf_idf(tf, newsidf):
    return pd.DataFrame(np.multiply(tf, newsidf).sum()).sort_values(0, ascending=False)
tf_idf(tf, newsidf)

  return pd.DataFrame(np.multiply(tf, newsidf).sum()).sort_values(0, ascending=False)


Unnamed: 0,0
سال,90.893233
استان,84.582556
تولد,79.267880
نوزاد,59.450910
عشایر,52.019546
...,...
رغم,2.477121
رعایت,2.477121
رضایت,2.477121
رسیدگی,2.477121


In [69]:
def dfQueryCalculator(textList):
    DF = []
    for  word in textList[0].split(' '):
        try:
            DF.append(len(postings[word]))
        except:
            DF.append(0)
    return pd.DataFrame(np.reshape(DF, (1,-1)), columns=textList[0].split(' '))

In [70]:
def tfQueryCalculator(textList, boolean_matrix):
    cv = CountVectorizer()
    cv_fit = cv.fit_transform(textList)
    names = cv.get_feature_names()
    dff = pd.DataFrame(data=cv_fit.toarray(), columns=names)
#     for i in range(len(dff)):
#         dff.iloc[i,:] = dff.iloc[i,:] / textLen
    return dff

In [75]:
def wtCalculator(command):
    querytf = tfQueryCalculator([query], boolean_matrix)
    tf = tfCalculator(news1, boolean_matrix)
    tf_wt = tf_wtCalculator(tf)
    querytf_wt = tf_wtCalculator(querytf)
    if command[0] == 


'ltc.lnc'

In [57]:
query = input('## Enter your query: ')
command = input('Please, Enter your command: ')
tokenized_query = hazm.word_tokenize(query)
query = ''
for word in tokenized_query:
    if not word in stopwords:
        query = query + ' ' +lem.lemmatize(word)
[query]       

## Enter your query: ایران «اطلاعات حساسی» را در اختیار دارد
Please, Enter your command: ltc.lnc


[' ایران « اطلاعات حساس » اختیار']

In [71]:
dfQueryCalculator([query])

Unnamed: 0,Unnamed: 1,ایران,«,اطلاعات,حساس,»,اختیار
0,0,12,0,2,1,0,3


In [72]:
queryidf = idfCalculator(dfQueryCalculator([query]), len(news1))
querytf = tfQueryCalculator([query], boolean_matrix)
qureyTF_IDF = tf_idf(querytf, queryidf)
qureyTF_IDF

  return pd.DataFrame(np.multiply(tf, newsidf).sum()).sort_values(0, ascending=False)


Unnamed: 0,0
ایران,2.477121
اطلاعات,1.738561
حساس,1.492374
اختیار,1.123093


In [73]:
np.multiply(querytf , queryidf)

  np.multiply(querytf , queryidf)


Unnamed: 0,اختیار,اطلاعات,ایران,حساس
0,1.123093,1.738561,2.477121,1.492374


In [74]:
querytf

Unnamed: 0,اختیار,اطلاعات,ایران,حساس
0,1,1,1,1


In [332]:
queryidf

Unnamed: 0,ایران,اطلاعات,حساس,اختیار
0,1.123093,1.738561,2.477121,1.492374


# khodam

In [55]:
query = input('## Enter your query: ')
tokenized_query = hazm.word_tokenize(query)
query = ''
for word in tokenized_query:
    if not word in stopwords:
        query = query + ' ' +lem.lemmatize(word)
        

## Enter your query: دانشگاه امام صادق


In [56]:
def tfCalculator(textList, boolean_matrix):
    cv = CountVectorizer()
    cv_fit = cv.fit_transform(textList)
    names = cv.get_feature_names()
    dff = pd.DataFrame(data=cv_fit.toarray(), columns=names)
    for i in range(len(dff)):
        dff.iloc[i,:] = dff.iloc[i,:]
    return dff
tf = tfCalculator([query], boolean_matrix)
tf.head()

Unnamed: 0,امام,دانشگاه,صادق
0,1,1,1


In [57]:
def dfCalculator(postings):
    DF = []
    for  word in tokenized_query:
        DF.append(len(postings[word]))
    
    return pd.DataFrame(np.reshape(DF, (1,-1)), columns=tokenized_query)

df = dfCalculator(postings)
df

Unnamed: 0,دانشگاه,امام,صادق
0,1,4,2


In [58]:
def idfCalculator(dfMatrix, lenNews):
#     for col in dfMatrix.columns:
    return (1 + np.log10(lenNews / dfMatrix))
newsidf = idfCalculator(dfCalculator(postings), len(query))
newsidf

Unnamed: 0,دانشگاه,امام,صادق
0,2.255273,1.653213,1.954243


In [59]:
CountOfNews = 30
Finalvector =[0]*CountOfNews
for word in tokenized_query:
    print('word: ',word)
    QueryVector =[0]*CountOfNews
    if word in postings.keys():
        doc_freq = postings[word]
        for k in doc_freq:
            QueryVector[k]= 1 
        print('WordVector', QueryVector)    
    QueryVector = np.array(QueryVector) 
    Finalvector += QueryVector
Finalvector = list(Finalvector)
print('FinalVector', Finalvector)
print("documents that contains tokens_query: ")
for  i,elm in enumerate(Finalvector):
    if elm > 0:
        print(i)
        tf = tfCalculator([news1[i]], boolean_matrix)

word:  دانشگاه
WordVector [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
word:  امام
WordVector [0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
word:  صادق
WordVector [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
FinalVector [0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 0]
documents that contains tokens_query: 
3
4
7
14
22
28


In [60]:
for  i,elm in enumerate(Finalvector):
    tf = tfCalculator([news1[i]], boolean_matrix)

In [61]:
tf0 = tfCalculator([news1[0]], boolean_matrix)
tf0

Unnamed: 0,آخرین,آغاز,اخلاق,ادامه,اساتید,استان,اسلام,اعتکاف,امامزادگان,اماکن,...,ولادت,پرشور,پیام,چشمگیر,کرد,کشور,کن,گروه,گزارش,گونه
0,1,2,1,1,1,1,1,5,1,2,...,1,1,1,1,2,1,2,1,1,1


In [62]:
tf1 = tfCalculator([news1[1]], boolean_matrix)
tf1

Unnamed: 0,آلمان,اتهام,اختیار,ادامه,ارائه,ارتش,اسرائیل,اشاره,اصل,اطلاع,...,گذار,گذاشت,گرفت,گروه,گزارش,گفت,گفتگو,گو,گیر,ید
0,1,2,2,1,1,1,1,1,1,1,...,1,1,1,1,2,1,2,1,1,1
