# Information retrieval

## Homework 1

## Persian Text Pre-processing

#### Ali Mojahed - 9812762554
#### Mehrnoosh Navidimehr - 9822762119
#### Minoo Mohaghegh - 9812762270
#### Helia Ghahraman - 9822762437



# Load Dataset

In [None]:
!git clone https://github.com/Text-Mining/Useful-Corpora-for-Text-Mining-in-Persian-Language.git
!unrar x '/content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar'

fatal: destination path 'Useful-Corpora-for-Text-Mining-in-Persian-Language' already exists and is not an empty directory.

UNRAR 5.61 beta 1 freeware      Copyright (c) 1993-2018 Alexander Roshal


Extracting from /content/Useful-Corpora-for-Text-Mining-in-Persian-Language/News/FarsNews 97/farsnews.part01.rar


Would you like to replace the existing file farsnews.json
1314018076 bytes, modified on 2019-07-16 10:46
with a new one
1314018076 bytes, modified on 2019-07-16 10:46

[Y]es, [N]o, [A]ll, n[E]ver, [R]ename, [Q]uit 

In [None]:
import json
# The corpus file is parsed one line at a time: every non-blank line is
# decoded as its own JSON document and becomes one entry of `news`.
# 'utf-8-sig' drops a leading byte-order mark if the file has one.
# The `with` block guarantees the (very large) file is closed afterwards.
with open('/content/farsnews.json', 'r', encoding='utf-8-sig') as news_file:
  news = [json.loads(line) for line in news_file if line.strip()]

# Pre-processing Pipeline

In [None]:
import pandas as pd
# Stopword list used to drop stopwords from the token lists.
# The file is read line by line instead of with pd.read_fwf: read_fwf infers
# fixed column widths from the first rows only, so a later word longer than
# those rows would be silently cut off.
# Assumes one stopword per line (the original kept column 0 of each row).
from urllib.request import urlopen

with urlopen('https://raw.githubusercontent.com/sobhe/hazm/master/hazm/data/stopwords.dat') as response:
  stopword_lines = response.read().decode('utf-8-sig').splitlines()

# Kept as a list so existing `in stopwords` checks keep working unchanged.
stopwords = [word.strip() for word in stopword_lines if word.strip()]

In [None]:
import re
from string import punctuation as punctuation_str

def remove_symbols_and_numbers(content):
  """Strip the trailing "end of message" footer, punctuation and digits.

  For each known footer marker, the last 200 characters of `content` are
  searched; if the marker is found there, the text is cut one character
  before the marker (dropping the character that precedes it as well).
  Afterwards ASCII punctuation, the listed Persian/Arabic symbols and the
  listed digits are deleted.

  Args:
    content: the raw text (str).

  Returns:
    The cleaned text (str).
  """
  end_of_msg = ('انتهای پیام', 'انتهای‌پیام', '\r\nانتهای\r\nپیام/ک')
  tail_length = 200
  for marker in end_of_msg:
    # Search with an absolute start index. The previous negative-offset
    # slicing assumed the tail was exactly 200 characters long and erased
    # the whole text whenever `content` was shorter than that.
    tail_start = max(len(content) - tail_length, 0)
    marker_pos = content.find(marker, tail_start)
    if marker_pos != -1:
      # Clamp at 0 so a marker at the very beginning yields an empty string
      # rather than a negative (end-relative) slice bound.
      content = content[:max(marker_pos - 1, 0)]

  # Remove punctuation & Numbers
  content = re.sub(f'[{punctuation_str}؟!،,?،٪×÷»«><]', '', content)
  content = re.sub('[0123456789۰١۱۲۳۴۵۶۷۸۹؛–_‘]', '', content)

  return content

In [None]:
def pipeline(text, normalizer, tokenizer, stemmer, method_name):
  """Run one library's normalizer, tokenizer and stemmer over `text`.

  Steps: strip symbols/digits, normalize, tokenize, drop tokens that are in
  the module-level `stopwords` or have length <= 1, then stem what is left.

  Args:
    text: raw input string.
    normalizer: callable applied to the symbol-free text.
    tokenizer: callable applied to the normalized text; its result is
      iterated to obtain tokens.
    stemmer: callable applied to each surviving token.
    method_name: label stored under the 'method' key.

  Returns:
    dict with keys 'original', 'no_symbol', 'normalized', 'tokens',
    'stems' and 'method'.
  """
  cleaned = remove_symbols_and_numbers(text)
  normalized = normalizer(cleaned)

  # Stopword check first, length check second, for every token.
  kept_tokens = [
      tok for tok in tokenizer(normalized)
      if tok not in stopwords and len(tok) > 1
  ]

  return {
      'original': text,
      'no_symbol': cleaned,
      'normalized': normalized,
      'tokens': kept_tokens,
      'stems': [stemmer(tok) for tok in kept_tokens],
      'method': method_name,
  }




# Hazm
### https://github.com/roshan-research/hazm

In [None]:
!pip install hazm

In [None]:
from hazm import Normalizer, WordTokenizer, Stemmer
# One instance of each Hazm component, passed to pipeline() by a later cell.
hazm_normalizer = Normalizer()
# Backward-compatible alias: the pipeline call for Hazm refers to the
# normalizer under this (misspelled) name.
hzam_normalizer = hazm_normalizer
hazm_tokenizer = WordTokenizer()
hazm_stemmer = Stemmer()



In [None]:
# Hazm run over the body of the first news item.
result_h = pipeline(
    text=news[0]['NewsBody'],
    normalizer=hzam_normalizer.normalize,
    tokenizer=hazm_tokenizer.tokenize,
    stemmer=hazm_stemmer.stem,
    method_name='hazm',
)

# Parsivar
### https://github.com/ICTRC/Parsivar

In [None]:
!pip install parsivar

In [None]:
import parsivar
# One instance of each Parsivar component, passed to pipeline() by a later cell.
parsivar_normalizer = parsivar.Normalizer()
parsivar_tokenizer = parsivar.Tokenizer()
parsivar_stemmer = parsivar.FindStems()
# Backward-compatible alias: the pipeline call for Parsivar refers to the
# stemmer under this (misspelled) name.
parsiavr_stemmer = parsivar_stemmer


In [None]:
# Parsivar run over the body of the first news item.
result_p = pipeline(
    text=news[0]['NewsBody'],
    normalizer=parsivar_normalizer.normalize,
    tokenizer=parsivar_tokenizer.tokenize_words,
    stemmer=parsiavr_stemmer.convert_to_stem,
    method_name='parsivar',
)

# Dadmatools
### https://github.com/Dadmatech/DadmaTools

In [None]:
!pip install dadmatools

In [None]:
from dadmatools.models.normalizer import Normalizer
import dadmatools.pipeline.language as language
def dadmatools_preprocess(text):
  """Preprocess `text` with DadmaTools and return the same dict as pipeline().

  The text is stripped of symbols/digits, normalized with
  `Normalizer(full_cleaning=True)`, then run through two separately built
  pipelines: 'tok' for the surface tokens and 'tok,lem' for the lemmas.
  Tokens and lemmas are each filtered independently (not in `stopwords`,
  length > 1), so the two lists are not guaranteed to line up.

  Args:
    text: raw input string.

  Returns:
    dict with keys 'original', 'no_symbol', 'normalized', 'tokens',
    'stems' and 'method' (always 'dadmatools').
  """
  def _content_words(words):
    # Stopword check first, length check second, for every word.
    return [w for w in words if w not in stopwords and len(w) > 1]

  cleaned = remove_symbols_and_numbers(text)
  normalized = Normalizer(full_cleaning=True).normalize(cleaned)

  tok_spec, lem_spec = 'tok', 'tok,lem'

  # Per the original author's notes the pipeline call returns a spaCy doc and
  # `.analyze_pipes(pretty=True)` on a pipeline prints its stages — not
  # verified here.
  tok_doc = language.Pipeline(tok_spec)(normalized)
  tok_json = language.to_json(tok_spec, tok_doc)

  lem_doc = language.Pipeline(lem_spec)(normalized)
  lem_json = language.to_json(lem_spec, lem_doc)

  # NOTE(review): only element [0] of each to_json result is used —
  # presumably the first sentence; confirm that later elements can be ignored.
  kept_tokens = _content_words(entry['text'] for entry in tok_json[0])
  kept_lemmas = _content_words(entry['lemma'] for entry in lem_json[0])

  return {
      'original': text,
      'no_symbol': cleaned,
      'normalized': normalized,
      'tokens': kept_tokens,
      'stems': kept_lemmas,
      'method': 'dadmatools',
  }

In [None]:
# DadmaTools run over the body of the first news item.
result_d = dadmatools_preprocess(news[0]['NewsBody'])

In [None]:
# Side-by-side comparison of the normalized text produced by each library.
# Rows are built from the result dicts themselves, so the library label
# ('method') and the input ('original') always match what produced the output.
pd.DataFrame(
    columns=('library', 'input', 'output'),
    data=[(res['method'], res['original'], res['normalized'])
          for res in (result_h, result_p, result_d)],
)

In [None]:
# Side-by-side comparison of the filtered token lists produced by each library.
# Rows are built from the result dicts themselves, so the library label
# ('method') and the input ('original') always match what produced the output.
pd.DataFrame(
    columns=('library', 'input', 'output'),
    data=[(res['method'], res['original'], res['tokens'])
          for res in (result_h, result_p, result_d)],
)

In [None]:
# Side-by-side comparison of the stems/lemmas produced by each library.
# Rows are built from the result dicts themselves, so the library label
# ('method') and the input ('original') always match what produced the output.
pd.DataFrame(
    columns=('library', 'input', 'output'),
    data=[(res['method'], res['original'], res['stems'])
          for res in (result_h, result_p, result_d)],
)

In [None]:
# Display the complete DadmaTools result dict (original, no_symbol, normalized, tokens, stems, method).
result_d

In [None]:
# Display the complete Hazm result dict (original, no_symbol, normalized, tokens, stems, method).
result_h

In [None]:
# Display the complete Parsivar result dict (original, no_symbol, normalized, tokens, stems, method).
result_p