# Setup Stanza

In [33]:
import stanza
import spacy_stanza

# Install Stanford CoreNLP through Stanza's helper.
# NOTE(review): no cell visible here uses CoreNLP (only the neural "fa" pipeline
# is loaded below) — confirm this step is still needed.
stanza.install_corenlp()



In [34]:
# Download the default Stanza model package for Persian ("fa").
stanza.download('fa')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.3.0.json: 142kB [00:00, 82.2MB/s]                    
[2022-04-12 15:26:33,070 INFO] Downloading default packages for language: fa (Persian)...
[2022-04-12 15:26:35,642 INFO] File exists: /home/ahura/stanza_resources/fa/default.zip.
[2022-04-12 15:26:39,851 INFO] Finished downloading models and saved to /home/ahura/stanza_resources.


In [35]:
# Load the Persian Stanza models through spacy_stanza; per the load log this
# brings up the tokenize, mwt, pos, lemma and depparse processors.
stanza_nlp = spacy_stanza.load_pipeline("fa")

[2022-04-12 15:26:39,999 INFO] Loading these models for language: fa (Persian):
| Processor | Package |
-----------------------
| tokenize  | perdt   |
| mwt       | perdt   |
| pos       | perdt   |
| lemma     | perdt   |
| depparse  | perdt   |

[2022-04-12 15:26:40,000 INFO] Use device: gpu
[2022-04-12 15:26:40,001 INFO] Loading: tokenize
[2022-04-12 15:26:40,012 INFO] Loading: mwt
[2022-04-12 15:26:40,021 INFO] Loading: pos
[2022-04-12 15:26:40,218 INFO] Loading: lemma
[2022-04-12 15:26:40,281 INFO] Loading: depparse
[2022-04-12 15:26:40,668 INFO] Done loading processors!


# Setup DadmaTools

In [36]:
import dadmatools.pipeline.language as language

# Comma-separated names of the DadmaTools components to load:
# ner, pos, dep, cons, chunk, lem and tok.
# The tokenizer is the default tool, so it is loaded even when not listed.
pips = 'ner, pos, dep, cons, chunk, lem, tok' 
dadma_nlp = language.Pipeline(pips)

# Print an overview of the components in the loaded pipeline.
print(dadma_nlp.analyze_pipes(pretty=True))

# Calling the pipeline on a string yields a spaCy object, e.g.:
# doc = dadma_nlp('از قصهٔ کودکیشان که می‌گفت، گاهی حرص می‌خورد!')

Model fa_tokenizer exists in /home/ahura/.pernlp/fa_tokenizer.pt
Model fa_mwt exists in /home/ahura/.pernlp/fa_mwt.pt
Model fa_lemmatizer exists in /home/ahura/.pernlp/fa_lemmatizer.pt
Model parsbert exists in /home/ahura/.pernlp/parsbert.tar.gz
Model dependencyparser exists in /home/ahura/.pernlp/dependencyparser.pt
2022-04-12 15:26:44,234 loading file /home/ahura/anaconda3/envs/DataEnvPIP/lib/python3.8/site-packages/dadmatools/saved_models/dependencyparser/dependencyparser.pt
Model parsbert exists in /home/ahura/.pernlp/parsbert.tar.gz
Model postagger exists in /home/ahura/.pernlp/postagger.pt
2022-04-12 15:27:00,671 loading file /home/ahura/anaconda3/envs/DataEnvPIP/lib/python3.8/site-packages/dadmatools/saved_models/postagger/postagger.pt
Model fa_constituency exists in /home/ahura/.pernlp/fa_constituency.pt
Model ner exists in /home/ahura/.pernlp/ner.tar.gz
[1m

#   Component            Assigns       Requires   Scores   Retokenizes
-   ------------------   -----------   -------- 

### Dadma Normalizer

In [37]:
from dadmatools.models.normalizer import Normalizer

# Text normalizer applied to each sample sentence before it is parsed.
# Emails, URLs, mobile/home numbers and emojis are replaced by angle-bracket
# placeholder tokens; numbers are left alone (replace_number_with=None).
# NOTE(review): the exact effect of each flag is defined by DadmaTools — verify
# against its documentation before changing any of them.
normalizer = Normalizer(
    full_cleaning=False,
    unify_chars=True,
    refine_punc_spacing=True,
    remove_extra_space=True,
    remove_puncs=False,
    remove_html=False,
    remove_stop_word=False,
    replace_email_with="<EMAIL>",
    replace_number_with=None,
    # Fixed: the placeholder was "<URL" (missing the closing ">"), unlike every
    # other placeholder in this configuration.
    replace_url_with="<URL>",
    replace_mobile_number_with="<MOBILE_NUMBER>",
    replace_emoji_with="<EMOJI>",
    replace_home_number_with="<HOME_NUMBER>"
)

# Test Libs

### Sample Texts

In [67]:
# Sample Persian sentences for the test cells below. Only the uncommented
# entries are processed; uncomment others (keeping the trailing comma) to try them.
texts = [
    # 'روز چهارشنبه یک دفعه برای خودشون افشا زدن.'
    # "ارزش سهام شرکت نفت کاهش یافت",
    # "سهام وغذیر و خزر کاهش یافت.",
    # "قرارداد با آمریکا باعث افت قیمت سهم وغدیر شد",
    # "ریزش بازار به دلیل حمله‌ی روسیه است.",
    # "فک کنم یه اصلاح قیمتی و کمی ریزش داشته باشیم.",
    # "یک نکته‌ی تکنیکالی هم در صورت دستکاری نشدن اضافه کنم، کندلی که روز سه شنبه‌ی گذشته ثبت کرد کامل است",
    
    # "روز چهارشنبه یه دفعه برای خودشون افشا زدن",
    # "رشد قیمت‌ها باعث ایجاد صف خرید در سهم پرشیا شد",
    # "شاخص به ۲ میلیون می‌رسه",
    # "آمریکا باعث ریزش بازار شد",
    # "آمریکا موجب ریزش بازار شد",
    # "آمریکا دلیل ریزش بازار شد",
    # "کاهش قیمت سهم عجیب بود",
    # "قیمت زیاد شد",
    # "قیمت زیاد است",
    # "به کتابخانه رفتم.",
    # "به کتابخانه رفت.",
    # "پول در جیب من است.",
    # "سهم قیمتش پایین است",
    "حضور تو موجب خوشحالی من در هوای بارانی است",
]

### Stanza

In [39]:
# from spacy import displacy

# for index, text in enumerate(texts):
#     doc = stanza_nlp(text)
#     print(f'sentence {index + 1}: ...')
#     for token in doc:
#         print(f'word: {token.text:12}, pos: {token.pos_:10}, tag: {token.tag_:10}, dep: {token.dep_:15}')
#         print("\n")
        
#     displacy.render(list(doc.sents), style="dep")

In [68]:
from spacy import displacy

# Normalize each sample sentence, run it through the DadmaTools pipeline, print
# one line per token (text, POS tag, dependency label, dependency arc) and
# render the dependency parse with displaCy.
for text in texts:
    text = normalizer.normalize(text)
    try:
        doc = dadma_nlp(text)

        for sentence in doc._.sentences:
            for token in sentence:
                # pos_ is only filled when 'pos' is in the pipeline;
                # dep_ and _.dep_arc only when 'dep' is.
                print(token.text, token.pos_, token.dep_, token._.dep_arc)
                # Relabel auxiliaries as verbs after printing, so the printed
                # row keeps the original tag — presumably so the rendering
                # below shows them as VERB.
                if token.pos_ == "AUX":
                    token.pos_ = "VERB"

        # Also available when the matching component is loaded:
        # token.lemma_ ('lem'), doc._.constituency and doc._.chunks ('cons'),
        # doc._.ners ('ner').

        print("\n\n\n")

        displacy.render(doc, style="dep")

    except Exception as exc:
        # Best-effort: move on to the next sentence, but report why this one
        # failed instead of silently discarding the error.
        print(f"ERRR {text}: {exc!r}")
    
        

[2022-04-12 15:57:59,210 INFO] [Ensembling dict with seq2seq lemmatizer...]


حضور NOUN nsubj 3
تو PRON nmod 1
موجب ADJ root 0
خوشحالی NOUN obl:arg 3
من PRON nmod 4
در ADP case 7
هوای NOUN nmod 4
بارانی ADJ amod 7
است AUX cop 3






## [Pattern Matching](https://spacy.io/usage/spacy-101#architecture-matchers)

