-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
- Loading branch information
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
Count the lines in a particular file (Windows PowerShell): | ||
Get-Content .\samples\output\all_arabic_wiki_2019.txt.MyD3.tok | Measure-Object -Line | ||
|
||
Copy the first 100 lines from one file to another (Windows PowerShell): | ||
Get-Content -First 100 .\samples\output\all_arabic_wiki_2019.txt.MyD3.tok -Encoding UTF8 | Out-File .\samples\output\100sample.txt -Encoding UTF8 | ||
|
||
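Run MADAMIRA (make sure you are using a 64-bit JDK, not a 32-bit one; the 3000 MB heap requested by -Xmx3000m below is typically too large for a 32-bit JVM): | ||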
C:\Users\oae15\Downloads\JDK64\bin\java.exe -Xmx3000m -Xms3000m -XX:NewRatio=3 -jar MADAMIRA-release-20170403-2.1.jar -rawinput "C:\Python\Python36\coding4fun\Notebooks\fastai\for research\Ar_LM\data\ASTD_text_only.txt" -rawoutdir .\samples\output -rawconfig .\samples\sampleConfigFile.xml -msaonly |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
<?xml version="1.0" encoding="utf-8"?> | ||
|
||
<!-- | ||
~ Copyright (c) 2013. The Trustees of Columbia University in the City of New York. | ||
~ The copyright owner has no objection to the reproduction of this work by anyone for | ||
~ non-commercial use, but otherwise reserves all rights whatsoever. For avoidance of | ||
~ doubt, this work may not be reproduced, or modified, in whole or in part, for commercial | ||
~ use without the prior written consent of the copyright owner. | ||
--> | ||
|
||
<madamira_configuration xmlns="urn:edu.columbia.ccls.madamira.configuration:0.1"> <!-- Summary: UTF8 input and output, dialect MSA, output_analyses TOP, morph_backoff NONE; two tokenization schemes are requested (D3_BWPOS and MyD3). --> | ||
<preprocessing sentence_ids="false" separate_punct="true" input_encoding="UTF8"/> | ||
<overall_vars output_encoding="UTF8" dialect="MSA" output_analyses="TOP" morph_backoff="NONE"/> | ||
<requested_output> <!-- Enabled (value="true"): PREPROCESSED, ENC0, PRC0-PRC3, LENGTH, OFFSET. Every other variable listed below is set to false. --> | ||
<req_variable name="PREPROCESSED" value="true" /> | ||
|
||
<req_variable name="STEM" value="false" /> | ||
<req_variable name="GLOSS" value="false" /> | ||
<req_variable name="LEMMA" value="false" /> | ||
<req_variable name="DIAC" value="false" /> | ||
<req_variable name="ASP" value="false" /> | ||
<req_variable name="CAS" value="false" /> | ||
<req_variable name="ENC0" value="true" /> | ||
<req_variable name="ENC1" value="false" /> | ||
<req_variable name="ENC2" value="false" /> | ||
<req_variable name="GEN" value="false" /> | ||
<req_variable name="MOD" value="false" /> | ||
<req_variable name="NUM" value="false" /> | ||
<req_variable name="PER" value="false" /> | ||
<req_variable name="POS" value="false" /> | ||
<req_variable name="PRC0" value="true" /> | ||
<req_variable name="PRC1" value="true" /> | ||
<req_variable name="PRC2" value="true" /> | ||
<req_variable name="PRC3" value="true" /> | ||
<req_variable name="STT" value="false" /> | ||
<req_variable name="VOX" value="false" /> | ||
<req_variable name="BW" value="false" /> | ||
<req_variable name="SOURCE" value="false" /> | ||
<req_variable name="LENGTH" value="true" /> | ||
<req_variable name="OFFSET" value="true" /> | ||
<req_variable name="NER" value="false" /> | ||
<req_variable name="BPC" value="false" /> | ||
</requested_output> | ||
<tokenization> | ||
<!-- <scheme alias="ATB" /> | ||
<scheme alias="ATB4MT" /> --> | ||
<scheme alias="D3_BWPOS" /> <!-- Required for NER. NOTE(review): NER is set to false in requested_output above; confirm whether this scheme is still needed. --> | ||
<scheme alias="MyD3"> | ||
<!-- Same as D3. Custom scheme: its delimiters, split terms, clitic marks and normalizations are spelled out in the override below. --> | ||
<scheme_override alias="MyD3" | ||
form_delimiter="\u00B7" | ||
include_non_arabic="false" | ||
mark_no_analysis="false" | ||
token_delimiter=" " | ||
tokenize_from_BW="false"> <!-- Split terms follow, listed in the order PRC3, PRC2, PART, PRC0, REST, ENC0. --> | ||
<split_term_spec term="PRC3"/> | ||
<split_term_spec term="PRC2"/> | ||
<split_term_spec term="PART"/> | ||
<split_term_spec term="PRC0"/> | ||
<split_term_spec term="REST"/> | ||
<split_term_spec term="ENC0"/> | ||
<token_form_spec enclitic_mark="+" | ||
proclitic_mark="+" | ||
token_form_base="WORD" | ||
transliteration="UTF8"> <!-- enclitic_mark and proclitic_mark are both "+". Six normalizations are active below; LEFTPAREN and RIGHTPAREN are commented out. --> | ||
<normalization type="ALEF"/> | ||
<normalization type="HAMZA"/> | ||
<normalization type="YAA"/> | ||
<normalization type="DIAC"/> | ||
<normalization type="DIGIT"/> | ||
<normalization type="TEHMARBUTA"/> | ||
<!-- <normalization type="LEFTPAREN"/> | ||
<normalization type="RIGHTPAREN"/> --> | ||
</token_form_spec> | ||
</scheme_override> | ||
</scheme> | ||
</tokenization> | ||
</madamira_configuration> | ||
|
||
|
||
|