Skip to content

Commit

Permalink
adding code, datasets, and other resources
Browse files Browse the repository at this point in the history
  • Loading branch information
ObeidaElJundi committed Jul 31, 2019
1 parent ef99246 commit bb4e9e9
Show file tree
Hide file tree
Showing 18 changed files with 153,769 additions and 0 deletions.
573 changes: 573 additions & 0 deletions arabic-wiki-stuff.ipynb

Large diffs are not rendered by default.

1,138 changes: 1,138 additions & 0 deletions build_arabic_language_model.ipynb

Large diffs are not rendered by default.

Binary file added data/AJGT.xlsx
Binary file not shown.
1,801 changes: 1,801 additions & 0 deletions data/AJGT_final.csv

Large diffs are not rendered by default.

1,331 changes: 1,331 additions & 0 deletions data/ASTD-B.csv

Large diffs are not rendered by default.

1,331 changes: 1,331 additions & 0 deletions data/ASTD-B_final.csv

Large diffs are not rendered by default.

10,007 changes: 10,007 additions & 0 deletions data/ASTD.csv

Large diffs are not rendered by default.

10,007 changes: 10,007 additions & 0 deletions data/ASTD_final.csv

Large diffs are not rendered by default.

4,001 changes: 4,001 additions & 0 deletions data/ArSenTD-LEV-final.csv

Large diffs are not rendered by default.

4,001 changes: 4,001 additions & 0 deletions data/ArSenTD-LEV.tsv

Large diffs are not rendered by default.

3,543 changes: 3,543 additions & 0 deletions data/ArTwitter_final.csv

Large diffs are not rendered by default.

105,699 changes: 105,699 additions & 0 deletions data/HARD-final.csv

Large diffs are not rendered by default.

Binary file added data/Hard-balanced-reviews.txt
Binary file not shown.
3,623 changes: 3,623 additions & 0 deletions data/artwitter.csv

Large diffs are not rendered by default.

3,329 changes: 3,329 additions & 0 deletions fine_tune_LM.ipynb

Large diffs are not rendered by default.

3,296 changes: 3,296 additions & 0 deletions resources/WikiExtractor.py

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions resources/readme.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
Count the lines in a particular file (Windows PowerShell):
Get-Content .\samples\output\all_arabic_wiki_2019.txt.MyD3.tok | Measure-Object -Line

Copy the first 100 lines from one file to another (Windows PowerShell):
Get-Content -First 100 .\samples\output\all_arabic_wiki_2019.txt.MyD3.tok -Encoding UTF8 | Out-File .\samples\output\100sample.txt -Encoding UTF8

Run MADAMIRA (make sure you are using a 64-bit JDK, not a 32-bit one):
C:\Users\oae15\Downloads\JDK64\bin\java.exe -Xmx3000m -Xms3000m -XX:NewRatio=3 -jar MADAMIRA-release-20170403-2.1.jar -rawinput "C:\Python\Python36\coding4fun\Notebooks\fastai\for research\Ar_LM\data\ASTD_text_only.txt" -rawoutdir .\samples\output -rawconfig .\samples\sampleConfigFile.xml -msaonly
81 changes: 81 additions & 0 deletions resources/sampleConfigFile.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
<?xml version="1.0" encoding="utf-8"?>

<!--
~ Copyright (c) 2013. The Trustees of Columbia University in the City of New York.
~ The copyright owner has no objection to the reproduction of this work by anyone for
~ non-commercial use, but otherwise reserves all rights whatsoever. For avoidance of
~ doubt, this work may not be reproduced, or modified, in whole or in part, for commercial
~ use without the prior written consent of the copyright owner.
-->

<madamira_configuration xmlns="urn:edu.columbia.ccls.madamira.configuration:0.1">

    <!-- Input handling: UTF8 input encoding, punctuation separation on, sentence IDs off. -->
    <preprocessing sentence_ids="false"
                   separate_punct="true"
                   input_encoding="UTF8"/>

    <!-- Run-wide settings: UTF8 output, MSA dialect, TOP analyses, no morphological backoff. -->
    <overall_vars output_encoding="UTF8"
                  dialect="MSA"
                  output_analyses="TOP"
                  morph_backoff="NONE"/>

    <!--
      Output variables. Only PREPROCESSED, ENC0, PRC0 to PRC3, LENGTH and OFFSET
      are switched on below; every other variable is explicitly set to false.
    -->
    <requested_output>
        <req_variable name="PREPROCESSED" value="true"/>

        <req_variable name="STEM" value="false"/>
        <req_variable name="GLOSS" value="false"/>
        <req_variable name="LEMMA" value="false"/>
        <req_variable name="DIAC" value="false"/>
        <req_variable name="ASP" value="false"/>
        <req_variable name="CAS" value="false"/>
        <req_variable name="ENC0" value="true"/>
        <req_variable name="ENC1" value="false"/>
        <req_variable name="ENC2" value="false"/>
        <req_variable name="GEN" value="false"/>
        <req_variable name="MOD" value="false"/>
        <req_variable name="NUM" value="false"/>
        <req_variable name="PER" value="false"/>
        <req_variable name="POS" value="false"/>
        <req_variable name="PRC0" value="true"/>
        <req_variable name="PRC1" value="true"/>
        <req_variable name="PRC2" value="true"/>
        <req_variable name="PRC3" value="true"/>
        <req_variable name="STT" value="false"/>
        <req_variable name="VOX" value="false"/>
        <req_variable name="BW" value="false"/>
        <req_variable name="SOURCE" value="false"/>
        <req_variable name="LENGTH" value="true"/>
        <req_variable name="OFFSET" value="true"/>
        <req_variable name="NER" value="false"/>
        <req_variable name="BPC" value="false"/>
    </requested_output>

    <!-- Tokenization schemes. Two are active: D3_BWPOS and the custom MyD3 below. -->
    <tokenization>
        <!-- Disabled schemes, kept for reference:
        <scheme alias="ATB"/>
        <scheme alias="ATB4MT"/>
        -->

        <!-- Required for NER (note that the NER output variable above is set to false). -->
        <scheme alias="D3_BWPOS"/>

        <!--
          MyD3: custom scheme, described by the original author as "Same as D3".
          NOTE(review): the D3 definition is not visible in this file, so the
          equivalence cannot be confirmed here.
        -->
        <scheme alias="MyD3">
            <scheme_override alias="MyD3"
                             form_delimiter="\u00B7"
                             include_non_arabic="false"
                             mark_no_analysis="false"
                             token_delimiter=" "
                             tokenize_from_BW="false">

                <!-- Split terms, listed in their original order. -->
                <split_term_spec term="PRC3"/>
                <split_term_spec term="PRC2"/>
                <split_term_spec term="PART"/>
                <split_term_spec term="PRC0"/>
                <split_term_spec term="REST"/>
                <split_term_spec term="ENC0"/>

                <!-- Token form: WORD base, UTF8 transliteration, "+" as the proclitic and enclitic mark. -->
                <token_form_spec enclitic_mark="+"
                                 proclitic_mark="+"
                                 token_form_base="WORD"
                                 transliteration="UTF8">
                    <normalization type="ALEF"/>
                    <normalization type="HAMZA"/>
                    <normalization type="YAA"/>
                    <normalization type="DIAC"/>
                    <normalization type="DIGIT"/>
                    <normalization type="TEHMARBUTA"/>
                    <!-- Disabled normalizations:
                    <normalization type="LEFTPAREN"/>
                    <normalization type="RIGHTPAREN"/>
                    -->
                </token_form_spec>
            </scheme_override>
        </scheme>
    </tokenization>
</madamira_configuration>



0 comments on commit bb4e9e9

Please sign in to comment.