adding code, datasets, and other resources

aub-mind · Jul 31, 2019 · bb4e9e9 · bb4e9e9
1 parent ef99246
commit bb4e9e9
Show file tree

Hide file tree

Showing 18 changed files with 153,769 additions and 0 deletions.
diff --git a/arabic-wiki-stuff.ipynb b/arabic-wiki-stuff.ipynb
diff --git a/build_arabic_language_model.ipynb b/build_arabic_language_model.ipynb
diff --git a/data/AJGT.xlsx b/data/AJGT.xlsx
diff --git a/data/AJGT_final.csv b/data/AJGT_final.csv
diff --git a/data/ASTD-B.csv b/data/ASTD-B.csv
diff --git a/data/ASTD-B_final.csv b/data/ASTD-B_final.csv
diff --git a/data/ASTD.csv b/data/ASTD.csv
diff --git a/data/ASTD_final.csv b/data/ASTD_final.csv
diff --git a/data/ArSenTD-LEV-final.csv b/data/ArSenTD-LEV-final.csv
diff --git a/data/ArSenTD-LEV.tsv b/data/ArSenTD-LEV.tsv
diff --git a/data/ArTwitter_final.csv b/data/ArTwitter_final.csv
diff --git a/data/HARD-final.csv b/data/HARD-final.csv
diff --git a/data/Hard-balanced-reviews.txt b/data/Hard-balanced-reviews.txt
diff --git a/data/artwitter.csv b/data/artwitter.csv
diff --git a/fine_tune_LM.ipynb b/fine_tune_LM.ipynb
diff --git a/resources/WikiExtractor.py b/resources/WikiExtractor.py
diff --git a/resources/readme.txt b/resources/readme.txt
@@ -0,0 +1,8 @@
+Count the lines in a particular file (Windows PowerShell):
+Get-Content .\samples\output\all_arabic_wiki_2019.txt.MyD3.tok | Measure-Object -Line
+
+Copy the first 100 lines from one file to another (Windows PowerShell):
+Get-Content -First 100 .\samples\output\all_arabic_wiki_2019.txt.MyD3.tok -Encoding UTF8 | Out-File .\samples\output\100sample.txt -Encoding UTF8
+
+Run MADAMIRA (make sure you are using a 64-bit JDK, not a 32-bit one):
+C:\Users\oae15\Downloads\JDK64\bin\java.exe -Xmx3000m -Xms3000m -XX:NewRatio=3 -jar MADAMIRA-release-20170403-2.1.jar -rawinput "C:\Python\Python36\coding4fun\Notebooks\fastai\for research\Ar_LM\data\ASTD_text_only.txt" -rawoutdir .\samples\output -rawconfig .\samples\sampleConfigFile.xml -msaonly
diff --git a/resources/sampleConfigFile.xml b/resources/sampleConfigFile.xml
@@ -0,0 +1,81 @@
+<?xml version="1.0" encoding="utf-8"?>
+
+<!--
+  ~ Copyright (c) 2013. The Trustees of Columbia University in the City of New York.
+  ~ The copyright owner has no objection to the reproduction of this work by anyone for
+  ~ non-commercial use, but otherwise reserves all rights whatsoever.  For avoidance of
+  ~ doubt, this work may not be reproduced, or modified, in whole or in part, for commercial
+  ~ use without the prior written consent of the copyright owner.
+  -->
+
+<madamira_configuration xmlns="urn:edu.columbia.ccls.madamira.configuration:0.1"> <!-- MADAMIRA configuration: UTF8 in/out, dialect MSA, custom MyD3 tokenization scheme -->
+    <preprocessing sentence_ids="false" separate_punct="true" input_encoding="UTF8"/>
+    <overall_vars output_encoding="UTF8" dialect="MSA" output_analyses="TOP" morph_backoff="NONE"/>
+    <requested_output>
+        <req_variable name="PREPROCESSED" value="true" />
+        <!-- Of the variables below, only ENC0, PRC0 to PRC3, LENGTH and OFFSET are enabled; the rest are set to false. -->
+        <req_variable name="STEM" value="false" />
+        <req_variable name="GLOSS" value="false" />
+        <req_variable name="LEMMA" value="false" />
+        <req_variable name="DIAC" value="false" />
+        <req_variable name="ASP" value="false" />
+        <req_variable name="CAS" value="false" />
+        <req_variable name="ENC0" value="true" />
+        <req_variable name="ENC1" value="false" />
+        <req_variable name="ENC2" value="false" />
+        <req_variable name="GEN" value="false" />
+        <req_variable name="MOD" value="false" />
+        <req_variable name="NUM" value="false" />
+        <req_variable name="PER" value="false" />
+        <req_variable name="POS" value="false" />
+        <req_variable name="PRC0" value="true" />
+        <req_variable name="PRC1" value="true" />
+        <req_variable name="PRC2" value="true" />
+        <req_variable name="PRC3" value="true" />
+        <req_variable name="STT" value="false" />
+        <req_variable name="VOX" value="false" />
+        <req_variable name="BW" value="false" />
+        <req_variable name="SOURCE" value="false" />
+        <req_variable name="LENGTH" value="true" />
+        <req_variable name="OFFSET" value="true" />
+        <req_variable name="NER" value="false" />
+        <req_variable name="BPC" value="false" />
+    </requested_output>
+    <tokenization>
+        <!-- Disabled schemes, kept for reference: <scheme alias="ATB" />
+        <scheme alias="ATB4MT" /> -->
+        <scheme alias="D3_BWPOS" /> <!-- Required for NER (note: NER is set to false above) -->
+        <scheme alias="MyD3">
+            <!-- Same as D3. The form_delimiter below is written as the escape \u00B7 (U+00B7, MIDDLE DOT). -->
+            <scheme_override alias="MyD3"
+                             form_delimiter="\u00B7"
+                             include_non_arabic="false"
+                             mark_no_analysis="false"
+                             token_delimiter=" "
+                             tokenize_from_BW="false">
+                <split_term_spec term="PRC3"/>
+                <split_term_spec term="PRC2"/>
+                <split_term_spec term="PART"/>
+                <split_term_spec term="PRC0"/>
+                <split_term_spec term="REST"/>
+                <split_term_spec term="ENC0"/>
+                <token_form_spec enclitic_mark="+"
+                                 proclitic_mark="+"
+                                 token_form_base="WORD"
+                                 transliteration="UTF8">
+                    <normalization type="ALEF"/>
+                    <normalization type="HAMZA"/>
+                    <normalization type="YAA"/>
+                    <normalization type="DIAC"/>
+                    <normalization type="DIGIT"/>
+                    <normalization type="TEHMARBUTA"/>
+                    <!-- Parenthesis normalizations left disabled: <normalization type="LEFTPAREN"/>
+                    <normalization type="RIGHTPAREN"/> -->
+                </token_form_spec>
+            </scheme_override>
+        </scheme>
+    </tokenization>
+</madamira_configuration>
+
+
+