Some modifications and evaluation results
aboelhamd committed May 13, 2019
1 parent 6ca7420 commit a9ee928
Showing 12 changed files with 122 additions and 128 deletions.
10 changes: 5 additions & 5 deletions choose-best-sents.py
@@ -16,10 +16,10 @@

with open(sys.argv[1]) as scoresFile, open(sys.argv[2]) as combFile:
for scores, sent in zip(scoresFile, combFile):
print(scores.strip())
#print(scores.strip())
if (scores.strip()) :
sents.append(sent)
scoresArr = list(map(int, scores.split()))
scoresArr = list(map(float, scores.split()))
wer = scoresArr[0]
per = scoresArr[1]
werper = wer+per
@@ -41,9 +41,9 @@
werspers.append(werper)

else :
minwerFile.write(sents[minwerI]+"\n")
minperFile.write(sents[minperI]+"\n")
minwerperFile.write(sents[minwerperI]+"\n")
minwerFile.write(sents[minwerI])
minperFile.write(sents[minperI])
minwerperFile.write(sents[minwerperI])

minwer, minper, minwerper, minwerI, minperI, minwerperI = 10000.,10000.,10000.,0,0,0

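In this change choose-best-sents.py parses the scores as floats instead of ints, and the selected sentences are written without appending an extra "\n", presumably because lines taken from the combinations file already keep their trailing newline. A rough C++ sketch of the same per-block selection idea (only the WER branch is shown), under the assumption that the scores file holds whitespace-separated WER and PER values per candidate and that a blank line ends each sentence's block of candidates; the output file name is only illustrative:

#include <fstream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

int main (int argc, char **argv)
{
  if (argc < 3)
    return 1;

  ifstream scoresFile (argv[1]), combFile (argv[2]);
  ofstream minwerFile ("best-wer.txt"); // illustrative output name

  vector<string> sents;
  string scores, sent;
  float minwer = 10000.;
  unsigned minwerI = 0;

  while (getline (scoresFile, scores) && getline (combFile, sent))
    {
      if (!scores.empty ())
        {
          // candidate line: "<wer> <per>" for one combination
          sents.push_back (sent);
          stringstream ss (scores);
          float wer, per;
          ss >> wer >> per;
          if (wer < minwer)
            {
              minwer = wer;
              minwerI = sents.size () - 1;
            }
        }
      else if (!sents.empty ())
        {
          // blank line closes the block: keep the lowest-WER candidate
          minwerFile << sents[minwerI] << endl;
          sents.clear ();
          minwer = 10000.;
          minwerI = 0;
        }
    }
  return 0;
}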
14 changes: 14 additions & 0 deletions spa-eng evaluation/results-ambig-average.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 20914420
Number of words in test: 23314483
Number of unknown words (marked with a star) in test: 839003
Percentage of unknown words: 3.599 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 17038189
Word error rate (WER): 81.466 %
Number of position-independent correct words: 10077193
Position-independent word error rate (PER): 63.293 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-apertium.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 545016
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.531 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 395016
Word error rate (WER): 81.056 %
Number of position-independent correct words: 235838
Position-independent word error rate (PER): 63.443 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-beam.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 501391
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.839 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 380845
Word error rate (WER): 78.149 %
Number of position-independent correct words: 224934
Position-independent word error rate (PER): 56.728 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-lm.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 530546
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.628 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 389040
Word error rate (WER): 79.830 %
Number of position-independent correct words: 232384
Position-independent word error rate (PER): 61.182 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-per.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 530466
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.628 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 389950
Word error rate (WER): 80.017 %
Number of position-independent correct words: 235288
Position-independent word error rate (PER): 60.570 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-wer.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 530224
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.630 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 382290
Word error rate (WER): 78.445 %
Number of position-independent correct words: 233448
Position-independent word error rate (PER): 60.898 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-werper.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 529447
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.635 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 383016
Word error rate (WER): 78.594 %
Number of position-independent correct words: 234864
Position-independent word error rate (PER): 60.448 %
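The WER and PER percentages in these result files appear to follow the usual apertium-eval-translator conventions: WER as edit distance divided by the number of reference words, and PER derived from the position-independent correct-word count and the test length. This mapping is inferred from the numbers themselves rather than taken from the evaluator's source, but a quick check against results-apertium.txt reproduces the reported figures:

#include <cstdio>

int main ()
{
  // Figures copied from results-apertium.txt above.
  const double refWords = 487335.0;      // words in reference
  const double testWords = 545016.0;     // words in test
  const double editDist = 395016.0;      // edit distance
  const double indepCorrect = 235838.0;  // position-independent correct words

  // Assumed relations (inferred from the reported percentages):
  //   WER = edit distance / reference words
  //   PER = (test words - position-independent correct words) / reference words
  printf ("WER = %.3f %%\n", 100.0 * editDist / refWords);                   // prints 81.056
  printf ("PER = %.3f %%\n", 100.0 * (testWords - indepCorrect) / refWords); // prints 63.443
  return 0;
}

The same relations hold for the other files here, e.g. the beam-search run: 380845 / 487335 gives 78.149 % WER and (501391 - 224934) / 487335 gives 56.728 % PER.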

5 changes: 3 additions & 2 deletions src/BeamSearch.cpp
@@ -70,7 +70,7 @@ main (int argc, char **argv)
cout
<< "interInFilePath : Output file of this program which is the input for apertium interchunk."
<< endl;
cout << "modelsDest : Yasmet models destination." << endl;
cout << "modelsDest : Yasmet models merged file destination." << endl;
cout << "beamSize : The size of beam in beam search algorithm." << endl;
return -1;
}
@@ -101,10 +101,11 @@ main (int argc, char **argv)
stringstream buffer (k);
buffer >> beam;

// unsigned i = 0;
string tokenizedSentence;
while (getline (lextorFile, tokenizedSentence))
{
// cout << i << endl;
// cout << i << endl;

// spaces after each token
vector<string> spaces;
6 changes: 5 additions & 1 deletion src/BestLangMod.cpp
@@ -120,8 +120,8 @@ main (int argc, char **argv)
map<string, string> vars = RuleParser::getVars (transfer);
map<string, vector<string> > lists = RuleParser::getLists (transfer);

string tokenizedSentence;
// unsigned i = 0;
string tokenizedSentence;
while (getline (lextorFile, tokenizedSentence))
{
// cout << i << endl;
@@ -186,6 +186,8 @@ main (int argc, char **argv)
float weight = strtof (line.c_str (), NULL);
normWeights.push_back (weight);
}
// beware of the newline
getline (weightFile, line);

// read transfer
vector<string> normTransfers;
@@ -194,6 +196,8 @@
getline (transferOutFile, line);
normTransfers.push_back (line);
}
// beware of the newline
getline (transferOutFile, line);

// remove redundant outputs
vector<string> outs;
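The two getline calls added above, one after reading the weights and one after reading the transfer outputs, look like they consume the blank line that separates consecutive per-sentence blocks in those files, so the next iteration starts at the right position. A small self-contained sketch of that pattern with a made-up weights file, assuming each block is a known number of lines followed by one empty separator line:

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

// Read `count` weights from `in`, then consume the empty line that
// separates this block from the next one. Without the extra getline,
// the following block would be read starting one line too early.
static vector<float>
readWeightBlock (ifstream &in, unsigned count)
{
  vector<float> weights;
  string line;
  for (unsigned i = 0; i < count; i++)
    {
      getline (in, line);
      weights.push_back (strtof (line.c_str (), NULL));
    }
  // beware of the newline: skip the empty separator line
  getline (in, line);
  return weights;
}

int main ()
{
  // hypothetical file: blocks of weights separated by blank lines
  ifstream weightFile ("weights.txt");
  vector<float> first = readWeightBlock (weightFile, 3);
  vector<float> second = readWeightBlock (weightFile, 3);
  cout << first.size () << " + " << second.size () << " weights read" << endl;
  return 0;
}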
12 changes: 6 additions & 6 deletions src/CombAlign.cpp
@@ -79,13 +79,13 @@ main (int argc, char **argv)
cout << "referenceFilePath : Reference parallel target translation file path."
<< endl;
cout << "newRefFilePath : New aligned reference file path." << endl;
// return -1;
return -1;
}

ifstream lextorFile (lextorFilePath.c_str ());
ofstream chunkerFile (chunkerFilePath.c_str ());
ifstream referenceFile (referenceFilePath);
ofstream newRefFile (newRefFilePath);
ifstream referenceFile (referenceFilePath.c_str ());
ofstream newRefFile (newRefFilePath.c_str ());
if (lextorFile.is_open () && chunkerFile.is_open () && referenceFile.is_open ()
&& newRefFile.is_open ())
{
@@ -106,11 +106,11 @@
map<string, string> vars = RuleParser::getVars (transfer);
map<string, vector<string> > lists = RuleParser::getLists (transfer);

unsigned i = 0;
// unsigned i = 0;
string tokenizedSentence, refSent;
while (getline (lextorFile, tokenizedSentence) && getline (referenceFile, refSent))
{
cout << i++ << endl;
// cout << i++ << endl;

// spaces after each token
vector<string> spaces;
@@ -200,7 +200,7 @@ main (int argc, char **argv)
chunkerFile.close ();
referenceFile.close ();
newRefFile.close ();
cout << "CombAlign finished!";
// cout << "CombAlign finished!";
}
else
{
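The constructor arguments for referenceFile and newRefFile are changed here to pass .c_str() again, which matters when the code is built as C++98/C++03: before C++11 the ifstream and ofstream constructors only accept a const char*, not a std::string. A minimal illustration (the path is a placeholder):

#include <fstream>
#include <string>

int main ()
{
  std::string path = "reference.txt"; // placeholder path

  // Portable across C++98 and later: pass a C string.
  std::ifstream portable (path.c_str ());

  // The next line only compiles with -std=c++11 or newer,
  // where a std::string overload of the constructor was added.
  // std::ifstream newerOnly (path);

  return portable.is_open () ? 0 : 1;
}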
119 changes: 5 additions & 114 deletions src/RulesApplier.cpp
@@ -75,15 +75,6 @@ main (int argc, char **argv)

ifstream lextorFile (lextorFilePath.c_str ());
ofstream interInFile (interInFilePath.c_str ());
ifstream refFile (
string ("/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test.txt").c_str ());
ofstream refInFile (
string ("/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test-mul.txt").c_str ());
ifstream errFile (
string (
"/home/aboelhamd/Downloads/apertium-eval-translator-master/ambig_results.txt").c_str ());
ofstream bestInFile (
string ("/home/aboelhamd/eclipse-workspace/machinetranslation/best-chunker.txt").c_str ());
if (lextorFile.is_open () && interInFile.is_open ())
{
// load transfer file in an xml document object
@@ -103,11 +94,11 @@
map<string, string> vars = RuleParser::getVars (transfer);
map<string, vector<string> > lists = RuleParser::getLists (transfer);

unsigned i = 0;
string tokenizedSentence, refSent;
while (getline (lextorFile, tokenizedSentence) && getline (refFile, refSent))
// unsigned i = 0;
string tokenizedSentence;
while (getline (lextorFile, tokenizedSentence))
{
cout << i++ << endl;
// cout << i++ << endl;

// spaces after each token
vector<string> spaces;
@@ -159,109 +150,12 @@
RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs,
spaces);

// for (unsigned j = 0; j < tlTokens.size (); j++)
// {
// cout << tlTokens[j] << endl;
// vector<pair<unsigned, unsigned> > rulees = tokenRules[j];
// for (unsigned k = 0; k < rulees.size (); k++)
// {
// cout << rulees[k].first << " , " << rulees[k].second << endl;
// }
// cout << endl;
// }
//
// for (unsigned j = 0; j < ambigInfo.size (); j++)
// {
// cout << "firTokId = " << ambigInfo[j]->firTokId << "; maxPat = "
// << ambigInfo[j]->maxPat << endl;
// vector<vector<RuleExecution::Node*> > combinations =
// ambigInfo[j]->combinations;
// cout << endl;
// for (unsigned k = 0; k < combinations.size (); k++)
// {
// vector<RuleExecution::Node*> nodes = combinations[k];
// for (unsigned l = 1; l < nodes.size (); l++)
// {
// cout << "tok=" << nodes[l]->tokenId << "; rul=" << nodes[l]->ruleId
// << "; pat=" << nodes[l]->patNum << " - ";
// }
// cout << endl;
// }
// cout << endl;
// }
//
// for (map<unsigned, map<unsigned, string> >::iterator it = ruleOutputs.begin ();
// it != ruleOutputs.end (); it++)
// {
// cout << "ruleId=" << it->first << endl;
// map<unsigned, string> outs = it->second;
//
// for (map<unsigned, string>::iterator it2 = outs.begin ();
// it2 != outs.end (); it2++)
// {
// cout << "tokId=" << it2->first << " , out = " << it2->second << endl;
// }
// cout << endl;
// }
// cout << endl;
//
// for (unsigned j = 0; j < tlTokens.size (); j++)
// {
// vector<RuleExecution::Node*> nodes = nodesPool[j];
// cout << "tokId = " << j << " : " << tlTokens[j] << endl;
// for (unsigned k = 0; k < nodes.size (); k++)
// {
// cout << "ruleId = " << nodes[k]->ruleId << "; patNum = "
// << nodes[k]->patNum << endl;
// }
// cout << endl;
// }
//
// for (unsigned j = 0; j < combNodes.size (); j++)
// {
// vector<RuleExecution::Node*> nodes = combNodes[j];
// for (unsigned k = 0; k < nodes.size (); k++)
// {
// cout << "tok=" << nodes[k]->tokenId << "; rul=" << nodes[k]->ruleId
// << "; pat=" << nodes[k]->patNum << " - ";
// }
// cout << endl;
// }

// set<string> diffOuts (outs.begin (), outs.end ());
//
// // write the outs
// for (set<string>::iterator it = diffOuts.begin (); it != diffOuts.end (); it++)
// {
// interInFile << *it << endl;
// refInFile << refSent << endl;
// }

float min = 100000;
int minInd = -1;
string serr;
float err;

// write the outs
for (unsigned j = 0; j < outs.size (); j++)
{
getline (errFile, serr);
err = strtof (serr.c_str (), NULL);

if (err < min)
{
min = err;
minInd = j;
}

interInFile << outs[j] << endl;
refInFile << refSent << endl;
}
// cout << minInd << endl;
bestInFile << outs[minInd] << endl;

interInFile << endl;
refInFile << endl;

// delete AmbigInfo pointers
for (unsigned j = 0; j < ambigInfo.size (); j++)
@@ -289,10 +183,7 @@

lextorFile.close ();
interInFile.close ();
refFile.close ();
refInFile.close ();
bestInFile.close ();
cout << "RulesApplier finished!";
// cout << "RulesApplier finished!";
}
else
{
