Some modifications and evaluation results
aboelhamd committed May 13, 2019
1 parent 6ca7420 commit a9ee928
Showing 12 changed files with 122 additions and 128 deletions.
10 changes: 5 additions & 5 deletions choose-best-sents.py
@@ -16,10 +16,10 @@

with open(sys.argv[1]) as scoresFile, open(sys.argv[2]) as combFile:
for scores, sent in zip(scoresFile, combFile):
print(scores.strip())
#print(scores.strip())
if (scores.strip()) :
sents.append(sent)
scoresArr = list(map(int, scores.split()))
scoresArr = list(map(float, scores.split()))
wer = scoresArr[0]
per = scoresArr[1]
werper = wer+per
@@ -41,9 +41,9 @@
werspers.append(werper)

else :
minwerFile.write(sents[minwerI]+"\n")
minperFile.write(sents[minperI]+"\n")
minwerperFile.write(sents[minwerperI]+"\n")
minwerFile.write(sents[minwerI])
minperFile.write(sents[minperI])
minwerperFile.write(sents[minwerperI])

minwer, minper, minwerper, minwerI, minperI, minwerperI = 10000.,10000.,10000.,0,0,0

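In this change choose-best-sents.py parses the scores as floats instead of ints, and the selected sentences are written without appending an extra "\n", presumably because lines taken from the combinations file already keep their trailing newline. A rough C++ sketch of the same per-block selection idea (only the WER branch is shown), under the assumption that the scores file holds whitespace-separated WER and PER values per candidate and that a blank line ends each sentence's block of candidates; the output file name is only illustrative:

#include <fstream>
#include <sstream>
#include <string>
#include <vector>
using namespace std;

int main (int argc, char **argv)
{
  if (argc < 3)
    return 1;

  ifstream scoresFile (argv[1]), combFile (argv[2]);
  ofstream minwerFile ("best-wer.txt"); // illustrative output name

  vector<string> sents;
  string scores, sent;
  float minwer = 10000.;
  unsigned minwerI = 0;

  while (getline (scoresFile, scores) && getline (combFile, sent))
    {
      if (!scores.empty ())
        {
          // candidate line: "<wer> <per>" for one combination
          sents.push_back (sent);
          stringstream ss (scores);
          float wer, per;
          ss >> wer >> per;
          if (wer < minwer)
            {
              minwer = wer;
              minwerI = sents.size () - 1;
            }
        }
      else if (!sents.empty ())
        {
          // blank line closes the block: keep the lowest-WER candidate
          minwerFile << sents[minwerI] << endl;
          sents.clear ();
          minwer = 10000.;
          minwerI = 0;
        }
    }
  return 0;
}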
14 changes: 14 additions & 0 deletions spa-eng evaluation/results-ambig-average.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 20914420
Number of words in test: 23314483
Number of unknown words (marked with a star) in test: 839003
Percentage of unknown words: 3.599 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 17038189
Word error rate (WER): 81.466 %
Number of position-independent correct words: 10077193
Position-independent word error rate (PER): 63.293 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-apertium.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 545016
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.531 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 395016
Word error rate (WER): 81.056 %
Number of position-independent correct words: 235838
Position-independent word error rate (PER): 63.443 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-beam.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 501391
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.839 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 380845
Word error rate (WER): 78.149 %
Number of position-independent correct words: 224934
Position-independent word error rate (PER): 56.728 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-lm.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 530546
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.628 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 389040
Word error rate (WER): 79.830 %
Number of position-independent correct words: 232384
Position-independent word error rate (PER): 61.182 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-per.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 530466
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.628 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 389950
Word error rate (WER): 80.017 %
Number of position-independent correct words: 235288
Position-independent word error rate (PER): 60.570 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-wer.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 530224
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.630 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 382290
Word error rate (WER): 78.445 %
Number of position-independent correct words: 233448
Position-independent word error rate (PER): 60.898 %

14 changes: 14 additions & 0 deletions spa-eng evaluation/results-werper.txt
@@ -0,0 +1,14 @@
Statistics about input files
-------------------------------------------------------
Number of words in reference: 487335
Number of words in test: 529447
Number of unknown words (marked with a star) in test: 19246
Percentage of unknown words: 3.635 %

Results when removing unknown-word marks (stars)
-------------------------------------------------------
Edit distance: 383016
Word error rate (WER): 78.594 %
Number of position-independent correct words: 234864
Position-independent word error rate (PER): 60.448 %
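The WER and PER percentages in these result files appear to follow the usual apertium-eval-translator conventions: WER as edit distance divided by the number of reference words, and PER derived from the position-independent correct-word count and the test length. This mapping is inferred from the numbers themselves rather than taken from the evaluator's source, but a quick check against results-apertium.txt reproduces the reported figures:

#include <cstdio>

int main ()
{
  // Figures copied from results-apertium.txt above.
  const double refWords = 487335.0;      // words in reference
  const double testWords = 545016.0;     // words in test
  const double editDist = 395016.0;      // edit distance
  const double indepCorrect = 235838.0;  // position-independent correct words

  // Assumed relations (inferred from the reported percentages):
  //   WER = edit distance / reference words
  //   PER = (test words - position-independent correct words) / reference words
  printf ("WER = %.3f %%\n", 100.0 * editDist / refWords);                   // prints 81.056
  printf ("PER = %.3f %%\n", 100.0 * (testWords - indepCorrect) / refWords); // prints 63.443
  return 0;
}

The same relations hold for the other files here, e.g. the beam-search run: 380845 / 487335 gives 78.149 % WER and (501391 - 224934) / 487335 gives 56.728 % PER.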

5 changes: 3 additions & 2 deletions src/BeamSearch.cpp
@@ -70,7 +70,7 @@ main (int argc, char **argv)
cout
<< "interInFilePath : Output file of this program which is the input for apertium interchunk."
<< endl;
cout << "modelsDest : Yasmet models destination." << endl;
cout << "modelsDest : Yasmet models merged file destination." << endl;
cout << "beamSize : The size of beam in beam search algorithm." << endl;
return -1;
}
@@ -101,10 +101,11 @@ main (int argc, char **argv)
stringstream buffer (k);
buffer >> beam;

// unsigned i = 0;
string tokenizedSentence;
while (getline (lextorFile, tokenizedSentence))
{
// cout << i << endl;
// cout << i << endl;

// spaces after each token
vector<string> spaces;
6 changes: 5 additions & 1 deletion src/BestLangMod.cpp
@@ -120,8 +120,8 @@ main (int argc, char **argv)
map<string, string> vars = RuleParser::getVars (transfer);
map<string, vector<string> > lists = RuleParser::getLists (transfer);

string tokenizedSentence;
// unsigned i = 0;
string tokenizedSentence;
while (getline (lextorFile, tokenizedSentence))
{
// cout << i << endl;
@@ -186,6 +186,8 @@ main (int argc, char **argv)
float weight = strtof (line.c_str (), NULL);
normWeights.push_back (weight);
}
// beware of the newline
getline (weightFile, line);

// read transfer
vector<string> normTransfers;
@@ -194,6 +196,8 @@
getline (transferOutFile, line);
normTransfers.push_back (line);
}
// beware of the newline
getline (transferOutFile, line);

// remove redundant outputs
vector<string> outs;
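The two getline calls added above, one after reading the weights and one after reading the transfer outputs, look like they consume the blank line that separates consecutive per-sentence blocks in those files, so the next iteration starts at the right position. A small self-contained sketch of that pattern with a made-up weights file, assuming each block is a known number of lines followed by one empty separator line:

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
using namespace std;

// Read `count` weights from `in`, then consume the empty line that
// separates this block from the next one. Without the extra getline,
// the following block would be read starting one line too early.
static vector<float>
readWeightBlock (ifstream &in, unsigned count)
{
  vector<float> weights;
  string line;
  for (unsigned i = 0; i < count; i++)
    {
      getline (in, line);
      weights.push_back (strtof (line.c_str (), NULL));
    }
  // beware of the newline: skip the empty separator line
  getline (in, line);
  return weights;
}

int main ()
{
  // hypothetical file: blocks of weights separated by blank lines
  ifstream weightFile ("weights.txt");
  vector<float> first = readWeightBlock (weightFile, 3);
  vector<float> second = readWeightBlock (weightFile, 3);
  cout << first.size () << " + " << second.size () << " weights read" << endl;
  return 0;
}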
12 changes: 6 additions & 6 deletions src/CombAlign.cpp
@@ -79,13 +79,13 @@ main (int argc, char **argv)
cout << "referenceFilePath : Reference parallel target translation file path."
<< endl;
cout << "newRefFilePath : New aligned reference file path." << endl;
// return -1;
return -1;
}

ifstream lextorFile (lextorFilePath.c_str ());
ofstream chunkerFile (chunkerFilePath.c_str ());
ifstream referenceFile (referenceFilePath);
ofstream newRefFile (newRefFilePath);
ifstream referenceFile (referenceFilePath.c_str ());
ofstream newRefFile (newRefFilePath.c_str ());
if (lextorFile.is_open () && chunkerFile.is_open () && referenceFile.is_open ()
&& newRefFile.is_open ())
{
@@ -106,11 +106,11 @@
map<string, string> vars = RuleParser::getVars (transfer);
map<string, vector<string> > lists = RuleParser::getLists (transfer);

unsigned i = 0;
// unsigned i = 0;
string tokenizedSentence, refSent;
while (getline (lextorFile, tokenizedSentence) && getline (referenceFile, refSent))
{
cout << i++ << endl;
// cout << i++ << endl;

// spaces after each token
vector<string> spaces;
@@ -200,7 +200,7 @@ main (int argc, char **argv)
chunkerFile.close ();
referenceFile.close ();
newRefFile.close ();
cout << "CombAlign finished!";
// cout << "CombAlign finished!";
}
else
{
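The constructor arguments for referenceFile and newRefFile are changed here to pass .c_str() again, which matters when the code is built as C++98/C++03: before C++11 the ifstream and ofstream constructors only accept a const char*, not a std::string. A minimal illustration (the path is a placeholder):

#include <fstream>
#include <string>

int main ()
{
  std::string path = "reference.txt"; // placeholder path

  // Portable across C++98 and later: pass a C string.
  std::ifstream portable (path.c_str ());

  // The next line only compiles with -std=c++11 or newer,
  // where a std::string overload of the constructor was added.
  // std::ifstream newerOnly (path);

  return portable.is_open () ? 0 : 1;
}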
119 changes: 5 additions & 114 deletions src/RulesApplier.cpp
@@ -75,15 +75,6 @@ main (int argc, char **argv)

ifstream lextorFile (lextorFilePath.c_str ());
ofstream interInFile (interInFilePath.c_str ());
ifstream refFile (
string ("/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test.txt").c_str ());
ofstream refInFile (
string ("/home/aboelhamd/eclipse-workspace/machinetranslation/tgt-test-mul.txt").c_str ());
ifstream errFile (
string (
"/home/aboelhamd/Downloads/apertium-eval-translator-master/ambig_results.txt").c_str ());
ofstream bestInFile (
string ("/home/aboelhamd/eclipse-workspace/machinetranslation/best-chunker.txt").c_str ());
if (lextorFile.is_open () && interInFile.is_open ())
{
// load transfer file in an xml document object
@@ -103,11 +94,11 @@
map<string, string> vars = RuleParser::getVars (transfer);
map<string, vector<string> > lists = RuleParser::getLists (transfer);

unsigned i = 0;
string tokenizedSentence, refSent;
while (getline (lextorFile, tokenizedSentence) && getline (refFile, refSent))
// unsigned i = 0;
string tokenizedSentence;
while (getline (lextorFile, tokenizedSentence))
{
cout << i++ << endl;
// cout << i++ << endl;

// spaces after each token
vector<string> spaces;
@@ -159,109 +150,12 @@
RuleExecution::getOuts (&outs, &combNodes, ambigInfo, nodesPool, ruleOutputs,
spaces);

// for (unsigned j = 0; j < tlTokens.size (); j++)
// {
// cout << tlTokens[j] << endl;
// vector<pair<unsigned, unsigned> > rulees = tokenRules[j];
// for (unsigned k = 0; k < rulees.size (); k++)
// {
// cout << rulees[k].first << " , " << rulees[k].second << endl;
// }
// cout << endl;
// }
//
// for (unsigned j = 0; j < ambigInfo.size (); j++)
// {
// cout << "firTokId = " << ambigInfo[j]->firTokId << "; maxPat = "
// << ambigInfo[j]->maxPat << endl;
// vector<vector<RuleExecution::Node*> > combinations =
// ambigInfo[j]->combinations;
// cout << endl;
// for (unsigned k = 0; k < combinations.size (); k++)
// {
// vector<RuleExecution::Node*> nodes = combinations[k];
// for (unsigned l = 1; l < nodes.size (); l++)
// {
// cout << "tok=" << nodes[l]->tokenId << "; rul=" << nodes[l]->ruleId
// << "; pat=" << nodes[l]->patNum << " - ";
// }
// cout << endl;
// }
// cout << endl;
// }
//
// for (map<unsigned, map<unsigned, string> >::iterator it = ruleOutputs.begin ();
// it != ruleOutputs.end (); it++)
// {
// cout << "ruleId=" << it->first << endl;
// map<unsigned, string> outs = it->second;
//
// for (map<unsigned, string>::iterator it2 = outs.begin ();
// it2 != outs.end (); it2++)
// {
// cout << "tokId=" << it2->first << " , out = " << it2->second << endl;
// }
// cout << endl;
// }
// cout << endl;
//
// for (unsigned j = 0; j < tlTokens.size (); j++)
// {
// vector<RuleExecution::Node*> nodes = nodesPool[j];
// cout << "tokId = " << j << " : " << tlTokens[j] << endl;
// for (unsigned k = 0; k < nodes.size (); k++)
// {
// cout << "ruleId = " << nodes[k]->ruleId << "; patNum = "
// << nodes[k]->patNum << endl;
// }
// cout << endl;
// }
//
// for (unsigned j = 0; j < combNodes.size (); j++)
// {
// vector<RuleExecution::Node*> nodes = combNodes[j];
// for (unsigned k = 0; k < nodes.size (); k++)
// {
// cout << "tok=" << nodes[k]->tokenId << "; rul=" << nodes[k]->ruleId
// << "; pat=" << nodes[k]->patNum << " - ";
// }
// cout << endl;
// }

// set<string> diffOuts (outs.begin (), outs.end ());
//
// // write the outs
// for (set<string>::iterator it = diffOuts.begin (); it != diffOuts.end (); it++)
// {
// interInFile << *it << endl;
// refInFile << refSent << endl;
// }

float min = 100000;
int minInd = -1;
string serr;
float err;

// write the outs
for (unsigned j = 0; j < outs.size (); j++)
{
getline (errFile, serr);
err = strtof (serr.c_str (), NULL);

if (err < min)
{
min = err;
minInd = j;
}

interInFile << outs[j] << endl;
refInFile << refSent << endl;
}
// cout << minInd << endl;
bestInFile << outs[minInd] << endl;

interInFile << endl;
refInFile << endl;

// delete AmbigInfo pointers
for (unsigned j = 0; j < ambigInfo.size (); j++)
@@ -289,10 +183,7 @@

lextorFile.close ();
interInFile.close ();
refFile.close ();
refInFile.close ();
bestInFile.close ();
cout << "RulesApplier finished!";
// cout << "RulesApplier finished!";
}
else
{
