Skip to content

Commit

Permalink
Big refactoring to cmu.arktweetnlp package
Browse files Browse the repository at this point in the history
  • Loading branch information
brendano committed Aug 15, 2012
1 parent c200523 commit 681d8c3
Show file tree
Hide file tree
Showing 71 changed files with 258 additions and 432 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,4 +1,5 @@
.svn
.DS_Store

.idea
*.iml
Expand Down
22 changes: 22 additions & 0 deletions HACKING.txt
@@ -0,0 +1,22 @@
Setting up Eclipse
==================

We use the following procedure to set up Eclipse for development. The tricky
bit is to get the jar dependencies by invoking Maven. There may be other ways
of doing it.

(1) Type: "mvn package". Among other things, this downloads all dependencies
into ark-tweet-nlp/src/target. It also stores many files in Maven's local repository, ~/.m2.
(2) Go to Eclipse and refresh the directory (e.g. F5)
(3) In Eclipse, go to the project's
Properties -> Java Build Path -> Libraries -> "Add JARs".
Add jars from:
(a) lib/
(b) ark-tweet-nlp/src/target/bin/

Once Eclipse is compiling all the source files, we use scripts/java.sh to
train and run the tagger. It invokes "java" with all dependencies and
Eclipse-compiled .class files on the classpath. So you can let Eclipse
build the files (this happens automatically), then switch to the
terminal to run the tagger.

4 changes: 2 additions & 2 deletions LICENSE → LICENSE.txt
@@ -1,7 +1,7 @@
Everything is licensed under the Apache License version 2.0:
All original CMU ARK TweetNLP code is licensed under the Apache License version 2.0:
http://www.apache.org/licenses/LICENSE-2.0

edu.cmu.cs.lti.ark.ssl is Copyright 2011, Dipanjan Das.
Included libraries have various licenses.

posBerkeley.jar is Copyright 2011, Taylor Berg-Kirkpatrick. Licensed as Apache 2.0:

Expand Down
15 changes: 10 additions & 5 deletions README → README.txt
@@ -1,16 +1,21 @@
CMU ARK Twitter Part-of-Speech Tagger v0.2.1
CMU ARK Twitter Part-of-Speech Tagger v0.3-pre
http://www.ark.cs.cmu.edu/TweetNLP/

Basic usage
-----------
===========

Requires Java 6. To run the tagger from a Unix shell:

./runTagger.sh example_tweets.txt modelfile > tagged_tweets.txt

Requires Java 6. To run the tagger:
Another example:

./runTagger.sh -input example_tweets.txt -output tagged_tweets.txt
./runTagger.sh -input barackobama.txt -input_format json -output tagged_barackobama.txt
./runTagger.sh --input-format json barackobama.jsonlines.txt -output tagged_barackobama.txt

The outputs should match tagged_tweets_expected.txt and barackobamaexpected.txt respectively.



Advanced usage
--------------

Expand Down
14 changes: 7 additions & 7 deletions ark-tweet-nlp/pom.xml
@@ -1,11 +1,11 @@
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>edu.cmu.cs.lti</groupId>
<artifactId>ark</artifactId>
<groupId>edu.cmu.cs</groupId>
<artifactId>ark-tweet-nlp</artifactId>
<packaging>jar</packaging>
<version>1.0-SNAPSHOT</version>
<name>CMU Ark twitter POS tagger</name>
<version>0.3-SNAPSHOT</version>
<name>CMU ARK TweetNLP: Twitter POS tagger</name>
<url>http://www.ark.cs.cmu.edu/TweetNLP/</url>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
Expand All @@ -25,7 +25,7 @@
</goals>
<configuration>
<sources>
<source>${basedir}/../src/newalgo</source>
<source>${basedir}/../src</source>
</sources>
</configuration>
</execution>
Expand All @@ -44,7 +44,7 @@
<configuration>
<transformers>
<transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
<mainClass>edu.cmu.cs.lti.ark.tweetnlp.RunPOSTagger</mainClass>
<mainClass>cmu.arktweetnlp.RunTagger</mainClass>
</transformer>
</transformers>
</configuration>
Expand All @@ -68,7 +68,7 @@
<archive>
<index>true</index>
<manifest>
<mainClass>edu.cmu.cs.lti.ark.tweetnlp.RunPOSTagger</mainClass>
<mainClass>cmu.arktweetnlp.RunTagger</mainClass>
<addClasspath>true</addClasspath>
</manifest>
</archive>
Expand Down
Expand Up @@ -7,7 +7,8 @@
import java.util.Collection;
import java.util.List;

import edu.cmu.cs.lti.ark.ssl.util.BasicFileIO;
import cmu.arktweetnlp.util.BasicFileIO;

import fig.basic.Pair;


Expand Down
Expand Up @@ -14,6 +14,8 @@

import org.apache.commons.codec.language.Metaphone;

import cmu.arktweetnlp.impl.TagDictionary;

import fig.basic.Pair;

/**
Expand Down
Expand Up @@ -5,6 +5,8 @@
import java.util.*;
import java.util.logging.Logger;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.berkeley.nlp.math.DifferentiableFunction;
import edu.berkeley.nlp.util.CallbackFunction;
import edu.berkeley.nlp.util.Counter;
Expand Down
Expand Up @@ -7,7 +7,8 @@
import java.util.StringTokenizer;
import java.util.logging.Logger;

import edu.cmu.cs.lti.ark.ssl.util.BasicFileIO;
import cmu.arktweetnlp.util.BasicFileIO;

import fig.basic.Pair;

public class TabSeparatedFileReader {
Expand Down
Expand Up @@ -5,7 +5,8 @@
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import edu.cmu.cs.lti.ark.ssl.util.BasicFileIO;

import cmu.arktweetnlp.util.BasicFileIO;



Expand Down
@@ -1,7 +1,6 @@
package edu.cmu.cs.lti.ark.ssl.pos.eval;

import edu.cmu.cs.lti.ark.ssl.pos.TabSeparatedFileReader;
import edu.cmu.cs.lti.ark.ssl.util.BasicFileIO;

import java.io.BufferedReader;
import java.io.BufferedWriter;
Expand All @@ -14,6 +13,8 @@
import java.util.Map;
import java.util.StringTokenizer;

import cmu.arktweetnlp.util.BasicFileIO;

import fig.basic.Pair;

public class ConvertTBToUPos {
Expand Down
Expand Up @@ -6,9 +6,10 @@
import java.util.List;
import java.util.Map;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.berkeley.nlp.util.Counter;
import edu.cmu.cs.lti.ark.ssl.pos.POSModel;
import edu.cmu.cs.lti.ark.ssl.util.BasicFileIO;

public class ModelDump {
public static void main(String[] args) {
Expand Down
Expand Up @@ -10,6 +10,8 @@
import java.util.Map;
import java.util.Set;

import cmu.arktweetnlp.util.BasicFileIO;

public class AverageMultinomials {

public static Map<String, String> languageMap;
Expand Down
Expand Up @@ -5,6 +5,8 @@
import java.util.Collection;
import java.util.List;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.cmu.cs.lti.ark.ssl.pos.TabSeparatedFileReader;
import edu.cmu.cs.lti.ark.ssl.pos.UnlabeledSentencesReader;

Expand Down
Expand Up @@ -8,6 +8,8 @@
import java.util.List;
import java.util.Set;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.cmu.cs.lti.ark.ssl.pos.TabSeparatedFileReader;
import fig.basic.Pair;

Expand Down
Expand Up @@ -6,6 +6,8 @@
import java.util.Map;
import java.util.Set;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.cmu.cs.lti.ark.ssl.pos.POSModel;

public class CheckPOSModel {
Expand Down
Expand Up @@ -12,6 +12,8 @@
import java.util.Map;
import java.util.Set;

import cmu.arktweetnlp.util.BasicFileIO;

public class CheckUniquenessOfMaps {
public static void main(String[] args) {
// uniqueness();
Expand Down
Expand Up @@ -4,6 +4,8 @@
import java.io.BufferedWriter;
import java.util.StringTokenizer;

import cmu.arktweetnlp.util.BasicFileIO;

public class CleanupAnnotations {
public static void main(String[] args) {
String file = args[0];
Expand Down
Expand Up @@ -12,6 +12,8 @@
import java.util.Set;
import java.util.StringTokenizer;

import cmu.arktweetnlp.util.BasicFileIO;

public class ComputeInitialTransitionFeatures {

public static Random baseRand = new Random(43569);
Expand Down
Expand Up @@ -7,6 +7,8 @@
import java.util.List;
import java.util.Collection;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.cmu.cs.lti.ark.ssl.pos.TabSeparatedFileReader;
import fig.basic.Pair;

Expand Down
Expand Up @@ -4,6 +4,8 @@
import java.io.BufferedWriter;
import java.util.ArrayList;

import cmu.arktweetnlp.util.BasicFileIO;

public class ConvertToCoNLLFormat {
public static final String[] inputArr = {"wsj-02-21.MRG.MST.suited",
"wsj-22.MRG.MST.suited",
Expand Down
Expand Up @@ -6,6 +6,8 @@
import java.util.Collection;
import java.util.List;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.cmu.cs.lti.ark.ssl.pos.TabSeparatedFileReader;
import fig.basic.Pair;

Expand Down
Expand Up @@ -6,6 +6,8 @@
import java.util.Iterator;
import java.util.List;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.cmu.cs.lti.ark.ssl.pos.TabSeparatedFileReader;
import fig.basic.Pair;

Expand Down
Expand Up @@ -9,6 +9,8 @@
import java.util.Set;
import java.util.StringTokenizer;

import cmu.arktweetnlp.util.BasicFileIO;

public class ExpandCoarseTagDictionary {
public static void main(String[] args) {
String directory = "/home/dipanjan/Downloads";
Expand Down
Expand Up @@ -13,6 +13,8 @@
import java.util.Map;
import java.util.Set;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.berkeley.nlp.util.ArrayUtil;
import edu.cmu.cs.lti.ark.ssl.pos.PennTreeBankPOSSequenceReader;
import edu.cmu.cs.lti.ark.ssl.pos.TabSeparatedFileReader;
Expand Down
Expand Up @@ -6,6 +6,8 @@
import java.util.Map;
import java.util.Set;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.berkeley.nlp.util.ArrayUtil;
import edu.cmu.cs.lti.ark.ssl.pos.POSModel;

Expand Down
Expand Up @@ -6,6 +6,8 @@
import java.util.Map;
import java.util.Set;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.berkeley.nlp.util.ArrayUtil;
import edu.cmu.cs.lti.ark.ssl.pos.POSModel;

Expand Down
Expand Up @@ -10,6 +10,8 @@
import java.util.Set;
import java.util.StringTokenizer;

import cmu.arktweetnlp.util.BasicFileIO;

import fig.basic.Pair;

public class ProjectAlignedTags {
Expand Down
Expand Up @@ -9,6 +9,8 @@
import java.util.List;
import java.util.Collection;

import cmu.arktweetnlp.util.BasicFileIO;



public class TabSeparatedPOSPrinting {
Expand Down
Expand Up @@ -9,6 +9,8 @@
import java.util.List;
import java.util.Set;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.cmu.cs.lti.ark.ssl.pos.TabSeparatedFileReader;
import fig.basic.Pair;

Expand Down
Expand Up @@ -5,12 +5,14 @@
import java.util.List;
import java.util.Scanner;

import cmu.arktweetnlp.Twokenize;
import cmu.arktweetnlp.util.BasicFileIO;

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;

import edu.cmu.cs.lti.ark.ssl.pos.POSOptions;
import edu.cmu.cs.lti.ark.ssl.pos.SemiSupervisedPOSTagger;
import edu.cmu.cs.lti.ark.ssl.util.BasicFileIO;
import fig.basic.OptionsParser;
import fig.basic.Option;

Expand Down
Expand Up @@ -4,11 +4,12 @@
import java.util.List;
import java.util.logging.Level;

import cmu.arktweetnlp.util.BasicFileIO;

import edu.cmu.cs.lti.ark.ssl.pos.POSFeatureTemplates;
import edu.cmu.cs.lti.ark.ssl.pos.POSModel;
import edu.cmu.cs.lti.ark.ssl.pos.POSOptions;
import edu.cmu.cs.lti.ark.ssl.pos.SemiSupervisedPOSTagger;
import edu.cmu.cs.lti.ark.ssl.util.BasicFileIO;
import fig.basic.Pair;

/** Wraps SemiSupervisedPOSTagger for easier inference-only usage (i.e. to tag new sentences) */
Expand Down

0 comments on commit 681d8c3

Please sign in to comment.