Permalink
Browse files

Merge branch 'master' of github.com:cathywu/Sentiment-Analysis

Conflicts:
	movie.py
  • Loading branch information...
2 parents 7b31706 + 37d9cff commit 5361ed974e00a7919abeb811b46252a69402369e @pranjalv123 pranjalv123 committed Feb 5, 2012
Showing 1,171 changed files with 2,216 additions and 8 deletions.
View
65 README
@@ -24,3 +24,68 @@ jam [apt-get]
Important points
* L-BFGS is the default parameter estimating method in this toolkit.
+==============================================================================
+Preprocess movie data
+==============================================================================
+
+Use Qtag with the "underscore" and "process all files in directory" options
+$ java -jar qtag.jar
+
+Move the POS tagged data out to its own directory, for further processing
+$ mv pos/tagged/ pos_tagged
+$ mv neg/tagged/ neg_tagged
+
+Tag data with position
+$ python position_tagger.py -d pos
+$ python position_tagger.py -d neg
+
+Keep only the adjectives
+$ python adjectives_filter.py -d neg
+$ python adjectives_filter.py -d pos
+
+Keep only the verbs
+$ python verb_filter.py -d pos
+$ python verb_filter.py -d neg
+
+==============================================================================
+Preprocess Yelp data
+==============================================================================
+
+Reformat the Yelp data to match the movie data's format, and limit it to 1000
+per star rating
+$ python preprocess_yelp.py -d yelp/default/1star_limited
+$ python preprocess_yelp.py -d yelp/default/2star_limited
+$ python preprocess_yelp.py -d yelp/default/3star_limited
+$ python preprocess_yelp.py -d yelp/default/4star_limited
+$ python preprocess_yelp.py -d yelp/default/5star_limited
+
+Use Qtag with the "underscore" and "process all files in directory" options
+$ java -jar qtag.jar
+
+Move the POS tagged data out to its own directory, for further processing
+$ mv 1star_limited/tagged/ 1star_limited_tagged
+$ mv 2star_limited/tagged/ 2star_limited_tagged
+$ mv 3star_limited/tagged/ 3star_limited_tagged
+$ mv 4star_limited/tagged/ 4star_limited_tagged
+$ mv 5star_limited/tagged/ 5star_limited_tagged
+
+Tag data with position
+$ python position_tagger.py -d yelp/default/1star_limited
+$ python position_tagger.py -d yelp/default/2star_limited
+$ python position_tagger.py -d yelp/default/3star_limited
+$ python position_tagger.py -d yelp/default/4star_limited
+$ python position_tagger.py -d yelp/default/5star_limited
+
+Keep only the adjectives
+$ python adjectives_filter.py -d yelp/default/1star_limited
+$ python adjectives_filter.py -d yelp/default/2star_limited
+$ python adjectives_filter.py -d yelp/default/3star_limited
+$ python adjectives_filter.py -d yelp/default/4star_limited
+$ python adjectives_filter.py -d yelp/default/5star_limited
+
+Keep only the verbs
+$ python verb_filter.py -d yelp/default/1star_limited
+$ python verb_filter.py -d yelp/default/2star_limited
+$ python verb_filter.py -d yelp/default/3star_limited
+$ python verb_filter.py -d yelp/default/4star_limited
+$ python verb_filter.py -d yelp/default/5star_limited
View
@@ -13,7 +13,7 @@ def filter_adj(olddir, newdir):
f = open("%s/%s" % (olddir,filename)).read().split("\n")
w = open("%s/%s" % (newdir,filename), 'w')
for word in f:
- if word[-3:]=='_JJ':
+ if word[-3:]=='_JJ' or word[-4:]=='_JJR':
w.write("%s\n" % word)
w.close()
View
@@ -13,11 +13,13 @@
POS_POSITION_DIR="pos_position"
POS_PARTOFSPEECH_DIR="pos_tagged"
POS_ADJ_DIR="pos_adj"
+POS_VERB_DIR="pos_verb"
NEG_DIR="neg"
NEG_POSITION_DIR="neg_position"
NEG_PARTOFSPEECH_DIR="neg_tagged"
NEG_ADJ_DIR="neg_adj"
+NEG_VERB_DIR="neg_verb"
YELP_DIR = "yelp/default"
@@ -218,14 +220,16 @@ def select_dataset(dataset):
return {'default':(POS_DIR, NEG_DIR), #untagged
'partofspeech':(POS_PARTOFSPEECH_DIR, NEG_PARTOFSPEECH_DIR), #part of speech tagged
'position':(POS_POSITION_DIR, NEG_POSITION_DIR), #position tagged
- 'adjectives':(POS_ADJ_DIR, NEG_ADJ_DIR) #adjectives tagged
+ 'adjectives':(POS_ADJ_DIR, NEG_ADJ_DIR), #adjectives tagged
+ 'verbs':(POS_VERB_DIR, NEG_VERB_DIR), #verbs tagged
}[dataset]
def select_extradata(dataset,stars):
- return {'default':("%s/%sstar" % (YELP_DIR,stars)), #yelp untagged
- 'partofspeech':("%s/%sstar_tagged" % (YELP_DIR,stars)), #yelp part of speech tagged
- 'position':("%s/%sstar_position" % (YELP_DIR,stars)), #yelp position tagged
- 'adjectives':("%s/%sstar_adj" % (YELP_DIR,stars)), #yelp adjectives only
+ return {'default':("%s/%sstar_limited" % (YELP_DIR,stars)), #yelp untagged
+ 'partofspeech':("%s/%sstar_limited_tagged" % (YELP_DIR,stars)), #yelp part of speech tagged
+ 'position':("%s/%sstar_limited_position" % (YELP_DIR,stars)), #yelp position tagged
+ 'adjectives':("%s/%sstar_limited_adj" % (YELP_DIR,stars)), #yelp adjectives only
+ 'verbs':("%s/%sstar_limited_verb" % (YELP_DIR,stars)), #yelp verbs only
}[dataset]
def test(classif, n=1, train_size=500, mode='k', iterations=1, dataset='',
@@ -267,9 +271,11 @@ def test(classif, n=1, train_size=500, mode='k', iterations=1, dataset='',
if __name__ == "__main__":
- n = [2]
+
+ n = [1]
dataset = 'default'
- limit = [16165]
+ limit = None
+
binary = True
idf = False
negation = True
@@ -279,6 +285,7 @@ def test(classif, n=1, train_size=500, mode='k', iterations=1, dataset='',
iterations = 3
extra_dataset=None
+
# print "Bayes:"
# test(classifier.BayesClassifier,n=n,train_size=train_size,mode=mode,iterations=iterations,
# dataset=dataset,extra_dataset=extra_dataset,limit=limit,binary=binary, idf=idf, negation = negation)
View
@@ -1,6 +1,7 @@
mind-fuck_JJ
cool_JJ
bad_JJ
+harder_JJR
lost_JJ
good_JJ
bad_JJ
View
@@ -27,6 +27,7 @@ nice_JJ
teenage_JJ
american_JJ
juvenile_JJ
+older_JJR
hard_JJ
predictable_JJ
likely_JJ
View
@@ -12,6 +12,7 @@ novel_JJ
good_JJ
good_JJ
earth-normal_JJ
+easier_JJR
advanced_JJ
little_JJ
female_JJ
@@ -45,10 +46,14 @@ boring_JJ
standard_JJ
haunted_JJ
great_JJ
+newer_JJR
unimpressive_JJ
digital_JJ
short_JJ
red_JJ
+better_JJR
+better_JJR
+better_JJR
following_JJ
classic_JJ
precinct_JJ
View
@@ -18,6 +18,7 @@ ridiculous_JJ
self-righteous_JJ
whole_JJ
unpleasant_JJ
+further_JJR
private_JJ
hired_JJ
wealthy_JJ
@@ -26,6 +27,7 @@ safe_JJ
young_JJ
specialized_JJ
easy_JJ
+deeper_JJR
obsessed_JJ
little_JJ
flickering_JJ
View
@@ -42,6 +42,7 @@ bad_JJ
weak_JJ
unimaginative_JJ
pivotal_JJ
+stronger_JJR
able_JJ
moody_JJ
pastoral_JJ
View
@@ -40,6 +40,7 @@ shoddy_JJ
banal_JJ
plentiful_JJ
hanging_JJ
+older_JJR
certain_JJ
boring_JJ
acting_JJ
View
@@ -10,6 +10,7 @@ roman_JJ
catholic_JJ
popular_JJ
leary_JJ
+easier_JJR
said_JJ
wrong_JJ
tiny_JJ
View
@@ -56,6 +56,7 @@ litefoot_JJ
mystical_JJ
huge_JJ
following_JJ
+worse_JJR
convincing_JJ
martial_JJ
believable_JJ
View
@@ -24,6 +24,7 @@ shakespearean_JJ
free_JJ
wonderful_JJ
early_JJ
+faster_JJR
nice_JJ
interesting_JJ
other_JJ
View
@@ -8,6 +8,7 @@ minor_JJ
good_JJ
only_JJ
open_JJ
+further_JJR
good_JJ
bad_JJ
unlikable_JJ
@@ -16,6 +17,7 @@ bad_JJ
inner_JJ
thin_JJ
shred_JJ
+slower_JJR
old_JJ
violent_JJ
other_JJ
View
@@ -7,6 +7,7 @@ gargantuan_JJ
imposing_JJ
sporting_JJ
big_JJ
+larger_JJR
angry_JJ
big_JJ
digital_JJ
View
@@ -49,4 +49,5 @@ compelling_JJ
religious_JJ
african-american_JJ
awful_JJ
+worse_JJR
dead_JJ
View
@@ -49,4 +49,5 @@ necessary_JJ
ultimate_JJ
integral_JJ
aforementioned_JJ
+better_JJR
certain_JJ
View
@@ -1,4 +1,5 @@
cheap_JJ
+cheaper_JJR
supposed_JJ
bizarre_JJ
endearing_JJ
View
@@ -1,6 +1,7 @@
potential_JJ
disappointing_JJ
great_JJ
+worse_JJR
sole_JJ
intelligent_JJ
then_JJ
@@ -33,6 +34,7 @@ supposed_JJ
red_JJ
stupid_JJ
cliched_JJ
+worse_JJR
interesting_JJ
complex_JJ
original_JJ
@@ -48,6 +50,7 @@ credible_JJ
respective_JJ
intriguing_JJ
great_JJ
+earlier_JJR
flawed_JJ
just_JJ
half-assed_JJ
View
@@ -35,6 +35,7 @@ encouraging_JJ
unremarkable_JJ
consistent_JJ
energetic_JJ
+quieter_JJR
successful_JJ
haughty_JJ
spare_JJ
View
@@ -38,6 +38,7 @@ gritty_JJ
close_JJ
recent_JJ
serious_JJ
+wiser_JJR
hardened_JJ
criminal_JJ
bad_JJ
View
@@ -1,4 +1,5 @@
follow-up_JJ
+better_JJR
entertaining_JJ
just_JJ
unlikely_JJ
View
@@ -11,6 +11,7 @@ various_JJ
different_JJ
unsuccessful_JJ
dull_JJ
+worse_JJR
social_JJ
other_JJ
confusing_JJ
View
@@ -22,9 +22,11 @@ graphic_JJ
uneasy_JJ
exploitative_JJ
gratuitous_JJ
+zipper_JJR
female_JJ
supposed_JJ
bare_JJ
+harder_JJR
literal_JJ
short_JJ
rampant_JJ
View
@@ -29,3 +29,4 @@ fatal_JJ
soaring_JJ
worrying_JJ
slim_JJ
+younger_JJR
View
@@ -30,6 +30,7 @@ unnatural_JJ
placid_JJ
little_JJ
semi-mystical_JJ
+better_JJR
urban_JJ
radioactive_JJ
outer_JJ
View
@@ -33,6 +33,7 @@ bad_JJ
curious_JJ
hurricaine_JJ
hateful_JJ
+better_JJR
other_JJ
great_JJ
tight_JJ
View
@@ -12,6 +12,7 @@ lost_JJ
reputable_JJ
clear_JJ
monstrous_JJ
+worse_JJR
other_JJ
normal_JJ
shocking_JJ
View
@@ -20,6 +20,7 @@ other_JJ
random_JJ
evident_JJ
fancy_JJ
+better_JJR
interesting_JJ
traditional_JJ
hackneyed_JJ
View
@@ -45,3 +45,4 @@ high_JJ
explicit_JJ
pervasive_JJ
appropriate_JJ
+younger_JJR
View
@@ -20,7 +20,9 @@ big_JJ
funny_JJ
geeky_JJ
bad_JJ
+better_JJR
supposed_JJ
+younger_JJR
nerdy_JJ
funny_JJ
popular_JJ
View
@@ -6,6 +6,7 @@ paramount_JJ
nervous_JJ
other_JJ
hard_JJ
+worse_JJR
overworked_JJ
cloud-choked_JJ
initial_JJ
Oops, something went wrong.

0 comments on commit 5361ed9

Please sign in to comment.