## 1. Reading files  
  - reviews.txt: reviews 
  - labels.txt: labels

In [3]:
import System.Environment
import Control.Monad

readReviews = do
  reviews <- readFile "reviews.txt"
  return $ lines reviews
  
readLabels = do
  labels <- readFile "labels.txt"
  return $ lines labels

## 2. Count the numbers

- total word count in reviews
- positive/negative word count in reviews


In [4]:
import Data.Map as M
import Test.QuickCheck

type Counter = Map String Int

In [5]:
import Data.List.Split
splitReviews = splitOn " "

In [6]:
-- getCount
getCount :: String -> Counter -> Int
getCount word cnts = if (M.member word cnts) then (cnts M.! word) else 0

-- test: getCount
testGetCount :: String -> Int -> Bool
testGetCount word count = (getCount word $ M.fromList [(word, count)]) == (count :: Int)

print "getCount :: String -> Counter -> Int"
quickCheck testGetCount

"getCount :: String -> Counter -> Int"

+++ OK, passed 100 tests.

In [7]:
-- addCount
addCount :: Counter -> String -> Counter
addCount counter word = M.insert word ((+) 1 $ getCount word counter) counter 

-- test: addCount'
wordCountMap :: String -> Int -> Counter
wordCountMap word count = M.fromList [(word, count)]

testAddCount count word = let 
    result = (addCount (wordCountMap word count) word) 
    in result == (M.fromList [(word, count+1)])

print "addCount :: Counter -> String -> Counter"
quickCheck testAddCount

"addCount :: Counter -> String -> Counter"

+++ OK, passed 100 tests.

In [8]:
type Counters = (Int, (Counter, (Counter, Counter))) 

-- addPosNegCount
addPosNegCount :: String -> Counters -> String -> Counters
addPosNegCount label reduced word =
    let (index, (totalCount, (positiveCount, negativeCount))) = reduced
        newPositiveCount = if label == "positive" then addCount positiveCount word else positiveCount
        newNegativeCount = if label == "negative" then addCount negativeCount word else negativeCount
        newTotalCount = addCount totalCount word in 
            (index + 1, (newTotalCount, (newPositiveCount, newNegativeCount)))

-- test: addPosNegCount
testAddPosCount :: String -> Bool
testAddPosCount word = let 
    result = addPosNegCount "positive" (0, (M.empty, (M.empty, M.empty))) word
    in result == (1,(M.fromList [(word,1)],(M.fromList [(word,1)],M.fromList [])))

testAddNegCount :: String -> Bool
testAddNegCount word = let 
    result = addPosNegCount "negative" (0, (M.empty, (M.empty, M.empty))) word
    in result == (1,(M.fromList [(word,1)],(M.fromList [],M.fromList [(word,1)])))

print "addPosNegCount"
print "positive"
quickCheck testAddPosCount
print "negative"
quickCheck testAddNegCount

"addPosNegCount"

"positive"

+++ OK, passed 100 tests.

"negative"

+++ OK, passed 100 tests.

In [167]:
-- groupWords
groupWords :: String -> [String] -> Counters -> Counters
groupWords label wordList initialData = 
    Prelude.foldl (addPosNegCount label) initialData wordList

-- test: groupWords
emptyCounters = (0, (M.empty, (M.empty, M.empty)))

testGroupPositiveWords :: [String] -> Bool
testGroupPositiveWords words = let 
    result = groupWords "positive" words emptyCounters
    wordsLength = length words
    ones = replicate wordsLength 1
    counterWithWords = Prelude.foldl (\a one -> addCount a one) M.empty words
    in result == (wordsLength,(counterWithWords,(counterWithWords, M.fromList [])))

testGroupNegativeWords :: [String] -> Bool
testGroupNegativeWords words = let 
    result = groupWords "negative" words emptyCounters
    wordsLength = length words
    ones = replicate wordsLength 1
    counterWithWords = Prelude.foldl (\a one -> addCount a one) M.empty words
    in result == (wordsLength,(counterWithWords,(M.fromList [], counterWithWords)))

print "groupWords :: positive -> [String] -> Counters -> Counters"
quickCheck testGroupPositiveWords

print "groupWords :: negative -> [String] -> Counters -> Counters"
quickCheck testGroupNegativeWords

"groupWords :: positive -> [String] -> Counters -> Counters"

+++ OK, passed 100 tests.

"groupWords :: negative -> [String] -> Counters -> Counters"

+++ OK, passed 100 tests.

In [163]:
counts label review initialData = let splitted = splitReviews review in
        groupWords label splitted initialData

inputSize = 20
countAll readLabels readReviews = do  
    labels <- readLabels
    reviews <- readReviews
    let pair = zip (take inputSize labels) (take inputSize reviews)
        initialData = (0, (M.empty, (M.empty, M.empty)))
        mergeCounts = (\all -> \one -> counts (fst one) (snd one) all) in
        return $ Prelude.foldl mergeCounts initialData pair 

In [164]:
countAll readLabels readReviews >>= 
    \(total, (total_count, (pos, neg))) -> 
    let x = show $ size total_count
        y = show $ size pos
        z = show $ size neg
        in print $ "total words: " ++ x  ++ ", positive: " ++ y ++ ", negative: " ++ z


"total words: 1607, positive: 920, negative: 986"

In [162]:
safeTake :: Counter -> String -> Double
safeTake m word = if member word m then fromIntegral $ m ! word else 0

calcRatio :: Counter -> Counter -> String -> Double
calcRatio pos neg word = let 
    ratio = (pos `safeTake` word) / (1 + (neg `safeTake` word)) in 
        if ratio > 0 then log ratio else  0 - log (1/(ratio + 0.01))
calcRatio (M.fromList [("ab", 3)]) (M.fromList []) "ab"

mergeRatio :: Counter -> Counter -> Map String Double -> String -> Map String Double 
mergeRatio pos neg ratios word = let
    ratio = calcRatio pos neg word in 
    insert word ratio ratios

mergeRatio (M.fromList [("ab", 1)]) (M.fromList []) M.empty "ab"
mergeRatio (M.fromList []) (M.fromList [("ab", 1)]) M.empty "ab"

positiveToNegRatios = do
    (wordCount, (allWords, (posWords, negWords))) <- countAll readLabels readReviews
    let merge = mergeRatio posWords negWords in 
        return $ Prelude.foldl merge M.empty (keys allWords)

cutoff = 1
positiveToNegRatios >>= \x -> let r = M.filter (\cnt -> cnt > -cutoff || cnt < cutoff) x in print r

1.0986122886681098

fromList [("ab",0.0)]

fromList [("ab",-4.605170185988092)]

fromList [("",0.10018585986780887),(".",0.0),("a",0.27193371548364176),("abducted",0.0),("ability",-4.605170185988092),("able",0.0),("about",0.11778303565638346),("absence",-4.605170185988092),("absurd",-4.605170185988092),("abuse",0.0),("abused",0.0),("academy",0.0),("acclaimed",0.0),("accomplice",-4.605170185988092),("according",0.0),("accumulated",0.0),("accusations",-4.605170185988092),("accused",0.0),("achieve",0.0),("achieving",0.0),("across",-4.605170185988092),("acting",0.4054651081081644),("action",-4.605170185988092),("actions",0.0),("actor",0.0),("actors",-4.605170185988092),("actress",0.0),("actresses",-0.6931471805599453),("actual",0.0),("actually",0.4054651081081644),("add",-4.605170185988092),("adopted",0.0),("adoptive",0.0),("adults",0.0),("adventure",-4.605170185988092),("affect",0.0),("affected",-4.605170185988092),("affects",0.0),("affluence",-4.605170185988092),("afraid",-4.605170185988092),("after",0.0),("again",-0.6931471805599453),("age",0.0),("ago",-4.6051701859