In [1]:
# load the very latest version
# NOTE(review): absolute, machine-specific path -- this cell only works on the
# original author's machine unless labMTsimple is installed some other way.
import sys
sys.path.append("/Users/andyreagan/tools/python/labMTsimple/")
# Star imports: the sentiment-dictionary classes used further down (LabMT, LIWC,
# WK, ANEW, MPQA, OL) are presumably provided by these two modules -- TODO confirm.
from labMTsimple.speedy import *
from labMTsimple.storyLab import *

import re
import codecs
from os import listdir,mkdir
from os.path import isfile,isdir
import matplotlib
# Select the PDF backend; this has to happen before pyplot is imported below.
matplotlib.use("pdf")
import matplotlib.pyplot as plt
from matplotlib import rc,rcParams
# Global figure styling: small tick labels, Computer Modern serif font.
rc("xtick", labelsize=8)
rc("ytick", labelsize=8)
rc("font",**{"family":"serif","serif":["cmr10"]})
# rcParams["mathtext.fontset"] = "stix"
# rcParams["font.family"] = "STIXGeneral"
# Render all text through LaTeX (needs a working LaTeX installation).
rc("text", usetex=True)
# Figure widths passed to matplotlib's figsize (which is in inches):
# a full-width figure and a half-width one.
figwidth_onecol = 8.5
figwidth_twocol = figwidth_onecol/2

import numpy as np
from json import loads
import csv
from datetime import datetime,timedelta
import pickle

from subprocess import call

from scipy.stats import pearsonr

# Not read by any cell visible in this notebook; presumably consulted by the
# star-imported helper modules -- TODO confirm.
error_logging = True
# NOTE(review): second absolute, machine-specific path.
sys.path.append("/Users/andyreagan/tools/python/kitchentable")
# mysavefig (used by the plotting code below) presumably comes from here -- TODO confirm.
from dogtoys import *

In [10]:
from analyzeAll import *

In [7]:
def make_coverage_plot(allcountsListSorted,allwordsListSorted,corpus_title,maxCount=15000,save_individual_plots=False):
    """Plot how well each sentiment dictionary covers the top-ranked words of a corpus.

    For each of the six dictionaries (labMT, ANEW, LIWC, MPQA, OL, WK) and
    for the first ``maxCount`` ranks of the corpus, this computes:

    * the running fraction of distinct words the dictionary matches,
    * the running fraction of word occurrences (count-weighted) it matches,
    * the final value of both fractions.

    The three quantities are drawn side by side (two line panels and one bar
    panel) and saved through ``mysavefig`` as ``coverage-<corpus_title>.pdf``
    in ``../figures/coverage``.

    Parameters
    ----------
    allcountsListSorted : sequence of numbers
        Per-word counts. Assumed to be ordered by rank and aligned
        element-for-element with ``allwordsListSorted`` -- confirm against
        the corpus loaders.
    allwordsListSorted : sequence of str
        The words, in the same order as the counts.
    corpus_title : str
        Label inserted into the output file names.
    maxCount : int, optional
        Upper bound on the number of ranks analysed (default 15000). It is
        clamped to the amount of data supplied, so a small corpus no longer
        fails with an x/y length mismatch when plotting.
    save_individual_plots : bool, optional
        When True, also save each panel as its own stand-alone figure
        (default False, matching the previous hard-coded behaviour).

    Raises
    ------
    ValueError
        If there is no word at all to analyse.
    """
    titles = ["labMT","ANEW","LIWC","MPQA","OL","WK"]

    # Never look past the end of the data: every array below has exactly
    # n_words entries, so the x and y values handed to plot() always agree.
    n_words = min(maxCount,len(allwordsListSorted),len(allcountsListSorted))
    if n_words < 1:
        raise ValueError("no words to compute coverage for in corpus {0}".format(corpus_title))

    words = allwordsListSorted[:n_words]
    counts = np.asarray(allcountsListSorted[:n_words])
    ranks = np.arange(n_words)
    total = np.sum(counts)
    # Loop-invariant denominator of the count-weighted running coverage.
    cumulative_counts = np.cumsum(counts)

    # Four evenly spaced rank ticks; this is [0,5000,10000,15000] for the
    # default 15000 words.
    tick_step = max(1,n_words//3)
    rank_ticks = list(range(0,n_words+1,tick_step))

    def coverageMaker(sentimentDict):
        """Return (per-word match flags, running fraction of words matched)."""
        # matcherBool presumably returns a truthy value when the dictionary
        # knows the word -- confirm against labMTsimple.
        matched = np.array([sentimentDict.matcherBool(word) for word in words])
        runningFraction = np.cumsum(matched)/(ranks+1)
        return matched,runningFraction

    def totalCoverage(indices):
        """Counts of the matched words, zero where the word is not matched."""
        return indices*counts

    def covS(indices):
        """Fraction of all word occurrences that are matched."""
        return np.sum(totalCoverage(indices))/total

    def relativeCoverage(indices):
        """Running fraction of word occurrences matched, by rank."""
        return np.cumsum(totalCoverage(indices))/cumulative_counts

    # Build every dictionary with stopVal=0.0 ("no stopval" in the original
    # notebook). Construction order is kept as before so the loading
    # messages appear in the same sequence.
    stopVal = 0.0
    constructors = [("labMT",LabMT),("LIWC",LIWC),("WK",WK),("ANEW",ANEW),("MPQA",MPQA),("OL",OL)]
    sentimentDicts = {}
    for name,constructor in constructors:
        sentimentDicts[name] = constructor(stopVal=stopVal)

    # Evaluate in the order of `titles` so the curves line up with the legend.
    allCoverage = []
    allCovP = []
    for name in titles:
        matched,runningFraction = coverageMaker(sentimentDicts[name])
        allCoverage.append(matched)
        allCovP.append(runningFraction)
    allCovPfinal = [runningFraction[-1] for runningFraction in allCovP]

    coveragesBySet = [covS(matched) for matched in allCoverage]
    coveragesBySet2 = [relativeCoverage(matched) for matched in allCoverage]

    barPositions = np.arange(len(allCoverage))

    if save_individual_plots:
        # running fraction of distinct words covered
        plt.figure(num=None, figsize=(14, 9), dpi=600, facecolor="w", edgecolor="k")
        for runningFraction in allCovP:
            plt.plot(ranks,runningFraction,linewidth=2)
        plt.xlabel("Word Rank")
        plt.ylabel("Percentage of individual words covered")
        plt.legend(titles,loc="best")
        mysavefig("word-coverage-by-rank-{0}.pdf".format(corpus_title),folder="../figures/coverage")
        plt.close()

        # final coverage per dictionary, both measures side by side
        fig, ax = plt.subplots()
        ax.bar(barPositions+0.3,coveragesBySet,0.3,color="#ef8a62",)
        ax.bar(barPositions,allCovPfinal,0.3,color="#2b8cbe",)
        ax.set_ylabel("Percentage")
        ax.set_title("Percentage coverage over first "+str(n_words)+" words")
        plt.legend(["Total Coverage","Individual Word Coverage"],loc="best")
        ax.set_xlim([-.15,len(titles)-.3])
        ax.set_xticks(barPositions+.3)
        ax.set_xticklabels( titles )
        ax.set_ylim([0,1])
        mysavefig("total-coverage-bar-chart-{0}.pdf".format(corpus_title),folder="../figures/coverage")
        plt.close()

        # running count-weighted coverage
        plt.figure(num=None, figsize=(14, 9), dpi=600, facecolor="w", edgecolor="k")
        for runningCoverage in coveragesBySet2:
            plt.plot(ranks,runningCoverage,linewidth=2)
        plt.xlabel("Word Rank")
        plt.ylabel("Percentage of total words covered")
        plt.legend(titles,loc="best")
        mysavefig("relative-coverage-over-words-by-rank-{0}.pdf".format(corpus_title),folder="../figures/coverage")
        plt.close()

    # now the full subplot figure
    # (kept on the pyplot "current figure" interface, as in the original,
    # because mysavefig is not handed a figure object)
    plt.figure(num=None, figsize=(figwidth_onecol, figwidth_onecol*.35), dpi=600, facecolor="w", edgecolor="k")

    # left panel: fraction of distinct words covered
    ax = plt.subplot(131)
    for runningFraction in allCovP:
        ax.plot(ranks,runningFraction,linewidth=2)
    ax.set_xlabel("Word Rank",fontsize=12)
    ax.set_ylabel("Percentage of individual words covered",fontsize=12)
    ax.set_ylim([0,1])
    ax.set_yticks([0,.2,.4,.6,.8,1.0])
    ax.set_xlim([0,n_words])
    ax.set_xticks(rank_ticks)

    # middle panel: fraction of word occurrences covered (carries the legend)
    ax = plt.subplot(132)
    for runningCoverage in coveragesBySet2:
        ax.plot(ranks,runningCoverage,linewidth=2)
    ax.set_xlabel("Word Rank",fontsize=12)
    ax.set_ylabel("Percentage of total words covered",fontsize=12)
    ax.legend(titles,loc="best",fontsize=10,ncol=2,framealpha=0.5)
    ax.set_ylim([0,1])
    ax.set_yticks([0,.2,.4,.6,.8,1.0])
    ax.set_xlim([0,n_words])
    ax.set_xticks(rank_ticks)

    # right panel: final values of both measures, one bar pair per dictionary
    ax = plt.subplot(133)
    ax.bar(barPositions+0.3,coveragesBySet,0.3,color="#ef8a62",)
    ax.bar(barPositions,allCovPfinal,0.3,color="#2b8cbe",)
    ax.set_ylabel("Percentage",fontsize=12)
    ax.legend(["Total Coverage","Individual Word\nCoverage"],loc="best",fontsize=10,framealpha=0.5)
    ax.set_xlim([-.15,len(titles)-.3])
    ax.set_xticks( barPositions+.3 )
    ax.set_xticklabels( titles , fontsize=12 , rotation=45 )
    ax.set_ylim( [0,1] )
    ax.set_yticks( [0,.2,.4,.6,.8,1.0] )

    plt.tight_layout(pad=0.0, w_pad=0.0, h_pad=0.5)

    mysavefig("coverage-{0}.pdf".format(corpus_title),folder="../figures/coverage")
    plt.close()

In [8]:
def coverage():
    """Make each of the four main coverage plots.

    Loads the twitter, movieReviews, googleBooks and nyt corpora in turn
    (each loader returns the counts list followed by the words list) and
    passes them to ``make_coverage_plot`` under the corpus name.
    """
    # One entry per corpus: the label used in the progress message and the
    # output file name, paired with the loader that returns its data.
    corpus_loaders = [
        ("twitter",loadTwitter),
        ("movieReviews",loadMovieReviews),
        ("googleBooks",loadGBooks),
        ("nyt",loadNYT),
    ]

    for corpus,load_corpus in corpus_loaders:
        print("making coverage plot for {0}".format(corpus))
        allcountsListSorted,allwordsListSorted = load_corpus()
        make_coverage_plot(allcountsListSorted,allwordsListSorted,corpus)

In [9]:
# Regenerate the coverage figure for all four corpora. Each corpus is loaded in
# full and the six sentiment dictionaries are reloaded for every corpus (see the
# log below); the figures are saved to file rather than displayed in the notebook.
coverage()

making coverage plot for twitter
there are 8441736 unique words in this corpus
loading labMT with stopVal=0.0, for 10222 words
loading data/LIWC/LIWC2007_English100131_words.dic
loading LIWC with stopVal=0.0, for 4483 words
loading WK with stopVal=0.0, for 13915 words
loading ANEW with stopVal=0.0, for 1034 words
loading MPQA with stopVal=0.0, for 7192 words
loading OL with stopVal=0.0, for 6782 words




making coverage plot for movieReviews
there are 49910 unique words in this corpus
loading labMT with stopVal=0.0, for 10222 words
loading data/LIWC/LIWC2007_English100131_words.dic
loading LIWC with stopVal=0.0, for 4483 words
loading WK with stopVal=0.0, for 13915 words
loading ANEW with stopVal=0.0, for 1034 words
loading MPQA with stopVal=0.0, for 7192 words
loading OL with stopVal=0.0, for 6782 words




making coverage plot for googleBooks
there are 9559585 unique words in this corpus
loading labMT with stopVal=0.0, for 10222 words
loading data/LIWC/LIWC2007_English100131_words.dic
loading LIWC with stopVal=0.0, for 4483 words
loading WK with stopVal=0.0, for 13915 words
loading ANEW with stopVal=0.0, for 1034 words
loading MPQA with stopVal=0.0, for 7192 words
loading OL with stopVal=0.0, for 6782 words




making coverage plot for nyt
there are 2626455 unique  words in this corpus
loading labMT with stopVal=0.0, for 10222 words
loading data/LIWC/LIWC2007_English100131_words.dic
loading LIWC with stopVal=0.0, for 4483 words
loading WK with stopVal=0.0, for 13915 words
loading ANEW with stopVal=0.0, for 1034 words
loading MPQA with stopVal=0.0, for 7192 words
loading OL with stopVal=0.0, for 6782 words


