In [1]:
import pymongo
from string import ascii_letters, whitespace
from collections import defaultdict, Counter
import pickle

In [18]:
# Characters that normalize_text keeps: ASCII letters plus whitespace.
# Everything else (digits, punctuation, non-ASCII) is dropped.
ascii_set = set(ascii_letters)
allowed_chars = ascii_set | set(whitespace)

In [19]:
# Handle to the "cocaine2" collection of the "movie" database, reached through
# a MongoClient created with pymongo's default connection settings.
c = pymongo.MongoClient()["movie"]["cocaine2"]

In [20]:
def normalize_text(text, allowed=None):
    """Strip disallowed characters from ``text`` and lower-case the rest.

    Parameters
    ----------
    text : str
        Raw text to normalize.
    allowed : container of single characters, optional
        Characters to keep. Defaults to the module-level ``allowed_chars``
        (ASCII letters and whitespace).

    Returns
    -------
    str
        ``text`` with every character not in ``allowed`` removed, lower-cased.
        Whitespace is kept as-is so the result can be tokenized with
        ``.split()``.
    """
    if allowed is None:
        allowed = allowed_chars
    # "".join(...) always yields a string. filter() on a string only returns a
    # string on Python 2; on Python 3 it returns a lazy iterator with no
    # .lower() method.
    return "".join(ch for ch in text if ch in allowed).lower()

def process_document(doc, per_article=True):
    """Reduce one article record to the fields used for word counting.

    Parameters
    ----------
    doc : mapping
        Must provide ``"ocr_eng"`` and ``"title"`` (text) and ``"date"``
        (an object with ``.year`` and ``.month`` attributes).
    per_article : bool
        When true (the default) the body tokens are collapsed into a set, so
        each word is represented at most once per article. When false the
        full token list, repeats included, is returned.

    Returns
    -------
    dict
        Keys ``"year"``, ``"month"``, ``"content"`` (set or list of
        normalized body tokens) and ``"title"`` (list of normalized title
        tokens).
    """
    body_tokens = normalize_text(doc["ocr_eng"]).split()
    content = set(body_tokens) if per_article else body_tokens
    title = normalize_text(doc["title"]).split()
    published = doc["date"]
    return {
        "year": published.year,
        "month": published.month,
        "content": content,
        "title": title,
    }

In [21]:
# Two-level counter: outer key -> word -> count. Both levels create missing
# entries on first access, so counts can be incremented without setup.
year_word_matrix = defaultdict(lambda: defaultdict(int))

In [22]:
def process_collection(coll, matrix, granularity="year"):
    """Accumulate per-period word counts for every document in ``coll``.

    Each document yielded by ``coll.find()`` is passed to
    ``process_document``; every entry of its ``"content"`` then adds one to
    ``matrix[period][word]``.

    Parameters
    ----------
    coll : object with a ``find()`` method
        Source of documents (e.g. a pymongo collection).
    matrix : nested mapping, period -> word -> count
        Updated in place. It must create missing entries on access, e.g.
        ``defaultdict(lambda: defaultdict(int))``.
    granularity : str
        ``"year"`` (default) buckets by the integer year. Any other value
        buckets by the string ``"<year>-<month>"``.

    Returns
    -------
    The same ``matrix`` object that was passed in.

    Notes
    -----
    A running count of processed documents, followed by the number of
    documents that could not be processed, is written to stdout. Documents
    that raise during processing are counted and skipped (best effort).
    """
    import sys  # local import so the cell does not depend on a new top-level import

    fails = 0
    processed = 0
    year_level = granularity == "year"
    for doc in coll.find():
        processed += 1
        # "\r" rewinds to the start of the line so the counter updates in place.
        sys.stdout.write("\r" + str(processed))
        try:
            data = process_document(doc)
            if year_level:
                k = data["year"]
            else:
                k = str(data["year"]) + "-" + str(data["month"])
            # Title tokens are deliberately left out of the counts for now.
            for word in data["content"]:
                # Bucket by the computed key; indexing by data["year"] here
                # would silently ignore the requested granularity.
                matrix[k][word] += 1
        except Exception:
            # Skip malformed documents, but let KeyboardInterrupt and
            # SystemExit propagate so a long run can still be aborted.
            fails += 1
    # Failure count goes on the same line as the final progress value.
    sys.stdout.write("%s%d\n" % (" " if processed else "", fails))
    return matrix

In [23]:
# process_collection updates year_word_matrix in place and returns it, so
# full_matrix is the same object. Re-running this cell without first
# re-creating year_word_matrix adds the counts a second time.
full_matrix = process_collection(c, year_word_matrix)

55990 511


In [24]:
# Number of distinct tokens recorded for the year 1900 before any cleanup.
len(full_matrix[1900]) # bad: far too many distinct tokens (presumably OCR noise -- TODO confirm)

2073265

In [25]:
def cleanup_matrix(mat, threshold=1):
    """Drop rarely seen words from a period -> word -> count matrix.

    Parameters
    ----------
    mat : mapping of mappings
        ``mat[period][word]`` is a count.
    threshold : number
        Only words whose count is strictly greater than this are kept.
        The default of 1 removes words seen exactly once in a period.

    Returns
    -------
    dict
        A new plain ``dict`` of plain ``dict`` objects; ``mat`` is not
        modified. A period whose words are all filtered out is kept with an
        empty inner dict.
    """
    cleaned = {}
    # Plain key iteration works on both Python 2 and 3 and does not build a
    # temporary list of items for the (potentially very large) inner dicts.
    for period in mat:
        counts = mat[period]
        cleaned[period] = {
            word: counts[word] for word in counts if counts[word] > threshold
        }
    return cleaned

def invert_keys(mat):
    """Swap the two key levels of a nested mapping.

    Turns ``mat[k1][k2] == v`` into ``result[k2][k1] == v``.

    Parameters
    ----------
    mat : mapping of mappings

    Returns
    -------
    dict
        A new plain ``dict`` whose values are plain ``dict`` objects;
        ``mat`` is not modified.
    """
    flipped = defaultdict(dict)
    # Plain key iteration works on both Python 2 and 3 (iterkeys/iteritems
    # only exist on Python 2).
    for outer_key in mat:
        inner = mat[outer_key]
        for inner_key in inner:
            flipped[inner_key][outer_key] = inner[inner_key]
    # Return a plain dict so that looking up an unknown key raises KeyError
    # instead of silently creating an empty entry.
    return dict(flipped)

In [26]:
# Drop words counted only once in a year, then flip the nesting so the result
# is indexed word -> year -> count.
clean_matrix = invert_keys(cleanup_matrix(full_matrix))

In [27]:
# Spot check: per-year counts for a single word after cleanup and inversion.
clean_matrix["negro"] # good.

{1854: 3,
 1855: 2,
 1856: 3,
 1858: 3,
 1859: 10,
 1860: 61,
 1861: 30,
 1862: 5,
 1863: 9,
 1864: 3,
 1865: 3,
 1866: 17,
 1867: 3,
 1869: 4,
 1870: 3,
 1871: 24,
 1872: 29,
 1877: 4,
 1879: 4,
 1880: 4,
 1881: 6,
 1882: 3,
 1883: 2,
 1884: 5,
 1885: 82,
 1886: 63,
 1887: 66,
 1888: 51,
 1889: 55,
 1890: 101,
 1891: 69,
 1892: 85,
 1893: 129,
 1894: 96,
 1895: 77,
 1896: 127,
 1897: 213,
 1898: 242,
 1899: 316,
 1900: 475,
 1901: 618,
 1902: 375,
 1903: 366,
 1904: 263,
 1905: 327,
 1906: 328,
 1907: 452,
 1908: 381,
 1909: 481,
 1910: 342,
 1911: 290,
 1912: 212,
 1913: 274,
 1914: 182,
 1915: 170,
 1916: 75,
 1917: 44,
 1918: 38,
 1919: 93,
 1920: 99,
 1921: 82,
 1922: 87}

In [None]:
# don't do this unless you have 10 minutes to wait
# pickle.dump(dict(full_matrix), open("full_word_cooccurences.pickle","w"), protocol=2) # 500 MB

In [28]:
# this is better
# Pickle protocol 2 is a binary format, so the file must be opened in binary
# mode ("wb"). The context manager closes (and flushes) the file even if
# dumping fails.
with open("cleaned_word_cooccurences.pickle", "wb") as pickle_file:
    pickle.dump(clean_matrix, pickle_file, protocol=2) # 50 MB

## TODO

1. Add baseline ("base level") co-occurrence statistics computed from 1,000 random articles (requires modifying the data-load script)

2. Add visualization or data dump to .csv

In [3]:
# Reload the cleaned word -> year -> count matrix written by the dump cell.
# NOTE: unpickling can execute arbitrary code; only load files this notebook
# produced itself.
# Binary mode ("rb") matches the binary pickle protocol used when writing,
# and the context manager closes the file handle.
with open("cleaned_word_cooccurences.pickle", "rb") as pickle_file:
    clean_matrix = pickle.load(pickle_file)

In [6]:
# Created without a default_factory, so it behaves like a plain dict.
# It is not used in the cells visible here.
yearly_cocaine_counts = defaultdict()
# Number of articles per publication year; only the "date" field is requested
# from the collection.
articles_by_year = Counter(
    doc["date"].year for doc in c.find({}, {"date": 1, "_id": 0})
)

In [29]:
# word -> year -> (count for that word and year) / (number of articles dated
# that year). float() forces true division on Python 2.
# .items() is used instead of iteritems()/iterkeys() so the cell also runs on
# Python 3.
ratio_matrix = {
    word: {
        year: float(count) / articles_by_year[year]
        for year, count in year_counts.items()
    }
    for word, year_counts in clean_matrix.items()
}

In [33]:
# Spot check: per-year ratios (count / articles that year) for a single word.
ratio_matrix["assault"]

{1859: 0.06451612903225806,
 1860: 0.022556390977443608,
 1861: 0.10714285714285714,
 1863: 0.12,
 1864: 0.05263157894736842,
 1866: 0.039603960396039604,
 1867: 0.18181818181818182,
 1869: 0.08,
 1871: 0.3137254901960784,
 1872: 0.2,
 1881: 0.1568627450980392,
 1882: 0.20689655172413793,
 1884: 0.05128205128205128,
 1885: 0.10894941634241245,
 1886: 0.127208480565371,
 1887: 0.12020033388981637,
 1888: 0.11049723756906077,
 1889: 0.10509554140127389,
 1890: 0.05847953216374269,
 1891: 0.1580547112462006,
 1892: 0.11096938775510204,
 1893: 0.09414758269720101,
 1894: 0.10027598896044158,
 1895: 0.08975834292289989,
 1896: 0.10526315789473684,
 1897: 0.09338747099767981,
 1898: 0.09435846230654019,
 1899: 0.11055276381909548,
 1900: 0.07569825110937092,
 1901: 0.06829372725326482,
 1902: 0.08723958333333333,
 1903: 0.06688358640636298,
 1904: 0.07288790723357261,
 1905: 0.06598182536546819,
 1906: 0.075790385448246,
 1907: 0.07486388384754991,
 1908: 0.0782552918537524,
 1909: 0.0860771

In [31]:
# Sanity check: number of articles dated 1880 (the denominator of the ratio).
articles_by_year[1880]

7

In [32]:
# Sanity check: count for one word in 1880 (the numerator of the ratio).
clean_matrix["negro"][1880]

4

In [35]:
# List (year, article_count) pairs in chronological order. Small counts in
# the early years make the per-year ratios for those years unreliable.
# sorted() works on both Python 2 and 3; on Python 3 .items() returns a view
# that has no .sort() method.
years = sorted(articles_by_year.items())
for y in years:
    # print(y) with a single tuple prints the same text on Python 2 and 3.
    print(y)

(1836, 1)
(1841, 1)
(1842, 1)
(1844, 4)
(1845, 3)
(1846, 1)
(1849, 1)
(1850, 1)
(1851, 2)
(1852, 4)
(1853, 2)
(1854, 3)
(1855, 5)
(1856, 6)
(1857, 5)
(1858, 26)
(1859, 31)
(1860, 133)
(1861, 56)
(1862, 18)
(1863, 25)
(1864, 38)
(1865, 6)
(1866, 101)
(1867, 22)
(1868, 3)
(1869, 25)
(1870, 21)
(1871, 51)
(1872, 50)
(1873, 8)
(1874, 10)
(1875, 6)
(1876, 6)
(1877, 15)
(1878, 5)
(1879, 8)
(1880, 7)
(1881, 51)
(1882, 29)
(1883, 11)
(1884, 39)
(1885, 514)
(1886, 566)
(1887, 599)
(1888, 362)
(1889, 314)
(1890, 684)
(1891, 658)
(1892, 784)
(1893, 1179)
(1894, 1087)
(1895, 869)
(1896, 836)
(1897, 1724)
(1898, 2003)
(1899, 2587)
(1900, 3831)
(1901, 4671)
(1902, 3072)
(1903, 2766)
(1904, 1811)
(1905, 2531)
(1906, 2309)
(1907, 2204)
(1908, 1559)
(1909, 2126)
(1910, 2208)
(1911, 1750)
(1912, 1614)
(1913, 1568)
(1914, 1212)
(1915, 1219)
(1916, 686)
(1917, 381)
(1918, 323)
(1919, 720)
(1920, 690)
(1921, 555)
(1922, 577)
