In [1]:
## This script exports some data from the dat file into a csv

## dtm_input_data/dtm_input-mult.dat -> output/csv/year_doc_word.csv

In [2]:
import numpy as np
import pandas as pd

In [3]:
## get input

# Input format (as described by Alex):
#   - the data lives in dtm_input_data/dtm_input-mult.dat
#   - each line is one document, in the correct order
#   - the first number on a line is the count of unique words in the document
#   - every remaining token has the form "word_id:number_of_occurrences"
# Example of a very short document: "3 11:5 15:1 220:7"
#   -> word 11 occurred 5 times, word 15 once, and word 220 seven times

with open('dtm_input_data/dtm_input-mult.dat', 'r') as f:
    mult = list(f)  # raw lines, line endings still attached

# Per document: the whitespace-separated tokens with the leading
# unique-word count dropped.
lines = [raw.strip().split()[1:] for raw in mult]

# Per document: a list of [word_id, count] integer pairs.
words = [
    [[int(field) for field in token.split(':')] for token in tokens]
    for tokens in lines
]

In [4]:
# Scratch check of dict.setdefault: on a missing key it stores the default
# and returns it; an existing key would be left untouched.
freq = {}
# freq['a'] = 99
x = freq.setdefault('a', 0)
print(freq['a'])

0


In [5]:
## word_ids
# Invert the per-document data into a per-word lookup:
#   word_doc_freq[word_id][doc_id] -> count of that word in that document
# Document ids are simply the positions of the documents in `words`.
doc_ids = list(range(len(words)))
word_doc_freq = {}
for doc_id, doc in enumerate(words):
    for entry in doc:
        word_id = entry[0]
        freq = entry[1]
        # setdefault creates the inner dict the first time a word id is seen
        word_doc_freq.setdefault(word_id, {})[doc_id] = freq
# word ids in order of first appearance
word_ids = [*word_doc_freq]

In [6]:
# Sanity check: number of document ids collected (one per entry of `words`).
len(doc_ids)

6920

In [7]:
# Sanity check: number of distinct word ids seen across all documents.
len(word_ids)

9710

In [8]:
## let's make a tidy data frame

In [9]:
## one document per row, one word per column, word frequency in cells
# Builds a dense (wide) table as a list of dicts: every row starts with
# 'doc_id' and then holds one entry per word id, in `word_ids` order, so all
# rows share the same keys in the same order.
list_of_rows = list()
for doc_id in doc_ids:
    row = dict()
    row['doc_id'] = doc_id
    for word_id in word_ids:
        # A word that never occurs in this document has no entry in its
        # per-document dict, so fall back to 0. Using .get() rather than a
        # bare `except:` means only the missing-key case becomes 0; a kernel
        # interrupt (KeyboardInterrupt) or a genuine error in this long loop
        # is no longer swallowed and silently recorded as a zero count.
        freq = word_doc_freq[word_id].get(doc_id, 0)
        row[word_id] = freq
    list_of_rows.append(row)

In [10]:
# Turn the row dicts into a DataFrame: one row per document, with the dict
# keys ('doc_id' plus every word id) as columns.
doc_word_freq = pd.DataFrame(list_of_rows)

In [11]:
# Write the document-by-word table to CSV; index=False leaves out the pandas
# row index ('doc_id' is already a regular column).
# NOTE(review): presumably the output/csv/ directory must already exist — confirm.
doc_word_freq.to_csv('output/csv/doc_word_freq.csv', index=False)

In [12]:
# Sanity check: one row was built per document id.
len(list_of_rows)

6920

In [13]:
# Inspect the first row: 'doc_id' followed by one count per word id.
# NOTE(review): this displays every word column of the row; consider showing
# only a slice to keep the notebook output short.
list_of_rows[0]

{'doc_id': 0,
 0: 1,
 1: 1,
 2: 1,
 3: 2,
 4: 1,
 5: 3,
 6: 1,
 7: 28,
 8: 6,
 9: 1,
 10: 1,
 11: 2,
 12: 37,
 13: 3,
 14: 8,
 15: 1,
 16: 1,
 17: 4,
 18: 1,
 19: 1,
 20: 1,
 21: 1,
 22: 1,
 23: 1,
 24: 1,
 25: 1,
 26: 1,
 27: 1,
 28: 1,
 29: 4,
 30: 1,
 31: 1,
 32: 1,
 33: 1,
 34: 1,
 35: 1,
 36: 1,
 37: 1,
 38: 1,
 39: 1,
 40: 1,
 41: 1,
 42: 1,
 43: 1,
 44: 1,
 45: 2,
 46: 1,
 47: 1,
 48: 2,
 49: 1,
 50: 2,
 51: 6,
 52: 1,
 53: 2,
 54: 1,
 55: 3,
 56: 4,
 57: 1,
 58: 1,
 59: 1,
 60: 1,
 61: 1,
 62: 1,
 63: 5,
 64: 6,
 65: 1,
 66: 1,
 67: 1,
 68: 1,
 69: 3,
 70: 1,
 71: 1,
 72: 1,
 73: 3,
 74: 1,
 75: 14,
 76: 1,
 77: 1,
 78: 1,
 79: 1,
 80: 1,
 81: 1,
 82: 1,
 83: 1,
 84: 1,
 85: 1,
 86: 5,
 87: 7,
 88: 1,
 89: 1,
 90: 1,
 91: 2,
 92: 1,
 93: 1,
 94: 1,
 95: 1,
 96: 4,
 97: 1,
 98: 1,
 99: 1,
 100: 1,
 101: 1,
 102: 1,
 103: 2,
 104: 8,
 105: 3,
 106: 1,
 107: 3,
 108: 2,
 109: 2,
 110: 1,
 111: 2,
 112: 2,
 113: 1,
 114: 2,
 115: 1,
 116: 8,
 117: 3,
 118: 1,
 119: 1,
 120: 1,
 121

In [14]:
# Inspect the parsed [word_id, count] pairs of the first document, for
# comparison with the first row of list_of_rows.
words[0]

[[0, 1],
 [1, 1],
 [2, 1],
 [3, 2],
 [4, 1],
 [5, 3],
 [6, 1],
 [7, 28],
 [8, 6],
 [9, 1],
 [10, 1],
 [11, 2],
 [12, 37],
 [13, 3],
 [14, 8],
 [15, 1],
 [16, 1],
 [17, 4],
 [18, 1],
 [19, 1],
 [20, 1],
 [21, 1],
 [22, 1],
 [23, 1],
 [24, 1],
 [25, 1],
 [26, 1],
 [27, 1],
 [28, 1],
 [29, 4],
 [30, 1],
 [31, 1],
 [32, 1],
 [33, 1],
 [34, 1],
 [35, 1],
 [36, 1],
 [37, 1],
 [38, 1],
 [39, 1],
 [40, 1],
 [41, 1],
 [42, 1],
 [43, 1],
 [44, 1],
 [45, 2],
 [46, 1],
 [47, 1],
 [48, 2],
 [49, 1],
 [50, 2],
 [51, 6],
 [52, 1],
 [53, 2],
 [54, 1],
 [55, 3],
 [56, 4],
 [57, 1],
 [58, 1],
 [59, 1],
 [60, 1],
 [61, 1],
 [62, 1],
 [63, 5],
 [64, 6],
 [65, 1],
 [66, 1],
 [67, 1],
 [68, 1],
 [69, 3],
 [70, 1],
 [71, 1],
 [72, 1],
 [73, 3],
 [74, 1],
 [75, 14],
 [76, 1],
 [77, 1],
 [78, 1],
 [79, 1],
 [80, 1],
 [81, 1],
 [82, 1],
 [83, 1],
 [84, 1],
 [85, 1],
 [86, 5],
 [87, 7],
 [88, 1],
 [89, 1],
 [90, 1],
 [91, 2],
 [92, 1],
 [93, 1],
 [94, 1],
 [95, 1],
 [96, 4],
 [97, 1],
 [98, 1],
 [99, 1],
 [100, 