Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
70 lines (54 sloc) 1.57 KB
import csv
import glob
import os
import re
import sys
from collections import Counter

import stemparse as sp
# Build the glob pattern that matches every test .txt file in the
# "Practice" directory located next to this script.
base_path = os.path.abspath(os.path.dirname(__file__))
txt_path = os.path.join(base_path, "Practice", "*.txt")

print("Opening totalsstem.csv")
# The csv module expects a text file opened with newline="" on Python 3,
# while Python 2's open() has no `newline` argument. The legacy 'U' flag is
# rejected by Python 3.11+, so it is only used on Python 2.
if sys.version_info[0] >= 3:
    totals_file = open("totalsstem.csv", "r", newline="")
else:
    totals_file = open("totalsstem.csv", "rU")
with totals_file as f:  # opens PW file
    # Every CSV record becomes a list of strings; the script later passes
    # the first record (totals[0]) to makeMatrix as the vocabulary.
    totals = [list(rec) for rec in csv.reader(f, delimiter=',')]
# Takes in all the words and makes a matrix of the frequency of words in each document.
def makeMatrix(all_words, pattern=None):
    """Build a word-frequency matrix with one row per matching text file.

    Args:
        all_words: Iterable of vocabulary words. A sorted copy is used, so
            the caller's sequence is not modified.
        pattern: Optional glob pattern selecting the files to process.
            When omitted, the module-level ``txt_path`` is used.

    Returns:
        A list of rows. The first row is the sorted vocabulary followed by
        the literal "FILE". Each following row holds, aligned with the
        header columns, the count of every vocabulary word found in one
        file, followed by that file's name.
    """
    if pattern is None:
        pattern = txt_path
    vocabulary = sorted(all_words)
    vocab_size = len(vocabulary)
    matrix = [vocabulary + ["FILE"]]
    # Compiled once: the same pattern is applied to every file name.
    name_re = re.compile(r'[0-9]+\.txt')
    for iteration, path in enumerate(glob.glob(pattern), 1):
        row = [0] * (vocab_size + 1)
        counts = Counter(sp.parse(path))
        for word, count in counts.items():
            # sp.binary_search lives in stemparse (not shown here); negative
            # results are treated as "word not in the vocabulary".
            # NOTE(review): presumably it returns the index of `word` in the
            # sorted vocabulary - confirm against stemparse.
            pos = sp.binary_search(vocabulary, word, 0, vocab_size)
            if pos > -1:
                row[pos] = count
        # Extracts file name (Ex: "123.txt"). Only the base name is searched
        # so digits in a directory component cannot match, and the plain
        # base name is used when the file name has no "<digits>.txt" part.
        base_name = os.path.basename(path)
        name_match = name_re.search(base_name)
        row[-1] = name_match.group() if name_match else base_name
        matrix.append(row)
        # Report the file actually being processed, not the glob pattern.
        print(path[-20:] + " on iteration " + str(iteration))
    return matrix
print("Starting to make word matrix.")
finalmat = makeMatrix(totals[0])

print("Making proportion matrix")
# Convert each data row's raw word counts, in place, into proportions of
# that file's total word count. Rows whose counts sum to zero are left
# untouched so no division by zero occurs.
for rownum, row in enumerate(finalmat, 1):
    print("Making proportions for file: " + str(rownum))
    if rownum == 1:
        # The first row is the header (vocabulary + "FILE"), not counts.
        continue
    # Every column except the trailing file name is a word count. The
    # previous slice stopped one column early, so the last vocabulary word
    # was left out of the total and never normalised.
    counts = [int(x) for x in row[:-1]]
    totalcount = sum(counts)
    if totalcount != 0:
        # The 1.0 factor forces true division on Python 2 as well.
        row[:-1] = [val / (totalcount * 1.0) for val in counts]

# The csv module needs a binary-mode file on Python 2 and a text-mode file
# opened with newline="" on Python 3 (writing to a "wb" file fails there).
if sys.version_info[0] >= 3:
    out_file = open("teststemprop.csv", "w", newline="")
else:
    out_file = open("teststemprop.csv", "wb")
with out_file as f:
    writer = csv.writer(f)
    writer.writerows(finalmat)
print("Script complete.")