# TO_CSV - Converts interaction logs into frequency histograms

* Runs on individual files
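* Pulls interaction logs from newData/Dataset_x and the document histograms, vocabulary and titles from newData/tfidf_dx.json
* Outputs data to wordsCSV/Dataset_x (with a header row) when keepWords is True, otherwise to newCSV/Dataset_x (no header row)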

In [165]:
### Import necessary modules
import json
import re
import numpy as np
import csv

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [376]:
### Select participant and dataset
# PID picks the participant log; dataset picks the Dataset_<n> folder and the
# matching tfidf_d<n>.json file.
PID = 'p1'
dataset = '1'

# Input paths derived from the two selectors above.
filename = f'newData/Dataset_{dataset}/{PID}.json'
tfidf = f'newData/tfidf_d{dataset}.json'

# When True the export goes to wordsCSV/ and includes a header row;
# when False it goes to newCSV/ without one.
keepWords = True

In [377]:
### Open file and pre-process data
# Load the per-dataset bundle. `with` closes the handle even if parsing fails
# (the handles were previously left open).
with open(tfidf) as f:
    TFIDFVecs = json.load(f)

# Parallel structures: histograms[i] is the vector used for titles[i], and
# vocabulary supplies the column labels / word positions of every vector.
histograms = TFIDFVecs['histograms']
vocabulary = TFIDFVecs['vocabulary']
titles = TFIDFVecs['titles']

# Interaction log of the selected participant: the list stored under 'data'.
with open(filename) as logFile:
    userData = json.load(logFile)['data']

# Interaction types resolved through a document title ('ID' field).
docTypes = ['Doc_open', 'Reading']
# Interaction types that carry free text ('Text' field).
createTypes = ['Search', 'Highlight', 'Connection', 'Add Note']

In [378]:
### Process logs
# Build one vector per usable interaction, with its timestamp kept in lockstep:
#   outVecs[i]  - vector with one entry per vocabulary word
#   outTimes[i] - the 'time' value of the interaction that produced it
# numFailed counts interactions that yielded no vector (document title not in
# `titles`, or text containing no vocabulary word).
outVecs = []
outTimes = []
numFailed = 0
counter = 0  # not used in this cell; kept in case another cell reads it

# Lookup tables built once. setdefault keeps the first position of a duplicate
# entry, which is what list.index() returned, but each query is now O(1).
vocabIndex = {}
for position, word in enumerate(vocabulary):
    vocabIndex.setdefault(word, position)
titleIndex = {}
for position, title in enumerate(titles):
    titleIndex.setdefault(title, position)

wordPattern = re.compile(r'\w+')

for data in userData:
    interaction = data['InteractionType']

    if interaction in docTypes:
        # Document interactions reuse the stored histogram for that title.
        docTitle = data['ID']
        docTime = data['time']
        docPos = titleIndex.get(docTitle)
        if docPos is None:
            numFailed += 1
        else:
            outVecs.append(histograms[docPos])
            outTimes.append(docTime)

    if interaction in createTypes:
        # Text interactions become a relative word-frequency vector.
        string = data['Text']
        docTime = data['time']
        # Float dtype is required: an integer array truncates the normalised
        # values on assignment, turning almost every frequency into 0.
        tempVec = np.zeros(len(vocabulary), dtype=float)
        numAdded = 0
        for word in wordPattern.findall(string):
            wordPos = vocabIndex.get(word)
            if wordPos is not None:
                tempVec[wordPos] += 1
                numAdded += 1

        # Decide once per interaction, after all of its words are counted.
        # (Doing this inside the word loop appended a vector per word and
        # divided the counts repeatedly.)
        if numAdded == 0:
            numFailed += 1
        else:
            tempVec /= numAdded  # counts -> fractions that sum to 1
            outVecs.append(tempVec.tolist())
            outTimes.append(docTime)


In [379]:
### Export data
# keepWords selects both the destination folder and whether a header is written:
#   truthy -> wordsCSV/Dataset_<n>/<PID>.csv, first row is 'intTime' + vocabulary
#   falsy  -> newCSV/Dataset_<n>/<PID>.csv, data rows only
# Every data row is the interaction time followed by that interaction's vector.
outFolder = 'wordsCSV' if keepWords else 'newCSV'
outFilename = outFolder + '/Dataset_' + dataset + '/' + PID + '.csv'

header = vocabulary.copy()
header.insert(0, 'intTime')

# newline='' lets the csv module control line endings; `with` guarantees the
# file is flushed and closed even if a row fails to serialise.
with open(outFilename, 'w', newline='') as f:
    writer = csv.writer(f)
    if keepWords:
        writer.writerow(header)
    # outTimes and outVecs are filled in lockstep, so zip pairs them exactly.
    for intTime, vec in zip(outTimes, outVecs):
        writer.writerow([intTime, *vec])