# TO_CSV - Converts interaction logs into frequency histograms

* Runs on individual user interactions and generates a CSV with the frequency of encountering a word
* Pulls data from `../data/Dataset_x/Documents/tfidf_dx.json` and `../data/Dataset_x/User Interactions/`
* Outputs data to `../data/Dataset_x/User Words/p<PID>.csv`

In [1]:
### Import necessary modules
import json
import os
import re
import numpy as np
import csv

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

In [2]:
### Choose which dataset / participant pair this run converts
# Prefix used in each dataset's interaction-log file names; entry k belongs
# to dataset k + 1 (it is indexed with `dataset - 1` below).
startText = ['Arms', 'Terrorist', 'Disappearance','Panda']
dataset = 4
PID = 8


# Older layout, kept for reference:
# filename = 'newData/Dataset_' + dataset + '/' + PID + '.json'

# Interaction log of the selected participant.
filename = f'../data/Dataset_{dataset}/User Interactions/{startText[dataset - 1]}_P{PID}_InteractionsLogs.json'
# TF-IDF data for the documents of the selected dataset.
tfidf = f'../data/Dataset_{dataset}/Documents/tfidf_d{dataset}.json'

In [10]:
### Open file and pre-process data
# Load the TF-IDF data for the dataset. The context manager closes the file
# even when JSON parsing fails (the handle used to be left open).
with open(tfidf) as f:
    TFIDFVecs = json.load(f)

# The log-processing step looks a document up in `titles` and takes the entry
# at the same position in `histograms`; `vocabulary` is the word list used to
# build vectors for text events and the CSV header.
histograms = TFIDFVecs['histograms']
vocabulary = TFIDFVecs['vocabulary']
titles = TFIDFVecs['titles']

# Load the participant's interaction log (a list of event dicts, see the
# preview printed below).
# todo: if I access the original, can I do it without ['data']?
with open(filename) as logFile:
    userData = json.load(logFile)
print("first 3 logged events:\n",userData[:3])

# Interaction types that refer to a document (looked up by event['id']).
docTypes = ['Doc_open', 'Reading']
# docTypes = ['Reading']
# Interaction types that carry free text (read from event['text']).
createTypes = ['Search', 'Highlight', 'Connection','Create note', 'Add note']

first 3 logged events:
 [{'time': 0.0, 'video_timecode': '--', 'action': 'keepAlive', 'action_details': '{"eventStartTime": 1646062069517,}', 'eventStartTime': '1646062069517', 'interactionType': 'keepAlive'}, {'time': 27070.0, 'video_timecode': '0:00:27.01', 'action': 'query', 'action_details': '{"query_str": "pandas", "date_limits": [null, null], "sort": {"order": "desc"},}', 'query_str': 'pandas', 'date_limits': '[null, null]', 'sort': '{"order": "desc"}', 'interactionType': 'Search', 'text': 'pandas'}, {'time': 34360.0, 'video_timecode': '0:00:34.08', 'action': 'selectedDateOnHistogram', 'interactionType': 'selectedDateOnHistogram'}]


In [11]:
### Process logs
# Build one output row per relevant logged event:
#   * events whose type is in docTypes reuse the histogram stored for the
#     document named by event['id'];
#   * events whose type is in createTypes get a word-frequency vector over
#     `vocabulary`, computed from event['text'].
# outVecs[i] is the row and outTimes[i] the matching event['time'].
outVecs = []
outTimes = []
numFailed = 0  # titles + words that could not be resolved
counter = 0

# Resolve titles / words through dicts built once instead of scanning the
# lists for every event. setdefault keeps the FIRST position of a duplicate,
# which is what list.index() returned.
titlePos = {}
for pos, title in enumerate(titles):
    titlePos.setdefault(title, pos)
vocabPos = {}
for pos, vocabWord in enumerate(vocabulary):
    vocabPos.setdefault(vocabWord, pos)

for event in userData:
    if event['interactionType'] in docTypes:
        docTitle = event['id']
        docTime = event['time']
        docPos = titlePos.get(docTitle)
        if docPos is not None:
            outVecs.append(histograms[docPos])
            outTimes.append(docTime)
        else:
            print("XX- Cannot find Title in dataset: "+str(docTitle))
            numFailed = numFailed + 1
    if event['interactionType'] in createTypes:
        docTime = event['time']
        # Lower-case the words before checking them against the vocabulary.
        words = [word.lower() for word in re.findall(r'\w+', event['text'])]
        # Float vector: an int vector silently truncated the normalised
        # frequencies (e.g. 1/2 became 0).
        tempVec = np.zeros(len(vocabulary), dtype=float)
        numFound = 0
        for word in words:
            wordPos = vocabPos.get(word)
            if wordPos is None:
                # Report every unknown word, not only those seen before the
                # first match in the event.
                print("XX- Cannot find word in TFIDF vocabulary: "+str(word)+"\t moving on...")
                numFailed = numFailed + 1
                continue
            # Increment the frequency of finding that word.
            tempVec[wordPos] += 1
            numFound = numFound + 1
        # Emit exactly one row per event, and only when at least one word was
        # recognised. Normalisation happens once, after all words are counted;
        # the denominator is every word in the event (recognised or not).
        # NOTE(review): confirm this denominator matches how `histograms`
        # are normalised.
        if numFound > 0:
            tempVec /= len(words)
            outVecs.append(tempVec.tolist())
            outTimes.append(docTime)
#should print 0
print("failed to find " + str(numFailed) + " references.")


XX- Cannot find Title in dataset: dsbz-H4BuKWENpHHl97U
XX- Cannot find Title in dataset: dsbz-H4BuKWENpHHl97U
XX- Cannot find Title in dataset: dsbz-H4BuKWENpHHl97U
XX- Cannot find Title in dataset: 1cb3-H4BuKWENpHH1Puv
XX- Cannot find Title in dataset: L8bz-H4BuKWENpHHPdyv
XX- Cannot find Title in dataset: dsbz-H4BuKWENpHHl97U
XX- Cannot find Title in dataset: 9sb3-H4BuKWENpHHlPny
XX- Cannot find Title in dataset: 9sb3-H4BuKWENpHHlPny
XX- Cannot find Title in dataset: 1Mb1-H4BuKWENpHHlOtp
XX- Cannot find Title in dataset: 1Mb1-H4BuKWENpHHlOtp
XX- Cannot find Title in dataset: f8b4-H4BuKWENpHHDf2i
XX- Cannot find Title in dataset: L8b1-H4BuKWENpHHV-qL
XX- Cannot find Title in dataset: L8b1-H4BuKWENpHHV-qL
XX- Cannot find Title in dataset: L8b1-H4BuKWENpHHV-qL
XX- Cannot find Title in dataset: L8b1-H4BuKWENpHHV-qL
XX- Cannot find Title in dataset: Asb1-H4BuKWENpHHT-pu
XX- Cannot find word in TFIDF vocabulary: nugent	 moving on...
XX- Cannot find word in TFIDF vocabulary: nugent	 moving 

In [12]:
### Export data to a set of word files with the frequency of encountering each word.
# One CSV per participant: the first column is the interaction time, followed
# by one column per vocabulary word.
outFilename = '../data/Dataset_' + str(dataset) + '/User Words/p' + str(PID) + '.csv'
os.makedirs(os.path.dirname(outFilename), exist_ok=True)

header = ['intTime', *vocabulary]

# The context manager flushes and closes the file even if a write fails.
# newline='' is what the csv module requires of files it writes to.
with open(outFilename, 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)
    # outTimes and outVecs are filled in lockstep, one entry per exported event.
    for intTime, vec in zip(outTimes, outVecs):
        writer.writerow([intTime, *vec])


# outFilename = 'newCSV/Dataset_' + dataset + '/' + PID + '.csv'
# f = open(outFilename, 'w', newline='')

# writer = csv.writer(f)
# # header = vocabulary.copy()
# # header.insert(0, 'intTime')
# # writer.writerow(header)

# for i in range(len(outVecs)):
#     row = outVecs[i].copy()
#     row.insert(0, outTimes[i])
#     writer.writerow(row)
# f.close()