# Generating plots for `project00` phase2

### 1. Import libraries

In [27]:
import os
import sys
import nltk
import string
import operator
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import datetime
from datetime import timedelta
from nltk.corpus import wordnet
import matplotlib.pyplot as plt
from collections import defaultdict
from matplotlib.pyplot import figure

### 2. Read the data

In [83]:
# change the working directory
os.chdir('/Users/akhil/code/csci627-project00-akhilpandey95/data/')

# read the csv into a dataframe
data = pd.read_csv('phase_two_z1835018.csv')

# print the head of the data
data.head()

Unnamed: 0,Day,Time In,Time Out,Location,Reason,User
0,2/4/19,11:00,15:55,Faraday Hall,Work,1
1,2/4/19,16:00,16:25,Founders Memorial Library,coffee,1
2,2/4/19,16:30,16:45,Faraday Hall,grab laptop,1
3,2/4/19,16:50,18:20,Computer Science Building,class,1
4,2/4/19,18:25,18:30,Faraday Hall,grab backpack,1


### 3. Preprocessing the dataframe

#### 3.1 Functions for adding value to the dataframe

In [43]:
# function for getting the total value from a column
perform_map = lambda x, y: sum(list(map(x, y)))

# function for extracting time information from
# the screentime column and return the time in hours
get_minutes = lambda x: int(x.split(":")[0]) * 60 + int(x.split(":")[1])

# function for identifying the number of hours between
# two given timestamps
def get_time(time_string_a, time_string_b):
    # set the time format
    FMT = '%H:%M'
    
    # create the timedelta calculating the difference between the
    # two time strings
    tdelta = datetime.strptime(time_string_b, FMT) - datetime.strptime(time_string_a, FMT)
    
    # if timedelta gives us a negative result then we have to
    # take then account
    if tdelta.days < 0:
        # create the time delta
        tdelta = timedelta(days=0, seconds=tdelta.seconds, microseconds=tdelta.microseconds)
        
        # separate the hours and minutes and create a string
        total_seconds = tdelta.total_seconds()
        h = total_seconds//3600
        m = (total_seconds%3600) // 60
        
        # return the value in hours
        return round(h + m/60, 2)
    else:
        # separate the hours and minutes and create a string
        secs = tdelta.total_seconds()
        h_2 = secs//3600
        m_2 = (secs%3600) // 60
        
        # return the value in hours
        return round(h_2 + m_2/60, 2)

# function for changing the pos tag
def change_pos_tag(entity):
    if entity.startswith('N'):
        return 'n'

    if entity.startswith('V'):
        return 'v'

    if entity.startswith('J'):
        return 'a'

    if entity.startswith('R'):
        return 'r'

    return None

# function for identifying the synset
def indentify_synset(word, tag):
    word_net_tag = change_pos_tag(tag)
    if word_net_tag is None:
        return None

    return wordnet.synsets(word, word_net_tag)[0]

# function for generating a sim score from 0-1
def gen_similarity_score(sentence_a, sentence_b):
    # Init the variables
    similarity_score, simulation_count = 0.0, 0

    # POS tag and takenize the given sentences
    sentence_a = nltk.pos_tag(nltk.word_tokenize(sentence_a))
    sentence_b = nltk.pos_tag(nltk.word_tokenize(sentence_b))

    # Capture the synsets from the sentence and pass the unecessary args
    # into a new empty dict
    synset_of_senta = [indentify_synset(word, tag) for word,tag in sentence_a]
    synset_of_sentb = [indentify_synset(word, tag) for word,tag in sentence_b]

    synset_of_senta = [s for s in synset_of_senta if s]
    synset_of_sentb = [s for s in synset_of_sentb if s]

    for synset in synset_of_senta:
        efficiency = max([synset.path_similarity(s) for s in synset_of_sentb])

        if efficiency is not None:
            similarity_score += efficiency
            simulation_count += 1

    similarity_score /= simulation_count
    return similarity_score

# function for assigning a label to a reason
def attach_reason(text):
    # check the score for each label
    # label 1 - class
    # label 2 - leisure
    # label 3 - work
    # label 4 - food
    result = defaultdict(dict)
    try:
        result['label_class'] = gen_similarity_score('class', text)
        result['label_leisure'] = gen_similarity_score('leisure', text)
        result['label_work'] = gen_similarity_score('work', text)
        result['label_food'] = gen_similarity_score('food', text)
        
        # check the maximum score of the labels and assign the following label
        return max(result.items(), key=operator.itemgetter(1))[0]
    except:
        return np.nan

#### 3.2 Apply the functions and create a new column

In [86]:
# add the column that attaches a label for the reson
data = data.assign(reason_nlp_label = [attach_reason(x) for x in data['Reason']])

# add the column that attaches a time for every visit
data = data.assign(time_spent = [get_time(x, y) for x, y in 
                                       list(zip(data['Time In'], data['Time Out']))])

# print the head of the column
data.head()

Unnamed: 0,Day,Time In,Time Out,Location,Reason,User,reason_nlp_label,time_spent
0,2/4/19,11:00,15:55,Faraday Hall,Work,1,label_work,4.92
1,2/4/19,16:00,16:25,Founders Memorial Library,coffee,1,label_food,0.42
2,2/4/19,16:30,16:45,Faraday Hall,grab laptop,1,,0.25
3,2/4/19,16:50,18:20,Computer Science Building,class,1,label_class,1.5
4,2/4/19,18:25,18:30,Faraday Hall,grab backpack,1,label_food,0.08


#### 3.3 Print the counts for every location

In [85]:
data.groupby('Location').count()

Unnamed: 0_level_0,Day,Time In,Time Out,Reason,User,reason_nlp_label
Location,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Adams Hall,6,6,6,6,6,3
Anderson Hall,1,1,1,1,1,0
Campus Life Building,2,2,2,2,2,0
Chipotle,2,2,2,2,2,2
Cole Hall,12,12,12,12,12,6
Computer Science Building,110,110,110,110,110,52
Dusable Hall,4,4,4,4,4,0
Engineering Building,2,2,2,2,2,0
Faraday Hall,19,19,19,19,19,16
Founders Memorial Library,27,27,27,27,27,19


### 4. Save the file

In [87]:
# save the final dataframe
data.to_csv("phase_two_z1835018.csv", sep = ',', encoding="utf-8")