In [8]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import datetime
import csv

In [9]:
# Load the synthetic user-trip records into a DataFrame and display it.
dataset_df = pd.read_csv(filepath_or_buffer="Synthetic User Trips Data.csv")
dataset_df

Unnamed: 0,START_DATE*,END_DATE*,START*,STOP*,MILES*,PLACE_TYPE*,USER*
0,1/1/2016 21:11,1/1/2016 21:17,Fort Pierce,Fort Pierce,5.1,BAR,Courtney Jones
1,1/2/2016 1:25,1/2/2016 1:37,Fort Pierce,Fort Pierce,5.0,OTHERS,Traci Braun
2,1/2/2016 20:25,1/2/2016 20:38,Fort Pierce,Fort Pierce,4.8,OTHERS,Margaret Brock
3,1/5/2016 17:31,1/5/2016 17:45,Fort Pierce,Fort Pierce,4.7,SHOPPING,William Walker
4,1/6/2016 14:42,1/6/2016 15:49,Fort Pierce,West Palm Beach,63.7,RESTAURANT,Jeremy Price
...,...,...,...,...,...,...,...
1150,12/31/2016 1:07,12/31/2016 1:14,Kar?chi,Kar?chi,0.7,RESTAURANT,Amber Young
1151,12/31/2016 13:24,12/31/2016 13:42,Kar?chi,Unknown Location,3.9,SHOPPING,Rose Cruz
1152,12/31/2016 15:03,12/31/2016 15:38,Unknown Location,Unknown Location,16.2,SHOPPING,Mark Rodriguez DDS
1153,12/31/2016 21:32,12/31/2016 21:50,Katunayake,Gampaha,6.4,RESTAURANT,William Walker


In [10]:
# Time-of-day classes: the day is cut into eight consecutive 3-hour windows.
# Class k runs from (3*k):00 through (3*k + 2):59, both ends inclusive:
# 00:00-02:59 -> 0, 03:00-05:59 -> 1, 06:00-08:59 -> 2, ..., 21:00-23:59 -> 7
time_classes = {
    class_id: [datetime.time(3 * class_id, 0), datetime.time(3 * class_id + 2, 59)]
    for class_id in range(8)
}


def get_time_in_classes(complete_time):
    """Map a timestamp string to its time-of-day class.

    Parameters
    ----------
    complete_time : str
        Timestamp formatted as ``month/day/year hour:minute`` (24-hour clock),
        e.g. ``"1/6/2016 14:42"``. The date part is parsed but ignored.

    Returns
    -------
    int
        Key of the first ``time_classes`` window that contains the time of
        day, or ``-1`` when no window contains it.

    Raises
    ------
    ValueError
        If ``complete_time`` does not match the ``"%m/%d/%Y %H:%M"`` format.
    """
    time_part = datetime.datetime.strptime(complete_time, "%m/%d/%Y %H:%M").time()
    for class_id, (window_start, window_end) in time_classes.items():
        # Both bounds are inclusive.
        if window_start <= time_part <= window_end:
            return class_id
    return -1

In [11]:
# Derive a time-of-day class column from each timestamp column.
for timestamp_col, class_col in (
    ('START_DATE*', 'START_TIME_CLASS*'),
    ('END_DATE*', 'END_TIME_CLASS*'),
):
    dataset_df[class_col] = dataset_df[timestamp_col].apply(get_time_in_classes)
dataset_df

Unnamed: 0,START_DATE*,END_DATE*,START*,STOP*,MILES*,PLACE_TYPE*,USER*,START_TIME_CLASS*,END_TIME_CLASS*
0,1/1/2016 21:11,1/1/2016 21:17,Fort Pierce,Fort Pierce,5.1,BAR,Courtney Jones,7,7
1,1/2/2016 1:25,1/2/2016 1:37,Fort Pierce,Fort Pierce,5.0,OTHERS,Traci Braun,0,0
2,1/2/2016 20:25,1/2/2016 20:38,Fort Pierce,Fort Pierce,4.8,OTHERS,Margaret Brock,6,6
3,1/5/2016 17:31,1/5/2016 17:45,Fort Pierce,Fort Pierce,4.7,SHOPPING,William Walker,5,5
4,1/6/2016 14:42,1/6/2016 15:49,Fort Pierce,West Palm Beach,63.7,RESTAURANT,Jeremy Price,4,5
...,...,...,...,...,...,...,...,...,...
1150,12/31/2016 1:07,12/31/2016 1:14,Kar?chi,Kar?chi,0.7,RESTAURANT,Amber Young,0,0
1151,12/31/2016 13:24,12/31/2016 13:42,Kar?chi,Unknown Location,3.9,SHOPPING,Rose Cruz,4,4
1152,12/31/2016 15:03,12/31/2016 15:38,Unknown Location,Unknown Location,16.2,SHOPPING,Mark Rodriguez DDS,5,5
1153,12/31/2016 21:32,12/31/2016 21:50,Katunayake,Gampaha,6.4,RESTAURANT,William Walker,7,7


In [12]:
def get_interest(df):
    """Build the interest label for one trip record.

    ``df`` is a single record (it is applied row-wise below), indexed by
    column name. The label is the record's ``PLACE_TYPE*`` followed by a
    space and the string form of its ``END_TIME_CLASS*``, e.g. ``"BAR 7"``.
    """
    place_type = df['PLACE_TYPE*']
    end_class_text = str(df['END_TIME_CLASS*'])
    return place_type + " " + end_class_text

In [13]:
# Compute one interest label per trip (row-wise), then attach it as a column.
interest_labels = dataset_df.apply(get_interest, axis="columns")
dataset_df['INTEREST*'] = interest_labels
dataset_df

Unnamed: 0,START_DATE*,END_DATE*,START*,STOP*,MILES*,PLACE_TYPE*,USER*,START_TIME_CLASS*,END_TIME_CLASS*,INTEREST*
0,1/1/2016 21:11,1/1/2016 21:17,Fort Pierce,Fort Pierce,5.1,BAR,Courtney Jones,7,7,BAR 7
1,1/2/2016 1:25,1/2/2016 1:37,Fort Pierce,Fort Pierce,5.0,OTHERS,Traci Braun,0,0,OTHERS 0
2,1/2/2016 20:25,1/2/2016 20:38,Fort Pierce,Fort Pierce,4.8,OTHERS,Margaret Brock,6,6,OTHERS 6
3,1/5/2016 17:31,1/5/2016 17:45,Fort Pierce,Fort Pierce,4.7,SHOPPING,William Walker,5,5,SHOPPING 5
4,1/6/2016 14:42,1/6/2016 15:49,Fort Pierce,West Palm Beach,63.7,RESTAURANT,Jeremy Price,4,5,RESTAURANT 5
...,...,...,...,...,...,...,...,...,...,...
1150,12/31/2016 1:07,12/31/2016 1:14,Kar?chi,Kar?chi,0.7,RESTAURANT,Amber Young,0,0,RESTAURANT 0
1151,12/31/2016 13:24,12/31/2016 13:42,Kar?chi,Unknown Location,3.9,SHOPPING,Rose Cruz,4,4,SHOPPING 4
1152,12/31/2016 15:03,12/31/2016 15:38,Unknown Location,Unknown Location,16.2,SHOPPING,Mark Rodriguez DDS,5,5,SHOPPING 5
1153,12/31/2016 21:32,12/31/2016 21:50,Katunayake,Gampaha,6.4,RESTAURANT,William Walker,7,7,RESTAURANT 7


In [14]:
# Keep only the two columns the rating step needs: who travelled, and the interest label.
dataset_df = dataset_df.loc[:, ['USER*', 'INTEREST*']]
dataset_df

Unnamed: 0,USER*,INTEREST*
0,Courtney Jones,BAR 7
1,Traci Braun,OTHERS 0
2,Margaret Brock,OTHERS 6
3,William Walker,SHOPPING 5
4,Jeremy Price,RESTAURANT 5
...,...,...
1150,Amber Young,RESTAURANT 0
1151,Rose Cruz,SHOPPING 4
1152,Mark Rodriguez DDS,SHOPPING 5
1153,William Walker,RESTAURANT 7


In [15]:
# RATING* = number of trips a user made for a given interest label.
dataset_df_groups = dataset_df.groupby(['USER*', 'INTEREST*'])
rated_dataset_df = (
    dataset_df_groups
    .size()
    .to_frame(name="RATING*")
    .reset_index()
)
rated_dataset_df

Unnamed: 0,USER*,INTEREST*,RATING*
0,Aaron Webb,BAR 5,1
1,Aaron Webb,BAR 7,1
2,Aaron Webb,GYM 3,1
3,Aaron Webb,GYM 5,2
4,Aaron Webb,GYM 6,2
...,...,...,...
866,William Walker,RESTAURANT 5,1
867,William Walker,RESTAURANT 7,1
868,William Walker,SHOPPING 3,3
869,William Walker,SHOPPING 5,1


In [40]:
# rated_dataset_df.to_csv('Rated User Interests.csv', index=False)

## Get interests dataset

In [1]:
# Place types, copied from the synthetic dataset generator notebook.
# The order matters: the interests dataset below is built by iterating this
# list in order, so an entry's position determines its INTEREST_ID* values.
PLACE_TYPE = [
    "RESTAURANT",
    "BAR",
    "GYM",
    "OFFICE",
    "PLAY ARENA",
    "SHOPPING",
    "OTHERS"
]

In [16]:
# Every (place type, time class) pair is one interest; its position in the
# enumeration (place types in list order, time classes 0-7 within each)
# becomes its INTEREST_ID*.
interest_names = [
    place + " " + str(time_class)
    for place in PLACE_TYPE
    for time_class in range(0, 8)
]
interests_dataset = [
    {"INTEREST_ID*": interest_id, "INTEREST*": interest_name}
    for interest_id, interest_name in enumerate(interest_names)
]
interests_df = pd.DataFrame(interests_dataset)
interests_df

Unnamed: 0,INTEREST_ID*,INTEREST*
0,0,RESTAURANT 0
1,1,RESTAURANT 1
2,2,RESTAURANT 2
3,3,RESTAURANT 3
4,4,RESTAURANT 4
5,5,RESTAURANT 5
6,6,RESTAURANT 6
7,7,RESTAURANT 7
8,8,BAR 0
9,9,BAR 1


In [17]:
# Persist the interest lookup table (ID <-> label) without the DataFrame index.
interests_df.to_csv(path_or_buf='Interests Dataset.csv', index=False)

### Map interest to interest id in rated dataset

In [20]:
# Lookup table: interest label -> INTEREST_ID*.
interest_to_interest_id_mapper = {
    interest_name: interest_id
    for interest_name, interest_id in zip(interests_df['INTEREST*'], interests_df['INTEREST_ID*'])
}
interest_to_interest_id_mapper

{'BAR 0': 8,
 'BAR 1': 9,
 'BAR 2': 10,
 'BAR 3': 11,
 'BAR 4': 12,
 'BAR 5': 13,
 'BAR 6': 14,
 'BAR 7': 15,
 'GYM 0': 16,
 'GYM 1': 17,
 'GYM 2': 18,
 'GYM 3': 19,
 'GYM 4': 20,
 'GYM 5': 21,
 'GYM 6': 22,
 'GYM 7': 23,
 'OFFICE 0': 24,
 'OFFICE 1': 25,
 'OFFICE 2': 26,
 'OFFICE 3': 27,
 'OFFICE 4': 28,
 'OFFICE 5': 29,
 'OFFICE 6': 30,
 'OFFICE 7': 31,
 'OTHERS 0': 48,
 'OTHERS 1': 49,
 'OTHERS 2': 50,
 'OTHERS 3': 51,
 'OTHERS 4': 52,
 'OTHERS 5': 53,
 'OTHERS 6': 54,
 'OTHERS 7': 55,
 'PLAY ARENA 0': 32,
 'PLAY ARENA 1': 33,
 'PLAY ARENA 2': 34,
 'PLAY ARENA 3': 35,
 'PLAY ARENA 4': 36,
 'PLAY ARENA 5': 37,
 'PLAY ARENA 6': 38,
 'PLAY ARENA 7': 39,
 'RESTAURANT 0': 0,
 'RESTAURANT 1': 1,
 'RESTAURANT 2': 2,
 'RESTAURANT 3': 3,
 'RESTAURANT 4': 4,
 'RESTAURANT 5': 5,
 'RESTAURANT 6': 6,
 'RESTAURANT 7': 7,
 'SHOPPING 0': 40,
 'SHOPPING 1': 41,
 'SHOPPING 2': 42,
 'SHOPPING 3': 43,
 'SHOPPING 4': 44,
 'SHOPPING 5': 45,
 'SHOPPING 6': 46,
 'SHOPPING 7': 47}

In [22]:
def find_interest_id_from_interest(interest):
    """Return the INTEREST_ID* registered for an interest label.

    Looks ``interest`` up in ``interest_to_interest_id_mapper``; a label that
    is not a key of that mapping raises ``KeyError``.
    """
    interest_id = interest_to_interest_id_mapper[interest]
    return interest_id

In [23]:
# Translate each interest label into its numeric ID (unknown labels raise KeyError).
interest_ids = rated_dataset_df['INTEREST*'].apply(find_interest_id_from_interest)
rated_dataset_df['INTEREST_ID*'] = interest_ids
rated_dataset_df

Unnamed: 0,USER*,INTEREST*,RATING*,INTEREST_ID*
0,Aaron Webb,BAR 5,1,13
1,Aaron Webb,BAR 7,1,15
2,Aaron Webb,GYM 3,1,19
3,Aaron Webb,GYM 5,2,21
4,Aaron Webb,GYM 6,2,22
...,...,...,...,...
866,William Walker,RESTAURANT 5,1,5
867,William Walker,RESTAURANT 7,1,7
868,William Walker,SHOPPING 3,3,43
869,William Walker,SHOPPING 5,1,45


In [24]:
# User names, copied from the synthetic dataset generator notebook.
# The order matters: the users dataset below assigns USER_ID* by position in
# this list. The name -> ID lookup built from it is indexed directly, so a
# USER* value in the rated data that is missing here raises KeyError.
USERS = [
    "Erika Brown",
    "John Peters",
    "Mark Rodriguez DDS",
    "Alexandra Lewis",
    "Michael Mckinney",
    "David Whitaker",
    "Steven Jones",
    "Patricia Marshall",
    "Rachel Williams",
    "Michelle Day",
    "John Medina",
    "Cynthia Campbell",
    "William Walker",
    "Kylie Gordon",
    "Margaret Brock",
    "Alexis Barry",
    "Richard Reid",
    "Kelly Torres",
    "Maria Tran",
    "David Hartman",
    "Heather Maxwell",
    "Amber Young",
    "Aaron Webb",
    "Nancy Brennan",
    "Heather Mcguire",
    "Brendan Rivera",
    "Elizabeth Gross",
    "David Rodriguez",
    "Samantha Coleman",
    "Courtney Jones",
    "Tracy Hanna",
    "Paul Smith",
    "Traci Braun",
    "Rose Cruz",
    "Ryan Barnes",
    "Sophia Hernandez",
    "David Patel",
    "Alexis Wang",
    "Jeremy Price",
    "William Jennings",
    "Sarah Peck",
    "Lance Chan",
    "Troy Stewart",
    "Alexandria Barrett",
    "George Thomas Jr.",
    "Chad Davis",
    "Wesley Wilson",
    "Lynn Elliott",
    "Matthew Russell",
    "Nicole Garrett"
]

In [25]:
# Each user's position in USERS becomes their USER_ID*.
users_dataset = [
    {"USER_ID*": user_id, "USER*": user_name}
    for user_id, user_name in enumerate(USERS)
]
users_df = pd.DataFrame(users_dataset)
users_df

Unnamed: 0,USER_ID*,USER*
0,0,Erika Brown
1,1,John Peters
2,2,Mark Rodriguez DDS
3,3,Alexandra Lewis
4,4,Michael Mckinney
5,5,David Whitaker
6,6,Steven Jones
7,7,Patricia Marshall
8,8,Rachel Williams
9,9,Michelle Day


In [27]:
# Persist the user lookup table (ID <-> name) without the DataFrame index.
users_df.to_csv(path_or_buf='Users Dataset.csv', index=False)

In [29]:
# Lookup table: user name -> USER_ID*.
user_to_user_id_mapper = {
    user_name: user_id
    for user_name, user_id in zip(users_df['USER*'], users_df['USER_ID*'])
}
user_to_user_id_mapper

{'Aaron Webb': 22,
 'Alexandra Lewis': 3,
 'Alexandria Barrett': 43,
 'Alexis Barry': 15,
 'Alexis Wang': 37,
 'Amber Young': 21,
 'Brendan Rivera': 25,
 'Chad Davis': 45,
 'Courtney Jones': 29,
 'Cynthia Campbell': 11,
 'David Hartman': 19,
 'David Patel': 36,
 'David Rodriguez': 27,
 'David Whitaker': 5,
 'Elizabeth Gross': 26,
 'Erika Brown': 0,
 'George Thomas Jr.': 44,
 'Heather Maxwell': 20,
 'Heather Mcguire': 24,
 'Jeremy Price': 38,
 'John Medina': 10,
 'John Peters': 1,
 'Kelly Torres': 17,
 'Kylie Gordon': 13,
 'Lance Chan': 41,
 'Lynn Elliott': 47,
 'Margaret Brock': 14,
 'Maria Tran': 18,
 'Mark Rodriguez DDS': 2,
 'Matthew Russell': 48,
 'Michael Mckinney': 4,
 'Michelle Day': 9,
 'Nancy Brennan': 23,
 'Nicole Garrett': 49,
 'Patricia Marshall': 7,
 'Paul Smith': 31,
 'Rachel Williams': 8,
 'Richard Reid': 16,
 'Rose Cruz': 33,
 'Ryan Barnes': 34,
 'Samantha Coleman': 28,
 'Sarah Peck': 40,
 'Sophia Hernandez': 35,
 'Steven Jones': 6,
 'Traci Braun': 32,
 'Tracy Hanna': 30,
 'Troy Stewart': 42,
 'Wesley Wilson': 46,
 'William Jennings': 39,
 'William Walker': 12}

In [32]:
def find_user_id_from_user(user):
    """Return the USER_ID* registered for a user name.

    Looks ``user`` up in ``user_to_user_id_mapper``; a name that is not a key
    of that mapping raises ``KeyError``.
    """
    user_id = user_to_user_id_mapper[user]
    return user_id

In [33]:
# Translate each user name into its numeric ID (unknown names raise KeyError).
user_ids = rated_dataset_df['USER*'].apply(find_user_id_from_user)
rated_dataset_df['USER_ID*'] = user_ids
rated_dataset_df

Unnamed: 0,USER*,INTEREST*,RATING*,INTEREST_ID*,USER_ID*
0,Aaron Webb,BAR 5,1,13,22
1,Aaron Webb,BAR 7,1,15,22
2,Aaron Webb,GYM 3,1,19,22
3,Aaron Webb,GYM 5,2,21,22
4,Aaron Webb,GYM 6,2,22,22
...,...,...,...,...,...
866,William Walker,RESTAURANT 5,1,5,12
867,William Walker,RESTAURANT 7,1,7,12
868,William Walker,SHOPPING 3,3,43,12
869,William Walker,SHOPPING 5,1,45,12


In [35]:
# The numeric IDs now stand in for the text columns, so drop the names/labels.
rated_dataset_df = rated_dataset_df.drop(columns=['USER*', 'INTEREST*'])
rated_dataset_df

Unnamed: 0,RATING*,INTEREST_ID*,USER_ID*
0,1,13,22
1,1,15,22
2,1,19,22
3,2,21,22
4,2,22,22
...,...,...,...
866,1,5,12
867,1,7,12
868,3,43,12
869,1,45,12


In [38]:
# Persist the final (RATING*, INTEREST_ID*, USER_ID*) table without the DataFrame index.
rated_dataset_df.to_csv(path_or_buf='Rated User Interests.csv', index=False)