In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import datetime
import csv

# Load the dataset

In [2]:
# Read the raw trip log from the CSV in the working directory, then display the frame.
RAW_TRIPS_CSV = "My Uber Drives - 2016.csv"
dataset_df = pd.read_csv(RAW_TRIPS_CSV)
dataset_df

Unnamed: 0,START_DATE*,END_DATE*,CATEGORY*,START*,STOP*,MILES*,PURPOSE*
0,1/1/2016 21:11,1/1/2016 21:17,Business,Fort Pierce,Fort Pierce,5.1,Meal/Entertain
1,1/2/2016 1:25,1/2/2016 1:37,Business,Fort Pierce,Fort Pierce,5.0,
2,1/2/2016 20:25,1/2/2016 20:38,Business,Fort Pierce,Fort Pierce,4.8,Errand/Supplies
3,1/5/2016 17:31,1/5/2016 17:45,Business,Fort Pierce,Fort Pierce,4.7,Meeting
4,1/6/2016 14:42,1/6/2016 15:49,Business,Fort Pierce,West Palm Beach,63.7,Customer Visit
...,...,...,...,...,...,...,...
1151,12/31/2016 13:24,12/31/2016 13:42,Business,Kar?chi,Unknown Location,3.9,Temporary Site
1152,12/31/2016 15:03,12/31/2016 15:38,Business,Unknown Location,Unknown Location,16.2,Meeting
1153,12/31/2016 21:32,12/31/2016 21:50,Business,Katunayake,Gampaha,6.4,Temporary Site
1154,12/31/2016 22:08,12/31/2016 23:51,Business,Gampaha,Ilukwatta,48.2,Temporary Site


In [3]:
# Show the column labels of the loaded frame.
dataset_df.columns

Index(['START_DATE*', 'END_DATE*', 'CATEGORY*', 'START*', 'STOP*', 'MILES*',
       'PURPOSE*'],
      dtype='object')

## Data Cleanup

In [4]:
# Discard the CATEGORY* and PURPOSE* columns; the result is rebound to dataset_df.
columns_to_discard = ["CATEGORY*", "PURPOSE*"]
dataset_df = dataset_df.drop(columns=columns_to_discard)

In [5]:
# Drop the final row of the frame, in place.
# NOTE(review): presumably the raw file ends with a non-trip summary row - confirm.
# Not idempotent: every re-run of this cell removes one more row.
final_row_label = dataset_df.index[-1:]
dataset_df.drop(index=final_row_label, inplace=True)

# Add mock columns (a random place type and a random user for each trip)

In [20]:
# Candidate labels for the mock PLACE_TYPE* column; values are drawn from this
# list at random when the column is generated.
PLACE_TYPE = [
    "RESTAURANT",
    "BAR",
    "GYM",
    "OFFICE",
    "PLAY ARENA",
    "SHOPPING",
    "OTHERS"
]

In [21]:
# Sample one place type per row, uniformly and with replacement.
# The sample size follows the frame instead of a hard-coded 1155, so there is
# always exactly one label per row. The fixed seed makes the synthetic column
# identical on every Restart & Run All.
PLACE_TYPE_SEED = 2016
random_place_type_arr = np.random.default_rng(PLACE_TYPE_SEED).choice(
    PLACE_TYPE, size=len(dataset_df)
)

In [22]:
# Attach the sampled labels positionally. Assigning the array directly (rather
# than wrapping it in a default-indexed Series) skips index alignment, so a
# length mismatch raises immediately instead of silently producing NaN or
# dropping labels.
dataset_df['PLACE_TYPE*'] = random_place_type_arr
dataset_df

Unnamed: 0,START_DATE*,END_DATE*,START*,STOP*,MILES*,PLACE_TYPE*
0,1/1/2016 21:11,1/1/2016 21:17,Fort Pierce,Fort Pierce,5.1,BAR
1,1/2/2016 1:25,1/2/2016 1:37,Fort Pierce,Fort Pierce,5.0,OTHERS
2,1/2/2016 20:25,1/2/2016 20:38,Fort Pierce,Fort Pierce,4.8,OTHERS
3,1/5/2016 17:31,1/5/2016 17:45,Fort Pierce,Fort Pierce,4.7,SHOPPING
4,1/6/2016 14:42,1/6/2016 15:49,Fort Pierce,West Palm Beach,63.7,RESTAURANT
...,...,...,...,...,...,...
1150,12/31/2016 1:07,12/31/2016 1:14,Kar?chi,Kar?chi,0.7,RESTAURANT
1151,12/31/2016 13:24,12/31/2016 13:42,Kar?chi,Unknown Location,3.9,SHOPPING
1152,12/31/2016 15:03,12/31/2016 15:38,Unknown Location,Unknown Location,16.2,SHOPPING
1153,12/31/2016 21:32,12/31/2016 21:50,Katunayake,Gampaha,6.4,RESTAURANT


In [23]:
# check distribution of values
# value_counts() tallies each label in one vectorised call (missing values are
# excluded by default); sort_index() + to_dict() gives a plain {label: count}
# dict ordered by label.
place_type_counts = dataset_df['PLACE_TYPE*'].value_counts().sort_index().to_dict()
place_type_counts

{'BAR': 161,
 'GYM': 161,
 'OFFICE': 153,
 'OTHERS': 177,
 'PLAY ARENA': 164,
 'RESTAURANT': 178,
 'SHOPPING': 161}

In [24]:
# 50 mock user names; values for the USER* column are drawn from this list at
# random when the column is generated.
USERS = [
    "Erika Brown",
    "John Peters",
    "Mark Rodriguez DDS",
    "Alexandra Lewis",
    "Michael Mckinney",
    "David Whitaker",
    "Steven Jones",
    "Patricia Marshall",
    "Rachel Williams",
    "Michelle Day",
    "John Medina",
    "Cynthia Campbell",
    "William Walker",
    "Kylie Gordon",
    "Margaret Brock",
    "Alexis Barry",
    "Richard Reid",
    "Kelly Torres",
    "Maria Tran",
    "David Hartman",
    "Heather Maxwell",
    "Amber Young",
    "Aaron Webb",
    "Nancy Brennan",
    "Heather Mcguire",
    "Brendan Rivera",
    "Elizabeth Gross",
    "David Rodriguez",
    "Samantha Coleman",
    "Courtney Jones",
    "Tracy Hanna",
    "Paul Smith",
    "Traci Braun",
    "Rose Cruz",
    "Ryan Barnes",
    "Sophia Hernandez",
    "David Patel",
    "Alexis Wang",
    "Jeremy Price",
    "William Jennings",
    "Sarah Peck",
    "Lance Chan",
    "Troy Stewart",
    "Alexandria Barrett",
    "George Thomas Jr.",
    "Chad Davis",
    "Wesley Wilson",
    "Lynn Elliott",
    "Matthew Russell",
    "Nicole Garrett"
]

In [25]:
# Sample one user name per row, uniformly and with replacement.
# The sample size follows the frame instead of a hard-coded 1155, so there is
# always exactly one name per row. The fixed seed makes the synthetic column
# identical on every Restart & Run All.
USER_SEED = 50
random_users_arr = np.random.default_rng(USER_SEED).choice(
    USERS, size=len(dataset_df)
)

In [27]:
# Attach the sampled user names positionally. Assigning the array directly
# (rather than wrapping it in a default-indexed Series) skips index alignment,
# so a length mismatch raises immediately instead of silently producing NaN or
# dropping names.
dataset_df['USER*'] = random_users_arr
dataset_df

Unnamed: 0,START_DATE*,END_DATE*,START*,STOP*,MILES*,PLACE_TYPE*,USER*
0,1/1/2016 21:11,1/1/2016 21:17,Fort Pierce,Fort Pierce,5.1,BAR,Courtney Jones
1,1/2/2016 1:25,1/2/2016 1:37,Fort Pierce,Fort Pierce,5.0,OTHERS,Traci Braun
2,1/2/2016 20:25,1/2/2016 20:38,Fort Pierce,Fort Pierce,4.8,OTHERS,Margaret Brock
3,1/5/2016 17:31,1/5/2016 17:45,Fort Pierce,Fort Pierce,4.7,SHOPPING,William Walker
4,1/6/2016 14:42,1/6/2016 15:49,Fort Pierce,West Palm Beach,63.7,RESTAURANT,Jeremy Price
...,...,...,...,...,...,...,...
1150,12/31/2016 1:07,12/31/2016 1:14,Kar?chi,Kar?chi,0.7,RESTAURANT,Amber Young
1151,12/31/2016 13:24,12/31/2016 13:42,Kar?chi,Unknown Location,3.9,SHOPPING,Rose Cruz
1152,12/31/2016 15:03,12/31/2016 15:38,Unknown Location,Unknown Location,16.2,SHOPPING,Mark Rodriguez DDS
1153,12/31/2016 21:32,12/31/2016 21:50,Katunayake,Gampaha,6.4,RESTAURANT,William Walker


In [28]:
# check distribution of values
# value_counts() tallies each user in one vectorised call (missing values are
# excluded by default); sort_index() + to_dict() gives a plain {user: count}
# dict ordered by name.
user_counts = dataset_df['USER*'].value_counts().sort_index().to_dict()
user_counts

{'Aaron Webb': 21,
 'Alexandra Lewis': 20,
 'Alexandria Barrett': 18,
 'Alexis Barry': 18,
 'Alexis Wang': 28,
 'Amber Young': 24,
 'Brendan Rivera': 22,
 'Chad Davis': 25,
 'Courtney Jones': 21,
 'Cynthia Campbell': 15,
 'David Hartman': 21,
 'David Patel': 23,
 'David Rodriguez': 14,
 'David Whitaker': 22,
 'Elizabeth Gross': 14,
 'Erika Brown': 26,
 'George Thomas Jr.': 19,
 'Heather Maxwell': 25,
 'Heather Mcguire': 31,
 'Jeremy Price': 21,
 'John Medina': 22,
 'John Peters': 22,
 'Kelly Torres': 35,
 'Kylie Gordon': 24,
 'Lance Chan': 19,
 'Lynn Elliott': 19,
 'Margaret Brock': 26,
 'Maria Tran': 15,
 'Mark Rodriguez DDS': 28,
 'Matthew Russell': 22,
 'Michael Mckinney': 23,
 'Michelle Day': 18,
 'Nancy Brennan': 34,
 'Nicole Garrett': 21,
 'Patricia Marshall': 27,
 'Paul Smith': 24,
 'Rachel Williams': 22,
 'Richard Reid': 29,
 'Rose Cruz': 22,
 'Ryan Barnes': 23,
 'Samantha Coleman': 31,
 'Sarah Peck': 20,
 'Sophia Hernandez': 28,
 'Steven Jones': 19,
 'Traci Braun': 18,
 'Tracy

In [33]:
# Write the augmented frame to disk; index=False keeps the row index out of the file.
OUTPUT_CSV = 'Synthetic User Trips Data.csv'
dataset_df.to_csv(OUTPUT_CSV, index=False)

In [34]:
# verify the dataset
# Read the export back and check that it has the same columns and row count as
# the in-memory frame, then display it for a visual check.
exported_df = pd.read_csv("Synthetic User Trips Data.csv")
assert list(exported_df.columns) == list(dataset_df.columns), "column mismatch after export"
assert len(exported_df) == len(dataset_df), "row count mismatch after export"
exported_df

Unnamed: 0,START_DATE*,END_DATE*,START*,STOP*,MILES*,PLACE_TYPE*,USER*
0,1/1/2016 21:11,1/1/2016 21:17,Fort Pierce,Fort Pierce,5.1,BAR,Courtney Jones
1,1/2/2016 1:25,1/2/2016 1:37,Fort Pierce,Fort Pierce,5.0,OTHERS,Traci Braun
2,1/2/2016 20:25,1/2/2016 20:38,Fort Pierce,Fort Pierce,4.8,OTHERS,Margaret Brock
3,1/5/2016 17:31,1/5/2016 17:45,Fort Pierce,Fort Pierce,4.7,SHOPPING,William Walker
4,1/6/2016 14:42,1/6/2016 15:49,Fort Pierce,West Palm Beach,63.7,RESTAURANT,Jeremy Price
...,...,...,...,...,...,...,...
1150,12/31/2016 1:07,12/31/2016 1:14,Kar?chi,Kar?chi,0.7,RESTAURANT,Amber Young
1151,12/31/2016 13:24,12/31/2016 13:42,Kar?chi,Unknown Location,3.9,SHOPPING,Rose Cruz
1152,12/31/2016 15:03,12/31/2016 15:38,Unknown Location,Unknown Location,16.2,SHOPPING,Mark Rodriguez DDS
1153,12/31/2016 21:32,12/31/2016 21:50,Katunayake,Gampaha,6.4,RESTAURANT,William Walker
