In [1]:
## Loads data into Jupyter session from pickled file

import pickle

## The context manager closes the file handle as soon as the frame is loaded
## (the previous bare open() left it open for the life of the kernel).
## NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("modeldf.pickle","rb") as deserialized:
    df = pickle.load(deserialized)

In [2]:
## Assumes 80-20 split under seed 8888 for reproducibility
## Split data into test and training sets

from random import shuffle, seed

def dataSplit(data, testPercent = 0.2, s = 8888):
    """Split ``data`` into a (train, test) pair of row subsets.

    Row positions are shuffled with a generator seeded by ``s``; the first
    ``int(len(data) * testPercent)`` shuffled positions form the test set and
    the remaining positions form the training set.

    Parameters
    ----------
    data : object supporting ``len()`` and positional ``.iloc[list]`` indexing
        (a pandas DataFrame in this notebook).
    testPercent : float, default 0.2
        Fraction of rows assigned to the test set; must lie in [0, 1].
    s : default 8888
        Seed for the shuffle, for reproducibility.

    Returns
    -------
    tuple
        ``(train, test)`` as produced by ``data.iloc[...]``.

    Raises
    ------
    ValueError
        If ``testPercent`` is outside [0, 1]; such values previously produced
        overlapping or degenerate splits without any error.
    """
    # Function-scope import: the notebook only imports `shuffle` and `seed`.
    from random import Random

    if not 0 <= testPercent <= 1:
        raise ValueError("testPercent must be between 0 and 1, got %r" % (testPercent,))

    rowCount = len(data)
    identifiers = list(range(rowCount))
    # A private generator yields the same permutation as seed(s) + shuffle(),
    # but does not reseed the module-level generator used elsewhere.
    Random(s).shuffle(identifiers)
    stop = int(rowCount * testPercent)
    trainIds, testIds = identifiers[stop:], identifiers[:stop]
    return data.iloc[trainIds], data.iloc[testIds]

## Split the loaded frame with the defaults (testPercent=0.2, seed 8888)
train, test = dataSplit(df)

In [3]:
## Preview the first rows of the training split
train.head()

Unnamed: 0,Ticket ID,Ticket Created,Date of Issue,Time of Issue,Form,Method,Issue,Caller ID Number,Type of Call or Messge,Advertiser Business Number,...,State,Zip,Location (Target),Area Code (Target),Area Code (Source),Location (Source),Issue DateTime,Offset,Issue DateTime UTC,Time Elapsed
45170,1173633,2016-09-02 00:58:37,09/01/2016,7:46pm,Phone,Wireless (cell phone/other mobile device),Robocalls,949-945-2138,Prerecorded Voice,,...,CT,6830,"(41.000947, -73.656421)",914,949.0,"(33.573486153846, -117.73371615385)",2016-09-01 19:46:00,-14400.0,2016-09-01 23:46:00,01:12:37
175561,100741,2015-01-26 20:14:22,01/26/2015,11:30am,Phone,Wired,Telemarketing (including do not call and spoof...,818-666-8081,Live Voice,818-666-8081,...,IA,52404,"(41.935555, -91.691484)",319,818.0,"(34.186115, -118.43554333333)",2015-01-26 11:30:00,-18000.0,2015-01-26 16:30:00,03:44:22
1070318,2517587,2018-05-23 20:52:04,05/23/2018,1:49pm,Phone,Wired,Unwanted Calls,,Abandoned Calls,916-233-1935,...,CA,95758,"(38.424597, -121.423087)",916,,,2018-05-23 13:49:00,-25200.0,2018-05-23 20:49:00,00:03:04
484910,693520,2015-12-06 02:15:42,12/05/2015,7:45pm,TV,,Loud Commercials,,,,...,NY,14094,"(43.157609, -78.667129)",716,,,2015-12-05 19:45:00,-14400.0,2015-12-05 23:45:00,02:30:42
585677,903066,2016-04-09 00:08:25,04/08/2016,4:00pm,Phone,Wireless (cell phone/other mobile device),Telemarketing (including do not call and spoof...,617-792-5883,Prerecorded Voice,,...,CA,94303,"(37.442789, -122.129574)",650,617.0,"(42.348581818182, -71.100127272727)",2016-04-08 16:00:00,-25200.0,2016-04-08 23:00:00,01:08:25


In [84]:
## Data Visualization dependencies

import xarray as xr
import numpy as np
import pandas as pd
import holoviews as hv
import geoviews as gv
import geoviews.feature as gf

import cartopy
from cartopy import crs as ccrs

from bokeh.models.annotations import Title
from bokeh.tile_providers import STAMEN_TONER
from bokeh.models import WMTSTileSource
import geoviews.tile_sources as gts
from geoviews import dim, opts

## Disable pandas' chained-assignment (SettingWithCopy) warning for the whole session
pd.options.mode.chained_assignment = None
## Load the bokeh plotting backend for HoloViews output in the notebook
hv.notebook_extension('bokeh')




In [6]:
## Data Visualization for Location of Targets of Spam Calls

def mapDataInput(data, column, value, sourceQ = False):
    """Build the GeoViews dataset used by the map visualizations.

    Four columns are derived on a copy of ``data`` (the caller's frame is
    left unmodified):

    * ``Identifier`` -- 1 where ``data[column] == value``, otherwise 0.
    * ``Year`` -- ``.year`` of each ``'Issue DateTime UTC'`` entry.
    * ``Latitude`` / ``Longitude`` -- elements 0 and 1 of each entry in
      ``'Location (Source)'`` when ``sourceQ`` is true, otherwise of
      ``'Location (Target)'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``column``, ``'Issue DateTime UTC'`` and the selected
        location column.
    column : str
        Column compared against ``value`` to build ``Identifier``.
    value : object
        Value that marks a row with ``Identifier == 1``.
    sourceQ : bool, default False
        Use the source location instead of the target location.

    Returns
    -------
    gv.Dataset
        Dataset with key dimensions Identifier, Latitude, Longitude, Year.
    """
    locationColumn = 'Location (Source)' if sourceQ else 'Location (Target)'

    def coordinate(position):
        # Missing locations are float NaN (or None) and cannot be indexed;
        # map them to NaN instead of raising TypeError. Every other entry is
        # indexed exactly as before.
        # NOTE(review): assumes entries are (lat, lon) pairs -- confirm upstream.
        return data[locationColumn].apply(
            lambda loc: float('nan') if loc is None or isinstance(loc, float) else loc[position]
        )

    # assign() returns a new frame, so repeated calls never accumulate
    # columns on (or trigger chained-assignment warnings for) the input.
    prepared = data.assign(
        Identifier=(data[column] == value).astype(int),
        Year=data['Issue DateTime UTC'].apply(lambda stamp: stamp.year),
        Latitude=coordinate(0),
        Longitude=coordinate(1),
    )
    return gv.Dataset(prepared, kdims=['Identifier', 'Latitude', 'Longitude','Year'])


## Build the map dataset from a reproducible (random_state=1) 10% sample of the
## training split, flagging rows whose 'Form' equals 'Phone'
dataInput = mapDataInput(train.sample(frac=0.1, replace=False, random_state=1), 'Form', 'Phone')

In [8]:
## Attempt to reorganize code structure without using Jupyter notebook Cell Magic (TODO)
## Rebinds `dataInput` to a 5% sample of `train` (same random_state=1), replacing
## the 10% sample assigned in the previous cell when cells run top to bottom
dataInput = mapDataInput(train.sample(frac=0.05, replace=False, random_state=1), 'Form', 'Phone')

In [106]:
## Layout for map visualization (Still needs to be integrated into map visualization)

## Tile provider(s) for the base map, keyed by display name
tiles = {
    'Wikipedia': WMTSTileSource(url='https://maps.wikimedia.org/osm-intl/{Z}/{X}/{Y}@2x.png'),
}

## Plot options applied to the Points element
## NOTE(review): this rebinds the name `opts` that the imports cell brings in from geoviews
opts = {
    'Points': {
        'width': 650,
        'height': 350,
        'size': 0.5,
        'cmap': 'viridis',
        'tools': ['hover'],
        'size_index': 2,
        'color_index': 2,
        'xaxis': None,
        'yaxis': None,
    },
}

## One WMTS layer per tile source, arranged along a 'Source' dimension
## NOTE(review): presumably extents are (lon_min, lat_min, lon_max, lat_max) --
## verify that this window covers the plotted points
targetTileLayout = hv.NdLayout(
    {label: gv.WMTS(provider, extents=(-70, 20, -50, 30)) for label, provider in tiles.items()},
    kdims=['Source'],
).options(title_format="Targeted Locations of Spam Calls by Lat-Long Coordinates")

## Points positioned by longitude/latitude, carrying Identifier and State as values
targetPoints = dataInput.to(
    gv.Points,
    kdims=['Longitude', 'Latitude'],
    vdims=['Identifier', 'State'],
    crs=ccrs.PlateCarree(),
)

## Last expression of the cell: overlay the points on the tiles and apply the options
(targetTileLayout * targetPoints).options(opts)



In [10]:
### Data Visualization of Source of Spam Call Aggregated by Area Code

## Keep only complaints that have a source location and tag each with its UTC year
locatedCalls = train.dropna(subset=['Location (Source)'])
locatedCalls = locatedCalls.assign(
    Year=locatedCalls['Issue DateTime UTC'].apply(lambda stamp: stamp.year)
)

## Count tickets per (Year, source location) pair
source = (
    locatedCalls[['Year', 'Location (Source)', 'Ticket ID']]
    .groupby(['Year', 'Location (Source)'])
    .count()
    .reset_index()
)

## Split each location entry into its two components, then relabel the
## ticket count as "Frequency" (index=str also converts the row labels to strings)
source = source.assign(
    Latitude=source['Location (Source)'].apply(lambda loc: loc[0]),
    Longitude=source['Location (Source)'].apply(lambda loc: loc[1]),
)
source = source.rename(index=str, columns={"Ticket ID": "Frequency"})

## Dataset keyed by Year for the frequency map
freq = gv.Dataset(source, kdims=['Year'])

In [107]:
## Tile provider(s) for the base map, keyed by display name
tiles = {
    'Wikipedia': WMTSTileSource(url='https://maps.wikimedia.org/osm-intl/{Z}/{X}/{Y}@2x.png'),
}

## Plot options applied to the Points element
## NOTE(review): this rebinds the name `opts` that the imports cell brings in from geoviews
opts = {
    'Points': {
        'width': 650,
        'height': 350,
        'size': 0.25,
        'cmap': 'viridis',
        'tools': ['hover'],
        'size_index': 2,
        'color_index': 2,
        'xaxis': None,
        'yaxis': None,
    },
}

## One WMTS layer per tile source, arranged along a 'Source' dimension
sourceTileLayout = hv.NdLayout(
    {label: gv.WMTS(provider, extents=(-70, 20, -50, 30)) for label, provider in tiles.items()},
    kdims=['Source'],
).options(title_format="Source Locations of Spam Calls by Frequency Aggregated By Area Code")

## Points positioned by longitude/latitude, carrying the per-location ticket count
frequencyPoints = freq.to(
    gv.Points,
    kdims=['Longitude', 'Latitude'],
    vdims=['Frequency'],
    crs=ccrs.PlateCarree(),
)

## Last expression of the cell: overlay the points on the tiles and apply the options
(sourceTileLayout * frequencyPoints).options(opts)