In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
from datetime import datetime
import pickle
import os
import re
import csv



In [2]:
# Load the raw complaint records; the relative path is resolved against the
# directory the kernel is currently running in.
robocall_csv = os.path.join(os.getcwd(), 'data/robocall.csv')
df = pd.read_csv(robocall_csv)

In [3]:
## Preprocess Time of Issue to consistent format

def preprocessTime(s):
    """Normalise a free-form time-of-day string to ``<hours>:<minutes><am|pm>``.

    Parameters
    ----------
    s : str or float
        Raw cell value. Floats (how pandas represents a missing cell) are
        returned untouched.

    Returns
    -------
    str or float
        * the cleaned string when it already ends in a two-character suffix
          after the ``h:mm`` part (e.g. ``'10:30 P.M.'`` -> ``'10:30pm'``);
        * ``h:mm`` + ``'am'`` when the leftover suffix contains an ``'a'``,
          otherwise ``h:mm`` + ``'pm'`` (a bare ``'10:30'`` becomes
          ``'10:30pm'``);
        * ``nan`` when a suffix has to be rebuilt but the text contains no
          ``:`` at all, so there is no ``h:mm`` part to attach it to.
          Previously this case raised ``AttributeError`` on ``None.group``.
    """
    if isinstance(s, float):
        return s

    # Everything up to and including the first ':' plus the digits after it.
    clock_pattern = r'[^:]*:[^\D]*'

    # Keep only letters, digits and ':' so 'P.M.', ' pm' and 'PM' all collapse.
    s = re.sub(r'[^a-zA-Z\d:]', '', s.lower())

    # Whatever is left once every clock-like chunk is removed is the suffix.
    suffix = re.sub(clock_pattern, '', s)
    if len(suffix) == 2:
        return s

    clock = re.match(clock_pattern, s)
    if clock is None:
        # No ':' anywhere: report the entry as missing instead of crashing.
        return float('nan')

    meridiem = 'am' if 'a' in suffix else 'pm'
    return clock.group(0) + meridiem

# Overwrite the raw column with its normalised form, one entry at a time.
raw_issue_times = df['Time of Issue']
df['Time of Issue'] = raw_issue_times.apply(preprocessTime)

In [4]:
## Dropped all row entries with no provided location...
## Asserts dropped data is insignificant information loss (assumed threshold 5%)

location_column = 'Location (Center point of the Zip Code)'

# Guard: the rows about to be discarded must be under 5% of the dataset.
rows_without_location = df[df[location_column].isnull()]
assert len(rows_without_location) / len(df) < 0.05

df = df.dropna(subset=[location_column])

In [5]:
## Generate Location dictionary with (zip code, location coordinate) pairs
## Assumes df contains column 'Location (Center point of the Zip Code)'

def getLocations(df, d=None):
    """Build a ``{zip code: coordinates}`` mapping from the location column.

    Each distinct value of ``'Location (Center point of the Zip Code)'`` is
    split on newlines. Only values that split into exactly two lines are used:
    the key is taken from the first line (text before the first ``'-'``, then
    its second space-separated token) and the value is the second line parsed
    with ``ast.literal_eval``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``'Location (Center point of the Zip Code)'``
        holding strings.
    d : dict, optional
        Mapping to add the entries to. A new dict is created when omitted.
        (The previous ``d = dict()`` default was a single shared object, so
        results leaked from one call into the next.)

    Returns
    -------
    dict
        ``d`` (or the newly created dict) with the parsed entries added.
    """
    if d is None:
        d = {}

    for raw in set(df['Location (Center point of the Zip Code)']):
        parts = raw.split('\n')
        if len(parts) != 2:
            # No separate coordinate line: nothing to parse.
            continue
        header, coordinates = parts
        zip_code = header.split('-')[0].split(' ')[1]
        d[zip_code] = literal_eval(coordinates)
    return d

# Zip code -> coordinate lookup used by the later cells.
locationDict = getLocations(df=df)

In [6]:
## Converts coordinates to respective timezone from UTC in seconds
## Takes about ten minutes if timeZoneDict.pickle doesn't exist

if not os.path.isfile("timeZoneDict.pickle"):
    ## Requires timezonefinder (if we deploy to application, we'll add this to requirements.txt)
    ! pip install timezonefinder
    from timezonefinder import TimezoneFinder
    from pytz import timezone
    import pytz
    
    tf = TimezoneFinder()
    utc = pytz.utc

    def offset(coordinates):
        latitude, longitude = coordinates
        target = dict({'lat':latitude, 'lng':longitude})
        today = datetime.now()
        tz_target = timezone(tf.certain_timezone_at(lat=target['lat'], lng=target['lng']))
        today_target = tz_target.localize(today)
        today_utc = utc.localize(today)
        return (today_utc - today_target).total_seconds()

    timeZoneDict = {k: offset(v) for k, v in locationDict.items()}
    serialized = open("timeZoneDict.pickle", "wb")
    pickle.dump(timeZoneDict, serialized)
    serialized.close()
else:
    deserialized = open("timeZoneDict.pickle","rb")
    timeZoneDict = pickle.load(deserialized)

In [7]:
## Converts coordinates to area code
## Takes about ten minutes if areaCodeDict.pickle doesn't exist

if not os.path.isfile("areaCodeDict.pickle"):
    def geocoordinateDistance(origin, destination):
        """Return the great-circle distance in km between two (lat, lon) points.

        Haversine formula with an Earth radius of 6371 km.
        Taken from https://gist.github.com/rochacbruno/2883505
        Author: Wayne Dyck
        """
        from math import radians, sin, cos, atan2, sqrt
        lat1, lon1 = origin
        lat2, lon2 = destination
        radius = 6371 # km

        dlat = radians(lat2-lat1)
        dlon = radians(lon2-lon1)
        a = sin(dlat/2) * sin(dlat/2) + cos(radians(lat1)) \
            * cos(radians(lat2)) * sin(dlon/2) * sin(dlon/2)
        c = 2 * atan2(sqrt(a), sqrt(1-a))
        d = radius * c

        return d

    # Read the lookup table once instead of once per zip code: re-parsing the
    # CSV inside findAreaCode repeated identical work for every entry.
    areaCodeTable = pd.read_csv(os.path.join(os.getcwd(),
        'data/area-code-to-lat-long.csv'))
    areaCodeTable['Loc'] = list(zip(areaCodeTable.Lat, areaCodeTable.Long))

    def findAreaCode(coordinates):
        """Return the 'Area Code' of the table row closest to ``coordinates``.

        ``coordinates`` is a ``(latitude, longitude)`` pair; closeness is the
        distance computed by ``geocoordinateDistance``.
        """
        distances = areaCodeTable['Loc'].apply(lambda x:
            geocoordinateDistance(x, coordinates))
        # idxmin returns an index *label*, so look the row up with .loc.
        return areaCodeTable.loc[distances.idxmin(), 'Area Code']

    areaCodeDict = {k: findAreaCode(v) for k,v in locationDict.items()}
    # `with` closes the cache file even if pickling fails part-way through.
    with open("areaCodeDict.pickle", "wb") as serialized2:
        pickle.dump(areaCodeDict, serialized2)
else:
    # NOTE: pickle.load can execute arbitrary code; only reuse a cache file
    # that this notebook produced itself.
    with open("areaCodeDict.pickle", "rb") as deserialized2:
        areaCodeDict = pickle.load(deserialized2)

In [8]:
if not os.path.isfile("areaCodeToLatLongPickle.pickle"):
    # Build {area code (str): (lat, long)} from the CSV lookup table.
    areaCodeToLocation = {}
    with open('data/area-code-to-lat-long.csv', 'r') as f:
        reader = csv.reader(f)
        next(reader)  # skip the header row
        for row in reader:
            [area_code, lat, _long] = row
            areaCodeToLocation[area_code] = (float(lat), float(_long))

    # Write to the same file name that is checked and loaded in this cell.
    # The cache used to be written without the ".pickle" suffix, so the
    # existence check never succeeded and the cache was never reused.
    with open("areaCodeToLatLongPickle.pickle", "wb") as f:
        pickle.dump(areaCodeToLocation, f)
else:
    # NOTE: pickle.load can execute arbitrary code; only reuse a cache file
    # that this notebook produced itself.
    with open("areaCodeToLatLongPickle.pickle", "rb") as deserialized2:
        areaCodeToLocation = pickle.load(deserialized2)

In [9]:
## Preprocess Location and Zip to consistent format
## Converts Ticket Created to datetime object in df master dataframe
## Fill some NaN phone numbers so that 'Caller ID Number' accepts defined entry in 'Advertiser Number'
## Remove 'Accessibility', 'Emergency', 'Request for Dispute Assistance' forms (incomplete data)

def preprocessLocation(s):
    """Extract the zip token from a raw location string.

    A newline is first inserted wherever a digit is immediately followed by
    '(' so that the coordinate part always sits on its own line. The result
    is the last space-separated token of the first line, cut at the first '-'.
    """
    separated = re.sub(r'(\d)\(', r'\1\n(', s)
    first_line = separated.partition('\n')[0]
    last_token = first_line.rsplit(' ', 1)[-1]
    return last_token.partition('-')[0]

def location(zipCode):
    """Look ``zipCode`` up in ``locationDict``; unknown keys yield ``np.nan``."""
    return locationDict.get(zipCode, np.nan)

def convertAreaCode(areaCode):
    """Look ``areaCode`` up in ``areaCodeToLocation``; unknown keys yield ``np.nan``."""
    return areaCodeToLocation.get(areaCode, np.nan)

df = df.rename(index = str, columns = {'Location (Center point of the Zip Code)': 'Location (Target)'})

# Extract the zip once and reuse it for the coordinate lookup instead of
# running preprocessLocation a second time over every row.
df['Zip'] = df['Location (Target)'].apply(preprocessLocation)
df['Location (Target)'] = df['Zip'].apply(location)
df = df.dropna(subset = ['Location (Target)'])

# The last six characters of 'Ticket Created' are dropped before parsing.
df['Ticket Created'] = df['Ticket Created'].apply(lambda x: datetime.strptime(x[:-6], "%m/%d/%Y %I:%M:%S %p"))

# Use the advertiser number only where the caller ID is missing. The previous
# combine(..., max) picked the lexicographically larger string, so whenever
# both columns were filled an existing caller ID could be overwritten (for
# example by the literal 'None'). It also relied on replace('', nan,
# regex=True), i.e. an empty regular expression, to restore missing values.
df['Caller ID Number'] = df['Caller ID Number'].fillna(df['Advertiser Business Number'])

# Drop the forms with incomplete data, after checking they are under 5% of rows.
insignificant = ['Accessibility', 'Emergency', 'Request for Dispute Assistance']
assert df[df.Form.isin(insignificant)].groupby('Form')['Ticket ID'].nunique().sum()/len(df) < 0.05
# .copy() so the column assignments below write to an independent frame.
df = df[~df.Form.isin(insignificant)].copy()

df['Area Code (Target)'] = df['Zip'].apply(lambda x: areaCodeDict[x])
# Text before the first '-' of the caller ID; the literal 'None' counts as missing.
df['Area Code (Source)'] = df['Caller ID Number'].apply(lambda x: x.split('-')[0] if isinstance(x, str) else x).replace('None', np.nan)

## Removes Internet data given lack of available information per feature
df = df[~(df.Form == 'Internet')].copy()

## Adds location coordinates for source of spam call
df['Location (Source)'] = df['Area Code (Source)'].apply(convertAreaCode)

In [10]:
## Some columns with predominately null values under given form. Should be noted.

from itertools import product

def computePercentageOfNull(form, columnName, data = df):
    """Return the fraction of defined (non-null) entries of a column for one form.

    Despite the name, the value is the share of rows with ``Form == form``
    whose ``columnName`` entry is NOT null, as a ratio between 0 and 1.
    ``data`` defaults to the ``df`` object that existed when this function
    was defined.
    """
    form_values = data.loc[data.Form == form, columnName]
    defined_values = form_values.dropna()
    return len(defined_values) / len(form_values)

# Report every (form, column) pair whose share of defined entries is below 5%.
report_line = 'The column {} under form {} contains {}% defined row entries.'
for form in set(df.Form):
    for column in df.columns:
        percentage = computePercentageOfNull(form, column)
        if percentage < 0.05:
            print(report_line.format(column, form, 100*percentage))

The column Type of Call or Messge under form TV contains 0.0% defined row entries.
The column Area Code (Source) under form TV contains 0.0% defined row entries.
The column Location (Source) under form TV contains 0.0% defined row entries.
The column Type of Call or Messge under form Radio contains 0.0% defined row entries.
The column Area Code (Source) under form Radio contains 0.0% defined row entries.
The column Location (Source) under form Radio contains 0.0% defined row entries.


In [11]:
%%capture

## Assumes 80-20 train-test split under seed value 8888 for reproducibility of results
## Creates Issue DateTime column in data and fixes misrecorded dates of issue
## Assumes all reports are submitted within a day of receiving spam call

from datetime import datetime, timedelta

def convert(x):
    """Shift a row's 'Issue DateTime' back by its 'Offset' (given in seconds)."""
    shift = timedelta(seconds=x['Offset'])
    return x['Issue DateTime'] - shift

def computeTimeDifference(row):
    """Return ``row["Ticket Created"] - row["Issue DateTime UTC"]``.

    Returns
    -------
    The difference of the two entries, or ``np.nan`` when the subtraction
    fails with a ``ValueError`` whose message contains
    ``"Out of bounds nanosecond timestamp:"``.

    Raises
    ------
    ValueError
        Any other ``ValueError`` is re-raised. Previously such errors were
        swallowed and the function silently returned ``None``.
    """
    try:
        return row["Ticket Created"] - row["Issue DateTime UTC"]
    except ValueError as e:
        if "Out of bounds nanosecond timestamp:" in str(e):
            return np.nan
        raise

# Work on an explicit copy so the new columns below are written to an
# independent frame rather than to a possible view of `df`.
data = df.dropna(subset = ['Date of Issue', 'Time of Issue']).copy()

# Local issue timestamp, parsed from the date and the normalised time of day.
data["Issue DateTime"] = (data["Date of Issue"] + ' ' + data["Time of Issue"]).apply(
    lambda x: datetime.strptime(x, "%m/%d/%Y %I:%M%p"))
data['Offset'] = data['Zip'].apply(lambda x: timeZoneDict[x])
data['Issue DateTime UTC'] = data[['Issue DateTime', 'Offset']].apply(convert, axis = 1)

columns = ["Ticket ID", "Ticket Created", "Issue DateTime UTC"]
data['Time Elapsed'] = data[columns].apply(computeTimeDifference, axis = 1)

# Keep only the sub-day part of the elapsed time and count it back from the
# ticket timestamp, which places every issue within a day of its ticket.
data['Issue DateTime UTC'] = data['Ticket Created'] - (data['Time Elapsed'] - 
    pd.to_timedelta(data['Time Elapsed'].dt.days, unit='d'))


In [13]:
# Persist the cleaned master frame and the modelling frame. `with` closes each
# file even if pickling raises part-way through.
with open("df.pickle", "wb") as serialized3:
    pickle.dump(df, serialized3)

with open("modeldf.pickle", "wb") as serialized4:
    pickle.dump(data, serialized4)