In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tag import pos_tag
from nltk.tokenize import WordPunctTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,cross_val_score
from collections import Counter
import json

from encryptedpickle import encryptedpickle
import pickle
from facepy import GraphAPI
import credentials
from geotext import GeoText

import datetime

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
graph = GraphAPI(credentials.fbtoken)

In [3]:
content = graph.get('/10153515548171271/feed')

In [20]:
content

{u'data': [{u'actions': [{u'link': u'https://www.facebook.com/10153515548171271/posts/10154538990696271',
     u'name': u'Comment'},
    {u'link': u'https://www.facebook.com/10153515548171271/posts/10154538990696271',
     u'name': u'Like'}],
   u'application': {u'category': u'Entertainment',
    u'id': u'124024574287414',
    u'link': u'https://www.instagram.com/',
    u'name': u'Instagram',
    u'namespace': u'instapp'},
   u'caption': u'Instagram photo by Aaron Kim \u2022 Nov 1, 2016 at 5:42am UTC',
   u'created_time': u'2016-11-01T05:42:35+0000',
   u'description': u"Tagged along for a Halloween hike w the good folks at Christfit. Good way to start the week! But it's already November. Where'd the year go??",
   u'from': {u'id': u'10153515548171271', u'name': u'Aaron Kim'},
   u'icon': u'https://www.facebook.com/images/icons/post.gif',
   u'id': u'10153515548171271_10154538990696271',
   u'is_expired': False,
   u'is_hidden': False,
   u'likes': {u'data': [{u'id': u'1010367704338743

In [4]:
# Write retrieved data to pickle file

encoder = encryptedpickle.EncryptedPickle(signature_passphrases=credentials.passphrases,
                                         encryption_passphrases=credentials.passphrases)

encryption = {
    # Add new encryption algorithm specification with id = 255.
    # Default algorithms can not be overridden so we must use some other
    # id, maybe best starting with 255 (maximum id) and decreasing by one
    # for next added algorithm.
    255: {
        # Algorithm name defined in EncryptedPickle.ALGORITHMS.
        'algorithm': 'aes-256-cbc',

        # Salt size for PBKDF2 key.
        'salt_size': 32,

        # Digest mode for PBKDF2 key.
        'pbkdf2_algorithm': 'sha256',

        # Use 10 iterations in PBKDF2 key generation.
        'pbkdf2_iterations': 10,
    },
}

encoder.set_algorithms(encryption=encryption)



In [5]:
sealed = encoder.seal(content)

In [None]:
print("* sealed: %s" % sealed)

In [10]:
with open('fbfeed.p','wb') as fp:
    pickle.dump(sealed,fp)

In [5]:
# Loading pickle data from file (if necessary)

with open('fbfeed.p','rb') as fp:
    sealedcontent = pickle.load(fp)

unsealed = encoder.unseal(sealedcontent)

In [6]:
type(unsealed)

dict

In [10]:
locations = {}

messages = []

for post in unsealed['data']:
    try:
        print 'Title:%s'%post['name']
        print 'Time:%s'%post['created_time']
        print 'Message:%s'%post['message']
        print 'Location:%s'%post['place']['location']
        locations.update({post['created_time']:post['place']['location']})
    except:
        pass
    print "-------"


Title:Instagram photo by Aaron Kim • Nov 1, 2016 at 5:42am UTC
Time:2016-11-01T05:42:35+0000
Message:Tagged along for a Halloween hike w the good folks at Christfit. Good way to start the week! But it's already November. Where'd the year go??
Location:{'city': 'Altadena', 'zip': '91001', 'country': 'United States', 'longitude': -118.13054080357, 'state': 'CA', 'latitude': 34.204032626371}
-------
Title:Instagram photo by Aaron Kim • Oct 29, 2016 at 9:13pm UTC
Time:2016-10-29T21:17:43+0000
Message:Was looking for a costume but took a detour and got distracted. 😬
Location:{'city': 'Los Angeles', 'zip': '90012', 'country': 'United States', 'longitude': -118.24051062316, 'state': 'CA', 'street': '134 Japanese Village Plaza, Bldg E', 'latitude': 34.048925395375}
-------
-------
Title:Kmovie - JBG Pictures USA
Time:2016-10-25T07:35:30+0000
Message:Lawls. Anyone up to catch this tomorrow night? Limited engagement (tomorrow's the last day at CGV LA). 9:45 show. HMU.
-------
Title:The Best Frie

In [17]:
locations

{'2016-10-29T21:17:43+0000': {'city': 'Los Angeles',
  'country': 'United States',
  'latitude': 34.048925395375,
  'longitude': -118.24051062316,
  'state': 'CA',
  'street': '134 Japanese Village Plaza, Bldg E',
  'zip': '90012'},
 '2016-11-01T05:42:35+0000': {'city': 'Altadena',
  'country': 'United States',
  'latitude': 34.204032626371,
  'longitude': -118.13054080357,
  'state': 'CA',
  'zip': '91001'}}

In [47]:
fblocdata = []

for timestamp in locations.keys():
    columns = locations[timestamp].keys() if len(locations[timestamp].keys()) > len(columns) else columns
    
    fbdataholder= []
    for title in columns:
        try:
            fbdataholder.append(locations[timestamp][title])
        except:
            fbdataholder.append('nan')
    
    fblocdata.append(fbdataholder)

fblocdf = pd.DataFrame(fblocdata,columns=columns)
    
print 'Given the available Facebook post data, the following location data is available:'
fblocdf

Given the available Facebook post data, the following location data is available:


Unnamed: 0,city,zip,country,longitude,state,street,latitude
0,Los Angeles,90012,United States,-118.240511,CA,"134 Japanese Village Plaza, Bldg E",34.048925
1,Altadena,91001,United States,-118.130541,CA,,34.204033


# ==== LOCATION BASED CHALLENGE (LBC) ====

The concept of this approach is to take user location data (if available) and try to build a challenge option based on the location data available.

Google maps user location history is enabled by default and running in the background, so it's reasonable to assume this data is readily available.

Some things to consider when deciding to include an LBC are:
- Date of last location : for now, the null hypothesis/assumption about the user's ability to retain their own location information is that they will be able to recall where they were with reasonable accuracy for up to three days worth of history. This assumption will need to be tested and will need to be updated accordingly in effort to improve the location recollection success rate over time.
- If the data is a pair of geographic coordinates but no specific business/name reference, is there user history with reference to any businesses or other frequent markers that can be tied to the user? 
    - eg: have a pair of coordinates for the corner of Washington and Hill. There is a subway, chinese restaurant, mcdonalds, courthouse, and maker city LA/General Assembly. Is there user history to suggest which one (or more) of these were visited by the user? 

## Source : Google Maps Data

In [48]:
g_loc_hist_raw = pd.read_json('LocationHistory.json')

In [49]:
gmaps_columns = ['timestamp','lat','lng','acc','vel','heading','altitude','v_acc']

In [50]:
gmaps_coord_disp = 10.**7

In [51]:
def pandify_gloc_data(data, columns, geodisp = 10.**7):
    loc_hist = []
        
    for datapoint in data:
        try:
            timestamp = datetime.datetime.fromtimestamp((int(datapoint['timestampMs'])/1000)).strftime('%Y-%m-%d %H:%M:%S')
        except:
            timestamp = ''
        try:
            lat = datapoint['latitudeE7']/(geodisp)
        except:
            lat = '0'
        try:
            lng = datapoint['longitudeE7']/(geodisp)
        except:
            lng = '0'
        try:
            alt = datapoint['altitude']
        except:
            alt = '0'
        try:
            v_acc = datapoint['verticalAccuracy']
        except:
            v_acc = '0'
        try:
            vel = datapoint['velocity']
        except:
            vel = '0'
        try:
            heading = datapoint['heading']
        except:
            heading = '0'
        try:
            acc = datapoint['accuracy']
        except:
            acc = '0'

        loc_hist.append([timestamp, lat, lng, acc, vel, heading, alt, v_acc])
    
    
    return pd.DataFrame(loc_hist, columns=columns)
    

In [52]:
gloc_hist = pandify_gloc_data(g_loc_hist_raw['locations'],gmaps_columns,gmaps_coord_disp)

In [56]:
gloc_hist.head(10)

Unnamed: 0,timestamp,lat,lng,acc,vel,heading,altitude,v_acc
0,2016-10-30 15:10:58,34.066658,-118.314842,10,12,333,67,4
1,2016-10-30 15:09:45,34.066535,-118.314814,43,0,0,0,0
2,2016-10-30 15:08:16,34.062415,-118.31419,10,15,3,61,4
3,2016-10-30 15:06:07,34.052689,-118.309767,5,15,270,60,12
4,2016-10-30 15:04:00,34.052777,-118.29752,10,2,270,98,32
5,2016-10-30 15:03:40,34.05299,-118.296021,29,0,0,0,0
6,2016-10-30 13:45:19,34.052913,-118.296064,65,0,0,92,48
7,2016-10-30 13:23:02,34.053039,-118.296075,40,0,0,0,0
8,2016-10-30 13:21:27,34.052142,-118.287942,10,10,266,68,4
9,2016-10-30 13:17:39,34.0591,-118.282748,10,6,271,86,4


Based on the following degree precision information (Source: Wikipedia, https://en.wikipedia.org/wiki/Decimal_degrees), I can try grouping the points of interest by first 5 decimal places, then 4 if necessary. The purpose of this is to generate a count of possible repeat location measurements at particular locations. Locations with higher repeat measurements may be locations of particular interest to the user. Combined with the timestamp, this may provide meaningful information for LBC generation.

![title](gps-resolution.png)

## Source: Facebook/Instagram

In [141]:
locations = {}

messages = []
fb_loc = []

for post in unsealed['data']:
    try:
        fb_loc.append([post['name'],post['created_time'],post['place']['location']])
    except:
        pass


In [142]:
fb_loc

[[u'Instagram photo by Aaron Kim \u2022 Nov 1, 2016 at 5:42am UTC',
  '2016-11-01T05:42:35+0000',
  {'city': 'Altadena',
   'country': 'United States',
   'latitude': 34.204032626371,
   'longitude': -118.13054080357,
   'state': 'CA',
   'zip': '91001'}],
 [u'Instagram photo by Aaron Kim \u2022 Oct 29, 2016 at 9:13pm UTC',
  '2016-10-29T21:17:43+0000',
  {'city': 'Los Angeles',
   'country': 'United States',
   'latitude': 34.048925395375,
   'longitude': -118.24051062316,
   'state': 'CA',
   'street': '134 Japanese Village Plaza, Bldg E',
   'zip': '90012'}]]

## === Question : How to tie in location data across sources to build an LBC? ===

Algorithm angles of attack:
1. Correlation between posts with location data vs. all posts
    - With what frequency (or "importance") does the user tie-in location data to their digital footprint?
        - Check-ins, place mentions, instagram posts with tags, maps history frequencies, etc. 

# =============== END LBC ================

In [16]:
user_features = TfidfVectorizer(stop_words='english',
                               strip_accents='ascii')

In [17]:
X_features = user_features.fit_transform(messages)

In [19]:
#user_features.vocabulary_

## === LBC Generation ===

This is a rough implementation of the Location Based Challenge module for SG1. 

Expected final implementation will be a LBC class which will include measures to destroy the instance after use or upon expiration so that data does not reside in memory. 

In [None]:
# Password tokenization

def tokemon():
    
    
    
    return token

In [None]:
# Version 0.1 - Basic LBC generation based on assumed data from above. 

# No parameters
# Data is assumed to inherently exist from processing/model applications above.

# Function will return LBC with hint if available along with randomized token for answer tokenization. 

# Secondary function will take LBC challenge answer and token and repond with authentication response. 


def genlbc():
    
    challenge = ["This will be the query.", "This would be a hint if available"]
    token = 
    
    return challenge,token

In [None]:
def passlbc(answer,token):
    pass
    # return False
    
    # return True

## === LBC Tests ===