In [2]:
import scipy.spatial as spatial
import numpy as np
import re
import json
import codecs
from datetime import datetime
from nltk.corpus import stopwords
from dateutil import parser
from tqdm import tqdm

In [3]:
def RemoveStopWords(stopWords, text):
    return [w for w in text if w not in stopWords and len(w)>2]

class Tweet:
    def __init__(self, id, text, time, stopWords, bounding_box = None, coordinates = None):
        self.id = id
        self.text = RemoveStopWords(stopWords, 
                                    list(filter(None,re.split('[^a-z]',text.lower()))))
        self.time = parser.parse(str(time))
        if bounding_box == None:
            self.bounding_box = []
        else:
            self.bounding_box = bounding_box
        self.coordinates = coordinates

In [4]:
stopWords = set(stopwords.words('english'))
nonloc_values = [Tweet(json.loads(line)["_id"]["$numberLong"],
                       json.loads(line)["text"], 
                       json.loads(line)["created_at"],
                       stopWords)
                for line 
                in tqdm(open("actual_data/nogeo.json"))
                if json.loads(line)["lang"] == "en"
                    and json.loads(line)["in_reply_to_user_id"] == None 
                    and json.loads(line)["in_reply_to_status_id"] == None
                    and json.loads(line)["retweeted"] == False]
#nonloc_values = RemoveStopWords(stopWords, nonloc_values)

bbox_values = [Tweet(json.loads(line)["_id"]["$numberLong"],
                     json.loads(line)["text"], 
                     json.loads(line)["created_at"],
                     stopWords,
                bounding_box = json.loads(line)["place"]["bounding_box"]["coordinates"][0])
                for line 
                in tqdm(open("actual_data/bbox.json"))
                if json.loads(line)["lang"] == "en"
                    and json.loads(line)["in_reply_to_user_id"] == None 
                    and json.loads(line)["in_reply_to_status_id"] == None
                    and json.loads(line)["retweeted"] == False]
#bbox_values = RemoveStopWords(stopWords, bbox_values)

print("Data loaded. Number of nonloc is %d"%len(nonloc_values))
print("and number of bbox is %d"%len(bbox_values))

tweet_content = [value.text for value in tqdm(bbox_values + nonloc_values)]
#flatten the list of lists to 1d array
flatten_content = [item for sublist in tweet_content for item in sublist]
#remove duplicates
content_dict = {w:'' for w in flatten_content}
#enumerate without duplicates
content_enum = {w: idx for idx, w in enumerate(content_dict)}

print("Dictionary is generated. Number of words %d"%len(content_enum))

conjunction_matrix = np.zeros((len(bbox_values + nonloc_values), len(content_enum)), dtype=int) 
d = dict()
for idx, tweet in enumerate(tqdm(bbox_values + nonloc_values)):
    d[tweet.id] = idx
    for w in tweet.text:
        conjunction_matrix[idx,content_enum[w]] += 1.
        
print("Matrix is calculated. Shape is", conjunction_matrix.shape)

9721it [00:08, 1158.37it/s]
862it [00:00, 1486.39it/s]
100%|██████████| 7865/7865 [00:00<00:00, 1194921.61it/s]
 16%|█▌        | 1227/7865 [00:00<00:00, 12266.59it/s]

Data loaded. Number of nonloc is 7322
and number of bbox is 543
Dictionary is generated. Number of words 17375


100%|██████████| 7865/7865 [00:00<00:00, 10859.86it/s]

Matrix is calculated. Shape is (7865, 17375)





In [5]:
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
from datetime import timedelta
new_col = []
from lshash import lshash

lsh = lshash.LSHash(6, len(content_enum))

bbox_dict = {}

for bbox in bbox_values: 
    lsh.index(conjunction_matrix[d[bbox.id]], extra_data=bbox.id)
    bbox_dict[bbox.id] = bbox
    
#make a threshold for similarity
threshold = 0.9

for tweet in tqdm(nonloc_values):
    cs = lsh.query(conjunction_matrix[d[tweet.id]], num_results=10, distance_func='cosine')
    points = []
    boxes = []
    cs2 = []
    for m in cs:
        if m[1]<threshold:
            cs2.append([m[0][1], m[1]])
    for idx in cs2:
        tdelta = (bbox.time-tweet.time)/timedelta(minutes=1)
        #another threshold by time. Not more than a week
        if tdelta > 60*24*7: continue
        bbox = bbox_dict[idx[0]]
        points+=[(x, tdelta+0.0001, idx[1]+0.0001)
                 for x in bbox.bounding_box]
        boxes.append(Polygon(bbox.bounding_box))
    x0 = np.sum([x[0][0]*(1/x[1]+1/x[2]) for x in points])
    y0 = np.sum([x[0][1]*(1/x[1]+1/x[2]) for x in points])
    m0 = np.sum([1/x[1]+1/x[2] for x in points])
    coord_res = Point([x0/m0, y0/m0])
    for box in boxes:
        if box.contains(coord_res):
            tweet.bounding_box = box.exterior.coords
            new_col.append(tweet)
            break

print("We could recognize %f per cent tweets"%(len(new_col)*100.0/len(nonloc_values)))
    

100%|██████████| 7322/7322 [02:18<00:00, 52.89it/s]

We could recognize 55.544933 per cent tweets





In [136]:
exact_values = [Tweet(json.loads(line)["_id"]["$numberLong"].encode("utf-8"),
                filter(None,re.split('[^a-z]', 
                    json.loads(line)["text"].lower().encode("utf-8"))),
                coordinates = json.loads(line)["coordinates"]["coordinates"])
                for line 
                in open("actual_data/exact.json")]
exact_values = RemoveStopWords(stopWords, exact_values)

exact = [value.text for value in exact_values + bbox_values]
#flatten the list of lists to 1d array
exact_flatten = [item for sublist in exact for item in sublist]
#remove duplicates
exact_dict = {w:'' for w in exact_flatten}
#enumerate without duplicates
exact_enum = {w: idx for idx, w in enumerate(exact_dict)}

exact_matrix = np.zeros((len(exact_values+bbox_values), len(exact_enum)), dtype=int) 
d = dict()
for idx, tweet in enumerate(exact_values + bbox_values):
    d[tweet.id] = idx
    for w in tweet.text:
        exact_matrix[idx,exact_enum[w]] += 1

from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

from lshash import LSHash
i=0
for tweet in bbox_values:
    inside_tweets = [value for value in exact_values 
                     if Polygon(tweet.bounding_box).contains(
                     Point(value.coordinates))]
    lsh = LSHash(12, len(exact_enum))
    if len(inside_tweets)<3: continue
    for insider in inside_tweets: 
        lsh.index(exact_matrix[d[insider.id]])
    
    cs = lsh.query(exact_matrix[d[tweet.id]], num_results=3)
    points = []
    for insider in inside_tweets:
        if exact_matrix[d[insider.id]] in cs:
            points.append(insider.coordinates)
    if len(cs)!=0:
        i+=1
print i
print len(bbox_values)



13
753
