In [2]:
from pymongo import MongoClient
from bson.objectid import ObjectId


In [3]:
# MongoDB server address used to open the client connection.
HOST, PORT = "localhost", 27017
# Database and collection that the queries below read from.
DB, COLLECTION = "twitter_dutch", "tweet"


In [4]:
# Open the MongoDB connection and select the configured database.
client = MongoClient(host=HOST, port=PORT)
db = client[DB]

### Retrieve the tweets (only the first 20)

In [54]:
# Materialise only the first 20 documents of the collection into a list.
first_20_cursor = db[COLLECTION].find().limit(20)
tweets = list(first_20_cursor)

### Query the database with a filter

In [5]:
# Keep only tweets whose "lang" is "en" and whose "geo" field is not null.
english_geotagged_filter = {"lang": "en", "geo": {"$ne": None}}
tweets = list(db[COLLECTION].find(english_geotagged_filter))

### Adding an additional field

In [6]:
# Store the character count of each tweet's text under the "length" key.
for tweet in tweets:
    tweet["length"] = len(tweet["text"])

In [7]:
from nltk.tokenize import word_tokenize

# Store the number of tokens produced by word_tokenize under "word_number".
# NOTE(review): word_tokenize presumably emits punctuation as separate tokens,
# so this is a token count rather than a strict word count — confirm.
for tweet in tweets:
    tweet["word_number"] = len(word_tokenize(tweet["text"]))

In [9]:
# Flag each tweet with "hasUrl": True when its text contains an http(s) URL.
import re

# Raw string so that "\(" and "\)" reach the regex engine unchanged instead of
# being treated as (invalid) string escapes. The stray spaces that had crept
# into the character classes are removed: they made "[$-_@.&+]" match only
# when followed by a space and let a bare space count as part of a URL.
URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)

for tweet in tweets:
    # search() stops at the first hit; the matched URLs themselves are not needed.
    tweet["hasUrl"] = URL_PATTERN.search(tweet["text"]) is not None


In [20]:
!pip install shapely

[33mYou are using pip version 19.0.3, however version 19.1.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [24]:
import json

# Load the GeoJSON features used to map tweets to named areas.
# JSON text is UTF-8; reading it with an explicit encoding avoids decode
# errors on platforms whose default encoding is not UTF-8.
with open("amsterdam-geojson.json", encoding="utf-8") as geojson_file:
    features = json.load(geojson_file)["features"]

    

In [17]:
# Name of the area

from shapely.geometry import shape, Point


    
 
def find_location(tweet):
    """Return the name of the area a tweet belongs to, or None.

    Areas are taken from the module-level ``features`` list; each feature is
    read through its ``geometry`` and ``properties["name"]`` entries. A
    feature matches when the tweet's point lies inside the feature's geometry
    or when the feature's name equals the name of the tweet's ``place``.
    The first matching feature wins.

    Args:
        tweet: dict that may carry ``geo`` (with a ``coordinates`` pair) and
            ``place`` (with a ``name``); either may be missing or None.

    Returns:
        The matching area name, or None when the tweet has no geolocation
        information or no feature matches.
    """
    geo = tweet.get("geo")
    place = tweet.get("place")

    # Without geolocation information there is no area to report. Return None
    # (not the tweet itself) so callers can store the result as an area name.
    if geo is None and place is None:
        return None

    # If the tweet has coordinates, create a Point object.
    point = None
    if geo is not None:
        # "geo" holds [latitude, longitude]; Point takes (x, y), i.e.
        # (longitude, latitude), hence the swapped indices.
        point = Point(geo["coordinates"][1], geo["coordinates"][0])

    for feature in features:
        area_name = feature["properties"]["name"]

        # (1) The point is within the area.
        if point is not None and shape(feature["geometry"]).contains(point):
            return area_name

        # (2) The area name is the same as the one in the Twitter data.
        # Guarded because a tweet can have coordinates but no place.
        if place is not None and area_name == place["name"]:
            return area_name

    return None


In [18]:
# Store the result of find_location under each tweet's "area_name" key.
for tweet in tweets:
    tweet["area_name"] = find_location(tweet)

### Export to csv

In [19]:
import csv

# Write one CSV row per tweet: text, user, length, word count, URL flag,
# area name and the two "geo" coordinates. Only the first 10 tweets are
# exported and no header row is written.
# The encoding is set explicitly because tweet text can contain emoji and
# other non-ASCII characters that a platform-default encoding may reject.
with open("enriched_tweets.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)

    for row in tweets[:10]:
        coordinates = row["geo"]["coordinates"]
        writer.writerow([
            row["text"],
            row["user"],
            row["length"],
            row["word_number"],
            row["hasUrl"],
            row["area_name"],
            coordinates[0],
            coordinates[1],
        ])

In [15]:
# Display the first tweet document to inspect its fields.
tweets[0]

{'_id': ObjectId('5a65bac7b986443b94e809c8'),
 'created_at': 'Mon Jan 22 10:19:50 +0000 2018',
 'text': 'Rumpi no secret💋 @ Amsterdam, Netherlands https://t.co/ybQlxBrYw8',
 'source': '<a href="http://instagram.com" rel="nofollow">Instagram</a>',
 'truncated': False,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': -9223372036573926637,
 'in_reply_to_screen_name': None,
 'user': -8556263967823196492,
 'geo': {'type': 'Point', 'coordinates': [52.3731, 4.8922]},
 'coordinates': {'type': 'Point', 'coordinates': [4.8922, 52.3731]},
 'place': {'id': '99cdab25eddd6bce',
  'url': 'https://api.twitter.com/1.1/geo/id/99cdab25eddd6bce.json',
  'place_type': 'city',
  'name': 'Amsterdam',
  'full_name': 'Amsterdam, The Netherlands',
  'country_code': 'NL',
  'country': 'The Netherlands',
  'bounding_box': {'type': 'Polygon',
   'coordinates': [[[4.7289, 52.278227],
     [4.7289, 52.431229],
     [5.079207, 52.431229],
     [5.079207, 52.278227]]]},
  'attributes': {}},
 'contributors':