In [None]:
import pandas as pd
import time
from datetime import date, timedelta, datetime
import numpy as np
from scipy.stats import norm
import scipy.ndimage.filters
import re
import json
import matplotlib.pyplot as plt
import csv 
import os
import folium
import branca.colormap as cm
from folium.plugins import MarkerCluster
from folium import plugins
import ast
import math
import pickle
import sys

from helper import * # File containig some helper functions
from maps import *
from plots import *
from event_localization import *
from const import *

# [Link to interactive visualisation](https://nbviewer.jupyter.org/github/brunowicht/ada_project/blob/master/project/project.ipynb)

<div style="background-color:gray;padding:10px;margin:10px;color:white">
**Run cell below to load necessary data**
</div>

In [2]:
# Load the three artefacts produced by the pre-processing cells further down,
# so the analysis sections can run without recomputing them.
print("Loading df_tag...")
df_tag = pd.read_csv("../../twitter_dataset/cleaned_hashtag.csv", sep=';', encoding="utf-8", low_memory=False)

print("Loading group_hashtags...")
group_hashtags = pd.read_csv("../../twitter_dataset/hashtag_grouped.csv", sep=";", index_col=[0], usecols=[0, 1, 2])
# tweets_idx is stored in the csv as the textual form of a list; parse it back into a list
group_hashtags.tweets_idx = group_hashtags.tweets_idx.apply(ast.literal_eval)

print("Loading dic_tag_days...")
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("../../twitter_dataset/dic_tag_days.pickle", "rb") as pickle_in:
    dic_tag_days = pickle.load(pickle_in)
print("Done")

Definition of some constants relative to the dataset

# 0. Understanding the dataset

Before starting any kind of data analysis, we first have to review the content of our dataset and understand its meaning better. To do this, we use the data sample and the schema provided with the dataset.

## 0.1. Dataset structure
Let us first read the `schema.txt` file to understand what fields our dataset contains.

In [None]:
# Parse the schema description file. Fields are separated by four spaces
# (a multi-character delimiter, hence engine='python'); column names are supplied explicitly.
schema = pd.read_table('twitter-swisscom/schema.txt', delimiter='    ', engine='python',
                       names=['Field name', 'Type', 'Specification', 'Unsigned', 'Optional field', 'Format'] )
# Display the schema table
schema

The `schema.txt` file contains information about each field in our dataset. We see that there are 20 possible fields for a tweet, but many of them are optional, so most tweets probably have fewer specified fields than that.

Here are the fields that will be the most useful:
- userId: to know who posted the tweet
- createdAt: to know when the tweet was posted
- text: content of the tweet
- longitude and latitude: to know from where the tweet was posted


## 0.2. Dataset contents
Now, let us have a look at the actual content of our dataset using the provided sample.

In [None]:
# Read the tab-separated sample: column names come from the schema, quoting is disabled,
# backslash is the escape character and the literal 'N' is parsed as a missing value.
sample_df = pd.read_csv('twitter-swisscom/sample.tsv', encoding='utf-8', sep='\t', escapechar='\\', 
                        index_col='id', names=schema['Field name'], quoting=csv.QUOTE_NONE, na_values='N')
# Show the first rows
sample_df.head()

Let us check the percentage of `NaN` values for each field.

In [None]:
# Percentage of missing values per field, rendered as 'xx.xx %' strings
(100 * sample_df.isnull().sum() / sample_df.shape[0]).map('{0:.2f} %'.format)

We can observe that the `latitude` and `longitude` fields are quite often `NaN`, whereas the `placeLatitude` and `placeLongitude` fields are always specified. Therefore, we will use the latter fields.

Let us visualize the geographical distribution of the tweets in the sample dataset.

In [None]:
# Map centre passed to folium as [latitude, longitude]
swiss_coord = [46.8, 8.2]
swiss_map = folium.Map(swiss_coord, zoom_start=8)
# Use the same column mapping as the cleaning step
# (placeLongitude -> Longitude, placeLatitude -> Latitude) so markers are not transposed.
add_markers_to_map_(
    sample_df.rename(columns={"placeLongitude": "Longitude", "placeLatitude": "Latitude"}),
    swiss_map,
    False,
)

In [None]:
# Display the map built in the previous cell
swiss_map

# 1. Data cleaning and pre-processing

In [None]:
# Columns retained from the raw dataset before any computation; every other field is dropped
keep_col = ['id', 'userId', 'createdAt', 'text', 'placeLongitude', 'placeLatitude']

Here is the script that reads the whole dataset, extracts the hashtags and identifications from the text, and stores everything in a dataframe.  
The function `get_hashtags(text)` takes a text and extracts its hashtags (all hashtags are lowercased, because the case does not add any information).

In [None]:
# Read the raw tweets, then in one chain:
#   1. keep only the columns of interest,
#   2. extract the hashtags from the text into a 'tag' column,
#   3. drop the text, which is not used afterwards,
#   4. rename longitude and latitude for easier future usage.
df = (
    pd.read_csv('twitter-swisscom/sample.tsv', encoding='utf-8', sep='\t', escapechar='\\',
                names=schema['Field name'], quoting=csv.QUOTE_NONE, na_values='N')
    [keep_col]
    .assign(tag=lambda frame: frame['text'].apply(get_hashtags))
    .drop(columns=['text'])
    .rename(columns={'placeLongitude': 'Longitude', 'placeLatitude': 'Latitude'})
)
df.head()

In [None]:
# Percentage of missing values per column, rendered as 'xx.xx %' strings
(100 * df.isnull().sum() / df.shape[0]).map('{0:.2f} %'.format)

To be able to easily group by day, month or year, we add those three columns to our dataframe. We can then drop the `createdAt` column, which also contains the time of the tweet (with second precision) that we do not need. We also set the tweet id as the unique index of the tweets, so that we can retrieve information from the original dataset if needed.

In [None]:
# Derive the grouping keys from the leading characters of createdAt
# (presumably formatted 'YYYY-MM-DD hh:mm:ss' -- confirm against the schema).
# The vectorised .str accessor propagates missing values instead of raising on NaN.
df = (
    df.assign(
        day=df.createdAt.str[:10],
        month=df.createdAt.str[:7],
        year=df.createdAt.str[:4],
    )
    # createdAt is no longer needed once the three keys exist
    .drop(columns=['createdAt'])
    # the tweet id becomes the index so tweets can be traced back to the original dataset
    .set_index('id')
)

We can then store the resulting dataframe in a new csv file, which is around 3 times smaller than the original one. We can then filter the tweets that contain at least one hashtag and those that contain at least one identification to create sub-datasets, as we want to work with hashtags and identifications.

In [None]:
# Persist the cleaned dataframe as a ';'-separated csv
df.to_csv('../../twitter_dataset/cleaned.csv', sep=';', encoding='utf-8')

In [None]:
# Keep only tweets whose tag value does not print as an empty list, then persist that subset
df_tag = df[df["tag"].astype(str).ne('[]')]
df_tag.to_csv('../../twitter_dataset/cleaned_hashtag.csv', sep=';', encoding='utf-8')

# 2. Data Manipulation
## 2.1. Grouping by hashtag

The first step of our descriptive data analysis is to group all tweets by hashtag. In other words, we want a dataframe containing, for each hashtag, the indices of the tweets in which it appears. This will allow us to find the most popular hashtags, and later aggregate them by time.

First, we retrieve our cleaned dataset containing tweets with hashtags by loading it from the file we saved in the previous step. This allows us to avoid unnecessary computations.

In [None]:
# Reload the hashtag subset with the same options as the loading cell at the top of the notebook,
# so both code paths infer identical column dtypes.
df_tag = pd.read_csv("../../twitter_dataset/cleaned_hashtag.csv", sep=';', encoding="utf-8", low_memory=False)

Then we can implement the main function that will group our data by hashtag as described above.

For the sake of clarity, the following functions were moved in the `helper.py` file:
- `get_hashtags(text, lowercase=True)` : Returns the list of all hashtags present in the given text.
- `get_index_with_hashtag(df, hashtag)` : Returns the indices of the tweets in which the given hashtag appears.

In [None]:
def group_by_hashtag(df):
    """For each hashtag, give the indices of the tweets in which it appears.

    df: dataframe to use for the grouping. It must have a 'tag' column (passed to
        get_hashtags with lowercase=False) and a 'userId' column; its index labels
        are used as tweet indices.

    Returns a dataframe with one row per hashtag, in order of first appearance, with columns:
    - 'hashtag': the hashtag
    - 'tweets_idx': sorted list of the unique indices of the tweets containing it
    - 'nb_unique_authors': number of distinct userId values among those tweets
    An empty input gives an empty dataframe with the same three columns.
    """
    # Collect, for each hashtag, the tweet indices and the author ids
    tweets_by_tag = {}
    authors_by_tag = {}
    for index, tag_text, user_id in zip(df.index, df['tag'], df['userId']):
        for tag in get_hashtags(tag_text, lowercase=False):
            tweets_by_tag.setdefault(tag, []).append(index)
            authors_by_tag.setdefault(tag, []).append(user_id)

    # tolist() yields native Python scalars, so the list still parses with
    # ast.literal_eval after a round-trip through the csv file.
    records = [
        (tag, np.unique(np.array(indices)).tolist(), len(np.unique(np.array(authors_by_tag[tag]))))
        for tag, indices in tweets_by_tag.items()
    ]

    # Explicit columns keep the output shape well defined even when no hashtag was found
    return pd.DataFrame(records, columns=['hashtag', 'tweets_idx', 'nb_unique_authors'])

Then we can actually call this function on our dataset. We store the result in a csv file so that we won't have to compute it again.

In [None]:
# Compute the groups by hashtag
group_hashtags = group_by_hashtag(df_tag)
# Use the hashtag as the index of the grouped dataframe
group_hashtags = group_hashtags.set_index("hashtag")

# Save result to csv file (the index plus the two listed columns)
group_hashtags.to_csv("../../twitter_dataset/hashtag_grouped.csv", sep=";", encoding="utf-8", columns=["tweets_idx", "nb_unique_authors"])

In [None]:
# Reload the hashtag groups saved above; the first csv column becomes the index
group_hashtags = pd.read_csv("../../twitter_dataset/hashtag_grouped.csv", sep=";", index_col=[0], usecols=[0, 1, 2])
# tweets_idx was written as text; turn each cell back into a list
group_hashtags.tweets_idx = group_hashtags.tweets_idx.apply(ast.literal_eval)
group_hashtags.head()

We now have for each hashtag of our dataset the indices of the tweets in which it appears. With that data, we can for example see what hashtags were tweeted the most.

In [None]:
# Number of tweets in which each hashtag occurs
group_hashtags['tweets_nb'] = group_hashtags['tweets_idx'].apply(len)

# Show the 10 most popular hashtags, without the bulky list of tweet indices
group_hashtags.drop(columns=['tweets_idx']).sort_values('tweets_nb', ascending=False).head(10)

# 3. Data Analysis and Visualization

## 3.1. Visualizing hashtag frequency
We want to be able to determine if a certain hashtag has a spike of popularity at a certain time. To do that, we visualize the number of tweets containing a given hashtag per unit of time (day, month, or year)

For the sake of clarity, the following functions were moved in the `helper.py` file:
- `search_hashtag(hashtag, df)` : Filter the given dataset to keep only elements that contain the given hashtag.
- `plot_frequency_tags(df, col, hashtag, n)` : Display a bar plot of the number of tweets with the given hashtag per unit of time given in 'col' (day, month or year) .

For example, let us take a look at the tweets with the hashtag '#jesuischarlie'.

In [None]:
# Select the tweets for '#jesuischarlie' through the project helper and preview the first ten rows
df_Charlie = search_hashtag("#jesuischarlie", df_tag, group_hashtags)
df_Charlie.head(10)

In [None]:
# Frequency plot per day for '#charliehebdo'
# NOTE(review): the cell above queries '#jesuischarlie' -- confirm the different hashtag is intended
plot_frequency_tags(df_tag, 'day', "#charliehebdo", 100, group_hashtags)

Let us now try this for events that take place every year at the same period, for example Eurovision and Paléo Festival.

In [None]:
# Frequency plot per month for '#eurovision'
plot_frequency_tags(df_tag, 'month', "#eurovision", 30, group_hashtags)

For Eurovision, we can clearly see a spike each year during the month of May, which is indeed when the contest takes place.

In [None]:
# Frequency plot per day for '#paleo'
plot_frequency_tags(df_tag, 'day', "#paleo", 100, group_hashtags)

For Paléo Festival, we can again see a spike each year at the end of July, but this time we can also see that it lasts about a week.

## 3.2 Geographic event localisation

Here we can use the geographic information of the tweets to determine the location of an event. To do so, we display on a map the geolocation of each tweet that mentions a given hashtag. Helper functions were implemented to create this map.

Now let us try this with Eurovision and Paléo Festival, which were already used as examples previously. As we can see in the maps below, Eurovision is an international event, since the geographic distribution of its tweets is quite homogeneous over Switzerland. On the other hand, if we look at the hashtag '#paleo', we can quickly see that it takes place in Nyon, because there is a very high concentration of tweets about Paléo in that area.

In [None]:
# Map of the tweets mentioning '#eurovision' (project helper)
get_map_with_hasthtag("#eurovision", df_tag, group_hashtags)

In [None]:
# Map of the tweets mentioning '#paleo' (project helper)
get_map_with_hasthtag("#paleo", df_tag, group_hashtags)

# 4. Event detection

## 1. Filtering out irrelevant hashtags

Currently, we have far too many hashtags, and it would be infeasible to run our event detection algorithm on every single one of them. Thus we have to filter out hashtags that are very unlikely to be detected as events by our algorithm.

In [None]:
# Number of distinct hashtags before any filtering
print("Total number of hashtags : {}".format(len(group_hashtags)))

First, we filter our hashtags by keeping only the ones that are used in more than **100** tweets. Then we remove hashtags tweeted by **50** or fewer unique users. By filtering with respect to the number of unique users instead of the total number of tweets, we discard hashtags that were tweeted many times by only a small number of people. This could be the case if one or a few Twitter bots tweet some hashtag an insane number of times in a short period.

In [None]:
def get_max_daily_unique_authors(hashtag_dic_days):
    """Returns the largest per-day count stored for one hashtag.

    hashtag_dic_days: per-day counts for a single hashtag, either a dict
                      (day -> count) or a pandas Series such as the output of
                      value_counts(). May be None (hashtag unknown) or empty.

    Returns 0 when there is no data for the hashtag.
    """
    if hashtag_dic_days is None or len(hashtag_dic_days) == 0:
        return 0

    # dict exposes .values() as a method, a pandas Series exposes .values as an array
    counts = hashtag_dic_days.values
    if callable(counts):
        counts = counts()
    return max(counts)

In [None]:
# Keep hashtags used in more than 100 tweets and tweeted by more than 50 unique authors
group_df_filtered = group_hashtags.copy().query('tweets_nb > 100 and nb_unique_authors > 50')

# Highest number of unique authors seen on a single day, per hashtag
group_df_filtered['max_unique_author_day'] = [
    get_max_daily_unique_authors(dic_tag_days.get(hashtag))
    for hashtag in group_df_filtered.index
]
# NOTE(review): this count is neither stored nor displayed (only the last expression of a cell is shown)
group_df_filtered[group_df_filtered['max_unique_author_day'] > 5].shape[0]

# Number of hashtags remaining after filtering
group_df_filtered.shape[0]

In [None]:
# Hashtags that passed the filters above (index values of group_df_filtered),
# to be used for the event detection.
tag_list = group_df_filtered.index.values
tag_list

In [None]:
def get_unique_author_per_frequency(df, frequency, df_tag):
    """
        Returns a dictionary mapping each hashtag to a pandas Series (result of
        value_counts) whose index holds the dates and whose values are the number
        of distinct users that tweeted this hashtag on that date.

        df: dataframe indexed by hashtag with a 'tweets_idx' column holding, for each
            hashtag, the positional indices of its tweets in df_tag (e.g. group_df_filtered)
        frequency: name of the date column of df_tag to count over,
                   one of the following values : {"day", "month", "year"}
        df_tag: dataframe of tweets with a 'userId' column and the `frequency` column

        Progress is written to stdout as a percentage.
    """

    print('Computing dictionnary...')
    dic_tag_days = {}
    nb_tags = df.shape[0]

    for row_nb, (tag, row) in enumerate(df.iterrows(), start=1):
        # tweets_idx holds positions, hence iloc
        tagged_tweets = df_tag.iloc[row['tweets_idx']]

        # Count each user at most once per period. `keep` is passed by keyword
        # because recent pandas versions no longer accept it positionally.
        one_per_author = tagged_tweets.drop_duplicates(subset=["userId", frequency], keep="first")
        dic_tag_days[tag] = one_per_author[frequency].value_counts()

        sys.stdout.write("\r{0:.2f}%".format((float(row_nb)/nb_tags)*100))
        sys.stdout.flush()

    return dic_tag_days

In [None]:
# Compute the dictionary of unique authors per day for each filtered hashtag.
# The function defined in this notebook is get_unique_author_per_frequency;
# the daily variant is obtained by passing "day".
dic_tag_days = get_unique_author_per_frequency(group_df_filtered, "day", df_tag)

# Save the dictionary to file; the context manager closes the handle even if dumping fails
print('\nSaving dictionnary to file...')
with open("../../twitter_dataset/dic_tag_days.pickle", "wb") as pickle_out:
    pickle.dump(dic_tag_days, pickle_out)

In [None]:
# Load the dictionary of hashtag occurrences per day from file.
# The context manager closes the handle even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("../../twitter_dataset/dic_tag_days.pickle", "rb") as pickle_in:
    dic_tag_days = pickle.load(pickle_in)

## 2. Event detection algorithm

In [None]:
# Occurrences of '#facebook' between 2014-05-15 and 2014-08-01 (project plotting helper)
plot_hashtag_occurence_for_dates('#facebook', dic_tag_days, date(2014, 5, 15), date(2014, 8, 1))

In [3]:
def get_average(hashtag, dic_tag_days, start_date, end_date):
    """Mean number of unique authors per day for `hashtag` over [start_date, end_date].

    Both bounds are included; days without an entry count as 0.
    """
    span = (end_date - start_date).days + 1

    daily_counts = []
    for offset in range(span):
        # Entries are keyed by the string form of the date
        day_key = str(start_date + timedelta(days=offset))
        daily_counts.append(dic_tag_days.get(hashtag).get(day_key, 0))

    return np.mean(daily_counts)

def get_event_score(day, dic_hashtag, moving_avg, event_kernel, max_factor=2):
    """Returns the event score for a given day.

       The score is the inner product between the kernel and the numbers of unique authors
       on the days centred on `day`, divided by the moving average.

       day: date to score
       dic_hashtag: per-day counts for one hashtag, keyed by the string form of the date
       moving_avg: average count around that day; values below 1 are replaced by 1
       event_kernel: weights applied to the days around `day`; indexed from -h to +h with
                     h = len(event_kernel) // 2, so it needs an odd length
       max_factor: each neighbouring count is capped at max_factor times the count of `day`

       Returns 0 when there is no tweet on `day`.
    """
    # Get the number of unique authors for the given day
    day_value = dic_hashtag.get(str(day), 0)

    # If no tweets that day, the score is 0
    if day_value == 0:
        return 0

    # Neighbouring days may not contribute more than this value
    cap = max_factor * day_value

    # Inner product of the capped counts with the kernel centred on the day of interest
    h = len(event_kernel) // 2
    event_inner_prod = 0
    for i in range(-h, h + 1):
        value_i = min(cap, dic_hashtag.get(str(day + timedelta(i)), 0))
        event_inner_prod += event_kernel[i + h] * value_i

    # Divide by the moving average (at least 1, to avoid inflating scores of rare hashtags)
    return event_inner_prod / max(1, moving_avg)

def my_detect_event(hashtag, dic_tag_days, threshold=4, averaging_window=35):
    """Detect potential events related to the given hashtag. 
       Returns the detected event dates, and an array of event scores for each day in the dataset.

       hashtag: hashtag to analyse; it is looked up in dic_tag_days
       dic_tag_days: dictionary mapping each hashtag to its per-day counts
       threshold: a day is reported as an event when its score is >= threshold
       averaging_window: number of days used for the moving average

       The returned tuple is (event_dates, event_score): event_dates is a list of date
       strings, event_score has one entry per day of the dataset. Days closer than
       averaging_window // 2 to either end of the dataset keep a score of 0.
    """
    # Weights given to the previous day, the day itself and the next day
    event_kernel = [0.1, 0.8, 0.1]
    dic_hashtag = dic_tag_days.get(hashtag)
    
    # Create the array containing every day included in the dataset
    dates = np.asarray([DATASET_BEGIN_DATE + timedelta(days=i) for i in range(DATASET_TOTAL_DAYS)])
    
    # Compute the average over the first moving window (the first averaging_window days)
    moving_avg_window_end = DATASET_BEGIN_DATE + timedelta(days=averaging_window - 1)
    moving_avg = get_average(hashtag, dic_tag_days, DATASET_BEGIN_DATE, moving_avg_window_end)
    half_window = int(averaging_window / 2)
        
    # Compute the event score for each day
    event_score = np.zeros(len(dates)) 
    for i in range(half_window, len(dates) - half_window):
        # Slide the window by one day: add the day entering it, remove the day leaving it.
        # The first position already has its average from get_average above.
        # NOTE(review): with an even averaging_window the initial window (averaging_window days)
        # and the centred window (2 * half_window + 1 days) do not have the same length.
        if i != half_window:
            moving_avg_update = (dic_hashtag.get(str(dates[i+half_window]),0) - dic_hashtag.get(str(dates[i-half_window-1]),0)) / averaging_window
            moving_avg = moving_avg + moving_avg_update
        
        event_score[i] = get_event_score(dates[i], dic_hashtag, moving_avg, event_kernel)
        
    # Get the dates at which an event was detected
    event_dates = dates[np.where(event_score >= threshold)]
    event_dates = [str(d) for d in event_dates]
    
    return (event_dates, event_score)

def my_detect_events(dic_tag_days, threshold=4, averaging_window=35):
    """Run my_detect_event on every hashtag of dic_tag_days.

    Returns a dictionary mapping each hashtag with at least one detected event
    to its list of event dates. Progress is written to stdout as a percentage.
    """
    total = len(dic_tag_days.keys())
    print("Detecting events...")

    event_dic = {}
    for position, tag in enumerate(dic_tag_days.keys(), start=1):
        # Only the dates are needed here; the per-day scores are discarded
        dates_found, _ = my_detect_event(tag, dic_tag_days, threshold=threshold, averaging_window=averaging_window)
        if dates_found:
            event_dic[tag] = dates_found

        sys.stdout.write("\r{0:.2f}%".format((float(position)/total)*100))
        sys.stdout.flush()

    print("\n")
    return event_dic

In [4]:
# Detect events for every hashtag with the default threshold and averaging window
event_dic = my_detect_events(dic_tag_days)

print("Events were found for {} hashtags out of {}".format(len(event_dic), len(dic_tag_days)))

Detecting events...
100.00%

Events were found for 2108 hashtags out of 6197


In [7]:
# Display the detected event dates per hashtag
# NOTE(review): this prints the whole dictionary; consider showing only a sample
event_dic

{'#ausopen': ['2013-01-20',
  '2014-01-23',
  '2014-01-26',
  '2015-01-25',
  '2016-01-28',
  '2016-01-30',
  '2016-01-31'],
 '#blatter': ['2014-06-12',
  '2015-05-29',
  '2015-06-02',
  '2015-06-03',
  '2015-12-21'],
 '#happyhalloween': ['2012-10-31',
  '2013-10-31',
  '2014-10-31',
  '2014-11-01',
  '2015-10-31'],
 '#wef16': ['2016-01-20', '2016-01-21', '2016-01-22'],
 '#geralg': ['2014-06-30'],
 '#not': ['2014-01-02'],
 '#vin': ['2014-03-22'],
 '#noexcuses': ['2014-01-02'],
 '#mega': ['2013-01-20'],
 '#storytelling': ['2016-03-08'],
 '#honsui': ['2014-06-25'],
 '#white': ['2012-12-14',
  '2013-06-15',
  '2014-12-27',
  '2014-12-28',
  '2015-01-26'],
 '#bugatti': ['2016-02-29', '2016-03-01'],
 '#nedmex': ['2014-06-29'],
 '#fcbayern': ['2013-04-23', '2013-05-25', '2016-03-16'],
 '#doctorwho': ['2013-11-23', '2013-12-25', '2014-08-23'],
 '#exhibition': ['2016-01-20'],
 '#tourbillon': ['2016-03-20'],
 '#icehockey': ['2013-05-18'],
 '#parisagreement': ['2015-12-12', '2015-12-13', '2016-0

In [None]:
# Detect events for '#christmas' and plot occurrences and scores for 2015-11-01 to 2015-12-31
hashtag = "#christmas"
threshold = 4
(event_dates, event_score) = my_detect_event(hashtag, dic_tag_days, threshold=threshold)
plot_hashtag_and_event_score(hashtag, event_score, dic_tag_days, date(2015, 11, 1), date(2015, 12, 31), threshold=threshold)
# Dates whose score reached the threshold
event_dates

In [None]:
# Detect events for '#jesuischarlie' with a stricter threshold and plot 2015-01-01 to 2015-01-30
hashtag = "#jesuischarlie"
threshold = 6
(event_dates, event_score) = my_detect_event(hashtag, dic_tag_days, threshold=threshold)
plot_hashtag_and_event_score(hashtag, event_score, dic_tag_days, date(2015, 1, 1), date(2015, 1, 30), threshold=threshold)
# Dates whose score reached the threshold
event_dates

In [None]:
# Detect events for '#paleo' and plot occurrences and scores for 2014-07-10 to 2014-07-31
hashtag = "#paleo"
threshold = 4
(event_dates, event_score) = my_detect_event(hashtag, dic_tag_days, threshold=threshold)
plot_hashtag_and_event_score(hashtag, event_score, dic_tag_days, date(2014, 7, 10), date(2014, 7, 31), threshold=threshold)
# Dates whose score reached the threshold
event_dates

## 3. Event localization

In [None]:
# Map the '#paleo' tweets posted from 19 to 23 July 2016
hashtag = '#paleo'
dates = [date(2016, 7, day_of_month) for day_of_month in range(19, 24)]
display_event_map(hashtag, dates, df_tag, group_hashtags)

In [None]:
# Map the '#riprobinwilliams' tweets posted on 2014-08-12
hashtag = '#riprobinwilliams'
dates = [date(2014, 8, 12)]
display_event_map(hashtag, dates, df_tag, group_hashtags)

In [8]:
# Try to localise every detected event (project helper)
events_locations = get_events_locations(event_dic, df_tag, group_hashtags)

print("Locations were found for {} events out of {}".format(len(events_locations), len(event_dic)))

Computing events locations...
100.00%

Locations were found for 238 events out of 2108


In [9]:
# Display the location found for each event; judging by the output below, each value is a
# (coordinates array, deviation) pair
events_locations

{'#adele': (array([ 47.3774 ,   8.53676]), 0.21694693877551025),
 '#ai': (array([ 46.2048 ,   6.14319]), 0.09013428571428575),
 '#air14': (array([ 46.8213 ,   6.93576]), 0.142516462585034),
 '#amg': (array([ 46.2363,   6.1185]), 0.11226272727272719),
 '#amr': (array([ 46.2048 ,   6.14319]), 0.0),
 '#anges5': (array([ 46.2088 ,   6.09362]), 0.1902239999999992),
 '#annecy': (array([ 45.8497 ,   6.10093]), 0.018042000000000013),
 '#annecyfestival': (array([ 45.8497 ,   6.10093]), 0.012216972477064479),
 '#artbasel': (array([ 47.5596 ,   7.58314]), 0.046438876146789015),
 '#artbasel2015': (array([ 47.5596 ,   7.58314]), 0.028425153374233087),
 '#artbasel2016': (array([ 47.5596 ,   7.58314]), 0.0074903999999999908),
 '#artist': (array([ 47.5596 ,   7.58314]), 0.027513636363636351),
 '#astonmartin': (array([ 46.2363,   6.1185]), 0.012381818181818393),
 '#audi': (array([ 46.2363,   6.1185]), 0.083039253731343202),
 '#avoriaz': (array([ 46.1381,   6.6796]), 0.040759999999999991),
 '#basel': (a

In [None]:
# Event dates detected for '#papafrancesco'
event_dic['#papafrancesco']

In [10]:
# Show the dates detected for '#wef14', then map the tweets posted on those dates
hashtag = '#wef14'
print(event_dic[hashtag])
# event_dic stores dates as 'YYYY-MM-DD' strings; convert them back to date objects
dates = [datetime.strptime(d, "%Y-%m-%d").date() for d in event_dic[hashtag]]
display_event_map(hashtag, dates, df_tag, group_hashtags)

['2014-01-22', '2014-01-23', '2014-01-24', '2014-01-25']
Detected event location : [ 46.765     9.81838]
Mean deviation : 0.13223074148296604
