# Twitter-Swisscom Project

## EVENT DETECTION 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from utils import *
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
from scipy.cluster.hierarchy import ward, dendrogram,fcluster

from sklearn.cluster import DBSCAN
from sklearn import metrics

import datashader as ds
import datashader.transfer_functions as tf

In [None]:
col_event_split = ['id','userId', 'createdAt', 'text', 'longitude', 'latitude', 'placeId', 'inReplyTo', 'placeLatitude', 'placeLongitude']

In [None]:
parse_dates = ['createdAt']

In [None]:
tweets = pd.read_csv('../twitter-swisscom/twex_event_corrected.tsv', sep="\t", encoding='utf-8', escapechar='\\', names=col_event_split, parse_dates=parse_dates, na_values='N', header=None)

In [None]:
tweets.head()

We will not use the 'placeId' and 'inReplyTo' fields for our detection, so we drop them now.

In [None]:
tweets.drop(['placeId', 'inReplyTo'], inplace=True, axis=1)

Two kinds of position information are given: the 'longitude'/'latitude' columns hold the position of the tweet as reported by the user or client application, while 'placeLongitude'/'placeLatitude' indicate that the tweet is associated with a place. As we can see in the head of the table, the place is not always set.

We decided to use the longitude/latitude columns to represent the position of a tweet and, if they are null, to fall back on placeLatitude and placeLongitude. If both are null we have to drop the entry, as a tweet without a position is not useful for event detection.

In [None]:
tweets = tweets.apply(replace_position, axis=1)

In [None]:
tweets.head()

We can now drop the columns 'placeLongitude' and 'placeLatitude' as they don't give us any more information.

In [None]:
tweets.drop(['placeLatitude', 'placeLongitude'],inplace=True, axis=1)

Then we drop the NaN values in Longitude and Latitude columns as we need a position to detect event.

In [None]:
# Drop the tweets that still have no coordinates and report how many rows
# were lost in the process.
len_before = len(tweets.index)
tweets = tweets.dropna(subset=['longitude', 'latitude'])
len_after = len(tweets.index)
print("Number of tweets before dropping the one without position : ", len_before)
# This line reports the count measured after the drop, so it is labelled "after".
print("Number of tweets after dropping the one without position : ", len_after)
print("Percentage of tweets lost : ", ((len_before - len_after)/len_before)*100)

As we also base our event detection on the text field, we don't want NaN values in it, so we drop those rows too.

In [None]:
# Drop the tweets without text and report how many rows were lost.
len_before = len(tweets.index)
tweets = tweets.dropna(subset=['text'])
len_after = len(tweets.index)
print("Number of tweets before dropping the one without text : ", len_before)
# This line reports the count measured after the drop, so it is labelled "after".
print("Number of tweets after dropping the one without text : ", len_after)
print("Percentage of tweets lost : ", ((len_before - len_after)/len_before)*100)

We decided to detect events by their day of occurrence, so we create a new column that holds the day of each tweet.

In [None]:
tweets['day'] = pd.DatetimeIndex(tweets['createdAt']).normalize()

We decide to remove the stopwords from the tweets' text to keep only words that can describe an event.

In [None]:
# Tokens that text_process removes from the tweets: stop words of four
# languages, single punctuation characters, a few filler words and emoticons.
stop_words = stopwords.words('english')
stop_words += stopwords.words('french')
stop_words += stopwords.words('german')
stop_words += stopwords.words('italian')
# `list += str` extends the list with each individual punctuation character.
stop_words += string.punctuation
stop_words += ['—','/via','via', 'follow', 'please', 'i\'m', '^_^', ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
':c', ':{', '>:\\', ';(', ':-)', ':)', ';)','[=o)]', ';-)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
'=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
'<3']
# text_process looks tokens up as `word.lower()`, so an entry containing an
# upper-case letter (':D', 'XD', ':P', ...) could never match. Store every
# entry lower-cased (and de-duplicated); the result is still a list.
stop_words = sorted({token.lower() for token in stop_words})

In [None]:
def text_process(row):
    """Clean the 'text' field of one tweet row and replace it with a token list.

    URLs ("http..."), then "@ name" and "@name" mentions, are stripped from
    the text. The remainder is split on whitespace, and every token whose
    lower-cased form appears in `stop_words` is discarded. The surviving
    tokens are stored back in row['text'] and the row is returned.
    """
    cleaned = row['text']
    # The order matters: the "@ name" form is removed before the "@name" form.
    for pattern in (r"http\S+", r"@ \S+", r"@\S+"):
        cleaned = re.sub(pattern, "", cleaned)

    kept = []
    for token in cleaned.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)

    row['text'] = kept
    return row

We remove the stopwords but we also remove the URLs and the @ mentions as they are not useful to detect events

In [None]:
tweets = tweets.apply(text_process, axis=1)
tweets.head()

Now that the text is pretty much clean we want to get the hashtags from the tweets because they are really helpful to detect the events. So we create a column with the hashtags :

In [None]:
tweets['hashtags'] = tweets['text'].apply(find_hashtags)

And in the "text" fields we don't want to have '#' anymore.

In [None]:
tweets['text'] = tweets['text'].apply(remove_hashtags)

We save the current dataframe so that we don't need to re-run the preprocessing every time, as it takes a lot of time.

In [None]:
tweets.to_csv("../twitter-swisscom/twex_event_tweets_processed.csv", sep=',', encoding='utf-8', index=False)

## EVENT  DETECTION

Now that our dataframe is clean we want to find the event hidden in all those tweets. Here are our assumptions to find an event :
- An event is described by the hashtags it contains
- An event takes place at a certain place described by longitude/latitude (we took a radius of 10km)
- An event occurs during a certain  time (we decided to find them day by day)
- An event has a minimum number of people talking about it (we took 3) and a minimum of tweets (we took 8)

We load the csv file containing our tweets preprocessed.

In [None]:
tweets = pd.read_csv("../twitter-swisscom/twex_event_tweets_processed.csv", sep=',', encoding='utf-8')

In [None]:
tweets.head()

As we want to detect events day by day, we have to go through all the days in our data and detect the events for each of them.

In [None]:
days = np.unique(tweets['day'].values)
print(len(days))

### Detection on one day

Before applying the detection on the whole data, we want to do the detection on a specific day so that the pipeline is easy to understand.

In [None]:
from random import randint
# randint() includes both endpoints, so the upper bound has to be the last
# valid position in `days`; using len(days) could raise an IndexError below.
index = randint(0, len(days) - 1)

# All the tweets of the randomly chosen day.
day = tweets[tweets['day']==days[index]]
day.head()

We set the radius where we want to find the related tweets

In [None]:
# Search radius in metres.
meters = 10000
# Radius handed to DBSCAN as `eps`; the clustering runs on latitude/longitude values.
# NOTE(review): dividing by 100000 appears to treat one degree as roughly
# 100 km, which is only an approximation — confirm it is precise enough.
eps = meters / 100000

We use [DBSCAN](http://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html) to detect clusters of tweets that are close to each other, and we use this cluster assignment to detect events.

In [None]:
# Coordinates of the day's tweets as an (n_tweets, 2) array, columns in the
# order (latitude, longitude). DataFrame.as_matrix() was removed in pandas 1.0;
# selecting the columns and taking `.values` yields the same array.
X = day[['latitude', 'longitude']].values
db = DBSCAN(eps=eps, min_samples=5).fit(X)
# Boolean mask that is True for the samples DBSCAN reports as core samples.
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
# Cluster label of every tweet.
labels = db.labels_

In [None]:
day = day.assign(cluster=db.labels_)

In [None]:
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
print('Estimated number of clusters: %d' % n_clusters_)

We plot the positions of the clusters for this given day.

In [None]:
# Scatter plot of the day's tweets, one colour per cluster label.
unique_labels = set(labels)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Label -1 (not counted as a cluster above) is drawn in black.
        col = 'k'

    class_member_mask = (labels == k)

    # X holds (latitude, longitude) columns; longitude goes on the x axis and
    # latitude on the y axis so that the plot is oriented like a map.
    # Core samples are drawn with large markers.
    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 1], xy[:, 0], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)

    # Remaining members of the label are drawn with small markers.
    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 1], xy[:, 0], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

plt.xlabel('longitude')
plt.ylabel('latitude')
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

## Create the dataframe containing the events

We go through the clusters and find out if the corresponding tweets are linked by an event or not. To do so we first of all create a dataframe that will contain the events. (Name, date, keywords, hashtags, longitude, latitude, #of tweets, #of people, tweetids)

In [None]:
df_event = pd.DataFrame(columns=['name', 'date', 'keywords', 'hashtags', 'longitude', 'latitude', '# of tweets', '# of people', 'tweetids'])

Now we want to populate our dataframe with real events. So we go through all the clusters, find their hashtags and keywords, and derive the corresponding event from these two pieces of information. We use the same pipeline as above to detect the clusters.

In [None]:
# Cluster the tweets of every day by position and append one row to df_event
# for each cluster whose hashtags pass the `useful_` filter.
for day in days:
    day_df = tweets[tweets['day'] == day]

    # DataFrame.as_matrix() was removed in pandas 1.0; selecting the columns
    # and taking `.values` yields the same (latitude, longitude) array.
    X = day_df[['latitude', 'longitude']].values
    db = DBSCAN(eps=eps, min_samples=5).fit(X)
    labels = db.labels_

    day_df = day_df.assign(cluster=labels)

    # Label -1 is not counted as a cluster.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

    # Cluster labels run from 0 to n_clusters_ - 1 inclusive; the previous
    # range(0, n_clusters_-1) silently skipped the last cluster of each day.
    for cluster in range(n_clusters_):
        # Filter the cluster's tweets once and reuse the result.
        cluster_df = day_df[day_df['cluster'] == cluster]
        ntweets = len(cluster_df)
        npeople = len(np.unique(cluster_df.userId))
        ids = list(cluster_df.id.values)
        d_hashtags = dictionnary_from_hashtags(day_df, cluster)
        d_keywords = dictionnary_from_keywords(day_df, cluster)

        d_hashtags_detection = useful_(d_hashtags, 1/3, ntweets)
        d_keywords_detection = useful_(d_keywords, 1/4, ntweets)

        # No retained hashtag: this cluster is not recorded as an event.
        if not any(d_hashtags_detection):
            continue

        position = find_position(day_df, cluster, ntweets)
        hashtags = list(d_hashtags_detection.keys())
        # The event name is the retained hashtags separated by single spaces.
        name = " ".join(hashtags)
        df_event.loc[len(df_event)] = [name, day, list(d_keywords_detection.keys()), hashtags, position[0], position[1], ntweets, npeople, ids]
        print(" ------- Event added -------")

In [None]:
len_event_no_filter = len(df_event)
print("Number of event without filtering the spam neither the event not in the scope : ", len_event_no_filter)

We want to avoid having spam events in our event dataframe. Here are the 3 assumptions we made to filter them out :
- (number of tweets) / (number of people) has to be less than 6
- number of people involved needs to be greater or equal to 3.
- number of tweets per event needs to be greater or equal to 8

In [None]:
# Spam filter: keep only the events with fewer than 6 tweets per distinct user on average.
df_event = df_event[df_event["# of tweets"]/df_event["# of people"] < 6]

In [None]:
# Keep only the events with at least 3 distinct users and at least 8 tweets.
df_event = df_event[(df_event["# of people"] >= 3) & (df_event["# of tweets"] >= 8)]

In [None]:
len_event_no_spam = len(df_event)
print("Number of event after filtering the spam : ", len_event_no_spam)

Our dataset should be containing tweets between 2010 and 2016 but we still want to make sure we don't have outliers so we filter it.

In [None]:
df_event['date'] = pd.DatetimeIndex(df_event['date']).normalize()

In [None]:
df_event = df_event[(df_event.date >= "2010-01-01") & (df_event.date <= "2016-12-31")]

We save our dataframe to find the countries linked with each event.

In [None]:
df_event.to_csv("../twitter-swisscom/twex_event_no_country.csv", sep=',', encoding='utf-8', index=False)

Here is the pipeline we used to find the country given the longitude/latitude of an event :
- Install OSGeo4W : https://trac.osgeo.org/osgeo4w/wiki
- Open the OSGeo4W shell
- Go to the directory where our script is (/country-detection)
- install pandas : pip install pandas
- run the script : python find_country.py



(in our script we use these two tools : https://github.com/che0/countries)

We then reload the dataframe with the countries and remove the events that are not in Switzerland.

In [None]:
df_event = pd.read_csv("../twitter-swisscom/twex_event_country.csv", sep=',', encoding='utf-8')

In [None]:
# Keep only the events whose country code is 'CH' and report how many
# events were discarded by this filter.
len_total = len(df_event.index)
df_event = df_event.loc[df_event['country'] == 'CH']
len_swiss = len(df_event.index)
print("Total number of events : ", len_total)
print("Number of events in Switzerland : ", len_swiss)
print("Loss percentage : ", (1-(len_swiss/len_total))*100, "%")

In [None]:
len_event_final = len(df_event)
print("Number of event after filtering the event not in the scope (temporal and spatial) : ", len_event_final)

In [None]:
df_event.head()

We save our dataframe to a csv as all the detection is done and we want to use it for our visualization now.

In [None]:
df_event.to_csv("../twitter-swisscom/twex_event_final_events.csv", sep=',', encoding='utf-8', index=False)

## After visualization correction

We reload our csv and make some tweaks to make our detection more precise.

In [None]:
df_event = pd.read_csv("../twitter-swisscom/twex_event_final_events.csv", sep=',', encoding='utf-8')

In [None]:
df_event.head()

After visualizing our data over several years, we see that some events do not seem to be spatial but are rather trending hashtags. We can detect them by finding events that take place at the same time but in many different places. We don't want to show them on the map, as their position is not really representative.

In [None]:
# Default flag for every event. Use the boolean False rather than the string
# "False", which is truthy and would not compare equal to False.
df_event['trending'] = False

In [None]:
trending_event = df_event.groupby(['name','date'])

In [None]:
# Filled by getnames(): one entry per (name, date) group that has several rows.
namelist = []


def getnames(group):
    """Record the event name of a (name, date) group holding more than one row.

    `group` is one sub-frame handed over by `trending_event.apply`. Its `name`
    attribute is the group key; the first element of that key is appended to
    the module-level `namelist`. Nothing is returned.
    """
    group_key = group.name
    if len(group['longitude']) <= 1:
        return
    namelist.append(group_key[0])


trending_event.apply(getnames)

In [None]:
namelist

In [None]:
# Flag every event whose name was collected in `namelist`.
# NOTE(review): the match is on the name only, so events with the same name on
# other dates are flagged as trending too — confirm this is intended.
df_event['trending'] = df_event['name'].isin(namelist)

In [None]:
len(df_event[df_event['trending']==True])

We also found out that some events were duplicated because they took place on several consecutive days. We don't want to draw two different bubbles on our visualization for the same event, so we group them. We do this in our visualization.

In [None]:
len(df_event)

Our final event dataframe is saved and used in the viz.

In [None]:
df_event.to_csv("../twitter-swisscom/event_detected_final.csv", sep=',', encoding='utf-8', index=False)