In [None]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords

In [None]:
# Names assigned positionally to the columns of the header-less tweet file.
col_event_split = [
    'id',
    'userId',
    'createdAt',
    'text',
    'longitude',
    'latitude',
    'placeId',
    'inReplyTo',
    'placeLatitude',
    'placeLongitude',
]

In [None]:
# Columns that pandas should parse as datetimes while reading the file.
parse_dates = ["createdAt"]

In [None]:
# Read the first 100 rows of the tab-separated tweet dump. The file has no
# header row, so the column names are supplied through `names`.
# NOTE(review): na_values='N' presumably catches `\N` null markers once the
# escape character has been stripped -- confirm against the raw file.
tweets = pd.read_csv(
    '../twitter-swisscom/twex_event_corrected.tsv',
    sep="\t",
    encoding='utf-8',
    escapechar='\\',
    names=col_event_split,
    parse_dates=parse_dates,
    na_values='N',
    header=None,
    nrows=100,
)

In [None]:
# Peek at the first rows to check how the columns were parsed.
tweets.head()

Two kinds of position information are available. The longitude/latitude columns give the position of the tweet as reported by the user or client application. The placeLongitude/placeLatitude columns indicate that the tweet is associated with (but did not necessarily originate from) a Place. As we can see in the head of the table, the place is not always set.

We decided to use the longitude/latitude columns to represent the position of a tweet; if they are null, we fall back to placeLatitude and placeLongitude. If both are null, we have to drop the entry, as a tweet without a position is not useful for event detection.

In [None]:
# Print the frame's dimensions, then display the dtype of each column.
print(tweets.shape)
tweets.dtypes

In [None]:
"""
Check if the longitude and latitude are set. 
If not, check for the place latitude and longitude and replace.
If not, drop the row
"""
def find_position(row):
    """Fill a tweet's missing coordinates from its associated place.

    The longitude is taken from ``placeLongitude`` when it is missing. The
    latitude is then taken from ``placeLatitude`` when it is missing, but only
    if the row has a longitude at that point. Rows that still lack a
    coordinate are returned with the gap left in place; this function does not
    drop anything itself.

    Parameters
    ----------
    row : pandas.Series
        One tweet, with the labels ``longitude``, ``latitude``,
        ``placeLongitude`` and ``placeLatitude``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with the coordinates filled where possible. The
        input is left untouched because ``DataFrame.apply`` does not support
        functions that mutate the object they are given.
    """
    filled = row.copy()
    # pd.isna also recognises None/NaT, where np.isnan would raise TypeError.
    if pd.isna(filled['longitude']) and not pd.isna(filled['placeLongitude']):
        filled['longitude'] = filled['placeLongitude']
    if (
        not pd.isna(filled['longitude'])
        and pd.isna(filled['latitude'])
        and not pd.isna(filled['placeLatitude'])
    ):
        filled['latitude'] = filled['placeLatitude']
    return filled

In [None]:
# Run the coordinate fallback on every row, then print the resulting shape.
tweets = tweets.apply(find_position, axis='columns')
print(tweets.shape)

In [None]:
# Display the first rows again to inspect the coordinates after the fallback.
tweets.head()

We can now drop the placeLongitude and placeLatitude columns, as they no longer give us any additional information.

In [None]:
# Remove the two place-coordinate columns from the frame.
tweets = tweets.drop(columns=['placeLatitude', 'placeLongitude'])

Then we drop the rows with NaN values in the longitude and latitude columns, as we need a position to detect an event.

In [None]:
# Keep only the tweets that have both coordinates and report how many were lost.
len_before = len(tweets.index)
tweets = tweets.dropna(subset=['longitude', 'latitude'])
len_after = len(tweets.index)
print("Number of tweets before dropping the one without position : ", len_before)
# This line reports the count *after* the drop; the label used to say "before".
print("Number of tweets after dropping the one without position : ", len_after)
# Guard against an empty frame so the percentage never divides by zero.
percentage_lost = ((len_before - len_after) / len_before) * 100 if len_before else 0.0
print("Percentage of tweets lost : ", percentage_lost)

## Event detection

We want to detect events from the tweets we have just extracted. To do so, we had to make some assumptions. We decided to work with the text to find events.
- Hashtags can be good estimators of which events were taking place at the time of the tweets.
- Events can span several days, but we decided to consider the tweets day by day.

As we decided to detect an event by its day of occurrence, we create a new column that holds the day of each tweet.

In [None]:
# Add a 'date' column holding each creation timestamp normalized to midnight.
tweets = tweets.assign(date=pd.DatetimeIndex(tweets['createdAt']).normalize())

TODO: Should we use the hour of the tweets or not? (If not, just delete the 'createdAt' column.)

We decide to remove the stopwords from the tweets' text to keep only words that can describe an event.

In [None]:
# Pool the NLTK stop words of the four languages with every character of
# string.punctuation into a single lookup set.
stop_words = {
    word
    for language in ('english', 'french', 'german', 'italian')
    for word in stopwords.words(language)
} | set(string.punctuation)

In [None]:
# Drop stop words from every tweet and store the result back in 'text'.
# The text is split on whitespace (iterating over the raw string would walk it
# character by character), and each lower-cased token is looked up in the
# `stop_words` set rather than in the NLTK `stopwords` corpus reader.
# Non-string entries (e.g. NaN) are passed through unchanged.
tweets['text'] = tweets['text'].apply(
    lambda text: ' '.join(
        word for word in text.split() if word.lower() not in stop_words
    ) if isinstance(text, str) else text
)

Now that the words should be clean, we can look for occurrences of the same word on the same day and find events from there.
We will create a list of the words remaining in the 'text' field, and we also drop the rows where the text is empty.

In [None]:
# Index the tweets by day and order them chronologically. The day column is
# named 'date' (lower case), and set_index/sort_index return new frames, so
# the result has to be assigned back to take effect.
tweets = tweets.set_index('date').sort_index()
tweets.head()

In [None]:
# Discard the tweets whose text is missing.
tweets = tweets.dropna(subset=['text'])

In [None]:
def text_to_list(text):
    """Split a tweet's text into a list of words.

    Parameters
    ----------
    text : str
        The tweet text.

    Returns
    -------
    list of str
        The whitespace-separated tokens of ``text``; an empty list when
        ``text`` is not a string (e.g. NaN) or contains no tokens.
    """
    # NOTE(review): the original body was empty; whitespace tokenisation is the
    # minimal reading of "create a list with the words" -- confirm the rules.
    if not isinstance(text, str):
        return []
    return text.split()

In [None]:
# Run text_to_list on the text of every tweet; the result is only displayed
# as the cell output, it is not stored.
tweets.apply(lambda tweet: text_to_list(tweet['text']), axis='columns')