In [1]:
import pandas as pd
import numpy as np
import string
import nltk
import re
from nltk.corpus import stopwords

In [2]:
# Column names given to the tweet table when it is loaded.
col_event_split = [
    'id',
    'userId',
    'createdAt',
    'text',
    'longitude',
    'latitude',
    'placeId',
    'inReplyTo',
    'placeLatitude',
    'placeLongitude',
]

In [3]:
# Columns that should be parsed as datetimes when the file is read.
parse_dates = ["createdAt"]

In [4]:
# Read the first 100 rows of the tab-separated tweet file. The file is read
# without a header row, so the column names come from col_event_split.
tweets = pd.read_csv(
    '../twitter-swisscom/twex_event_corrected.tsv',
    sep="\t",
    encoding='utf-8',
    escapechar='\\',
    names=col_event_split,
    parse_dates=parse_dates,
    na_values='N',
    header=None,
    nrows=100,
)

In [5]:
# Preview the first five rows of the raw table.
tweets.head(n=5)

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,placeLatitude,placeLongitude
0,9514097914,17341045,2010-02-23 05:55:51,Guuuuten Morgen! :-),7.43926,46.9489,,,,
1,9514846412,7198282,2010-02-23 06:22:40,Still the best coffee in town — at La Stanza h...,8.53781,47.3678,,,,
2,9516574359,14657884,2010-02-23 07:34:25,It has been a week or so.. and today I just co...,6.13396,46.1951,,,,
3,9516952605,14703863,2010-02-23 07:51:47,Getting ready.. http://twitpic.com/14v8gz,8.81749,47.2288,,,,
4,9517198943,14393717,2010-02-23 08:02:57,Un peu de réconfort liquide en take away après...,6.63254,46.5199,,,,


Several pieces of position information are given. The longitude/latitude columns are the position of the tweet as reported by the user or client application. The placeLongitude/placeLatitude columns indicate that the tweet is associated with (but not necessarily originating from) a Place. As we can see in the head of the table, the place is not always set.

We decided to use the longitude/latitude columns to represent the position of a tweet and, if they are null, to fall back to placeLatitude and placeLongitude. If both are null, we have to drop the entry, as a tweet without a position is not useful for event detection.

In [7]:
# Print (rows, columns), then display the dtype of every column.
print((len(tweets.index), len(tweets.columns)))
tweets.dtypes

(100, 10)


id                         int64
userId                     int64
createdAt         datetime64[ns]
text                      object
longitude                float64
latitude                 float64
placeId                  float64
inReplyTo                float64
placeLatitude            float64
placeLongitude           float64
dtype: object

In [8]:
"""
Check if the longitude and latitude are set. 
If not, check for the place latitude and longitude and replace.
If not, drop the row
"""
def find_position(row):
    """Fill a tweet's missing position from its place coordinates.

    A position is only meaningful as a (longitude, latitude) pair. When either
    tweet coordinate is missing and both place coordinates are available, the
    whole pair is taken from the place, so the result never combines a
    coordinate reported for the tweet with one reported for the place.

    Parameters
    ----------
    row : pandas.Series
        A row with 'longitude', 'latitude', 'placeLongitude' and
        'placeLatitude' entries.

    Returns
    -------
    pandas.Series
        The row itself when nothing needs replacing, otherwise a copy with
        'longitude' and 'latitude' taken from the place columns. If no complete
        position can be built, the missing coordinates stay NaN.
    """
    tweet_incomplete = pd.isna(row['longitude']) or pd.isna(row['latitude'])
    place_complete = not (pd.isna(row['placeLongitude']) or pd.isna(row['placeLatitude']))
    if tweet_incomplete and place_complete:
        # Work on a copy: mutating the object handed over by DataFrame.apply
        # is not supported by pandas.
        row = row.copy()
        row['longitude'] = row['placeLongitude']
        row['latitude'] = row['placeLatitude']
    return row

In [9]:
# Run find_position on every row, then print the resulting (rows, columns).
tweets = tweets.apply(find_position, axis='columns')
print((len(tweets.index), len(tweets.columns)))

(100, 10)


In [10]:
# Preview the first five rows after the position columns were filled.
tweets.head(n=5)

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,placeLatitude,placeLongitude
0,9514097914,17341045,2010-02-23 05:55:51,Guuuuten Morgen! :-),7.43926,46.9489,,,,
1,9514846412,7198282,2010-02-23 06:22:40,Still the best coffee in town — at La Stanza h...,8.53781,47.3678,,,,
2,9516574359,14657884,2010-02-23 07:34:25,It has been a week or so.. and today I just co...,6.13396,46.1951,,,,
3,9516952605,14703863,2010-02-23 07:51:47,Getting ready.. http://twitpic.com/14v8gz,8.81749,47.2288,,,,
4,9517198943,14393717,2010-02-23 08:02:57,Un peu de réconfort liquide en take away après...,6.63254,46.5199,,,,


We can now drop the placeLongitude and placeLatitude columns, as they no longer give us any additional information.

In [11]:
# The place coordinates are no longer needed once they have been used as a fallback.
tweets = tweets.drop(columns=['placeLatitude', 'placeLongitude'])

Then we drop the rows with NaN values in the longitude and latitude columns, as we need a position to detect an event.

In [12]:
# Drop tweets that have no usable position and report how many were lost.
len_before = len(tweets.index)
tweets = tweets.dropna(subset=['longitude', 'latitude'])
len_after = len(tweets.index)
# Guard the ratio so that an empty table does not raise ZeroDivisionError.
pct_lost = ((len_before - len_after) / len_before) * 100 if len_before else 0.0
print("Number of tweets before dropping the one without position : ", len_before)
# This line reports the count *after* the drop; it used to be labelled "before".
print("Number of tweets after dropping the one without position : ", len_after)
print("Percentage of tweets lost : ", pct_lost)

Number of tweets before dropping the one without position :  100
Number of tweets before dropping the one without position :  100
Percentage of tweets lost :  0.0


## Event detection

We want to detect events from the tweets we have just extracted. To do so, we had to make some assumptions. We decided to work with the text to find events. 
- Hashtags can be good indicators of which events were taking place at the time of the tweets.
- Events can span several days, but we decided to consider the tweets day by day.

As we decided to detect an event by its day of occurrence, we create a new column that holds the day of each tweet.

In [13]:
# Add a 'date' column holding each creation timestamp truncated to midnight,
# so that tweets can be grouped by calendar day.
tweets = tweets.assign(date=pd.DatetimeIndex(tweets['createdAt']).normalize())

TODO : Should we use the hour of the tweets or not ? (if not just delete the column 'createdAt')

We decide to remove the stopwords from the tweets' text to keep only words that can describe an event.

In [14]:
# Stop-word vocabulary: the NLTK stop-word lists for English, French, German
# and Italian, plus every character of string.punctuation as a one-character
# entry.
stop_words = set(string.punctuation).union(
    *(stopwords.words(language) for language in ('english', 'french', 'german', 'italian'))
)

We remove the stopwords but we also remove the URLs and the @ mentions as they are not useful to detect events

In [15]:
def text_process(row, stopword_set=None):
    """Replace a tweet's raw text with a list of cleaned tokens.

    URLs (any run of non-space characters starting with ``http``) and ``@``
    mentions (both ``@name`` and ``@ name``) are removed. The remainder is
    split on whitespace, and tokens whose lower-cased form is in the stop-word
    collection are discarded. Kept tokens retain their original casing.

    Parameters
    ----------
    row : pandas.Series
        A tweet row with a 'text' entry.
    stopword_set : collection of str, optional
        Tokens to discard (compared against the lower-cased token). Defaults
        to the notebook-level ``stop_words``.

    Returns
    -------
    pandas.Series
        A copy of ``row`` whose 'text' entry is the token list. A row whose
        text is not a string (e.g. NaN) is returned unchanged, so that it can
        still be removed by a later ``dropna`` on 'text'.
    """
    text = row['text']
    if not isinstance(text, str):
        # re.sub would raise TypeError on NaN/None; leave the row as is.
        return row
    if stopword_set is None:
        stopword_set = stop_words

    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@ \S+", "", text)
    text = re.sub(r"@\S+", "", text)
    tokens = [word for word in text.split() if word.lower() not in stopword_set]

    # Work on a copy: mutating the object handed over by DataFrame.apply is
    # not supported by pandas.
    row = row.copy()
    row['text'] = tokens
    return row

In [16]:
# Tokenise and clean the text of every tweet, then preview the first five rows.
tweets = tweets.apply(text_process, axis='columns')
tweets.head(n=5)

Unnamed: 0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo,date
0,9514097914,17341045,2010-02-23 05:55:51,"[Guuuuten, Morgen!, :-)]",7.43926,46.9489,,,2010-02-23
1,9514846412,7198282,2010-02-23 06:22:40,"[Still, best, coffee, town, —, Stanza]",8.53781,47.3678,,,2010-02-23
2,9516574359,14657884,2010-02-23 07:34:25,"[week, so.., today, couldn't, focus, Sportif, ...",6.13396,46.1951,,,2010-02-23
3,9516952605,14703863,2010-02-23 07:51:47,"[Getting, ready..]",8.81749,47.2288,,,2010-02-23
4,9517198943,14393717,2010-02-23 08:02:57,"[peu, réconfort, liquide, take, away, après, d...",6.63254,46.5199,,,2010-02-23


Now that the words should be clean, we can find occurrences of the same word on the same day and detect events from there.
We will build lists from the words still in the 'text' field, and we also drop the rows where the text is empty.

In [17]:
# Index the tweets by their day, then preview the first five rows.
tweets = tweets.set_index(keys='date')
tweets.head(n=5)

Unnamed: 0_level_0,id,userId,createdAt,text,longitude,latitude,placeId,inReplyTo
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-02-23,9514097914,17341045,2010-02-23 05:55:51,"[Guuuuten, Morgen!, :-)]",7.43926,46.9489,,
2010-02-23,9514846412,7198282,2010-02-23 06:22:40,"[Still, best, coffee, town, —, Stanza]",8.53781,47.3678,,
2010-02-23,9516574359,14657884,2010-02-23 07:34:25,"[week, so.., today, couldn't, focus, Sportif, ...",6.13396,46.1951,,
2010-02-23,9516952605,14703863,2010-02-23 07:51:47,"[Getting, ready..]",8.81749,47.2288,,
2010-02-23,9517198943,14393717,2010-02-23 08:02:57,"[peu, réconfort, liquide, take, away, après, d...",6.63254,46.5199,,


In [18]:
# Keep only the tweets that still have at least one token. An empty list is
# not NaN, so dropna alone would never remove tweets whose text was entirely
# filtered out; missing (non-list) text is discarded as well.
has_tokens = tweets['text'].map(lambda tokens: isinstance(tokens, list) and len(tokens) > 0)
tweets = tweets.loc[has_tokens.astype(bool)]

Now that the text is pretty much clean we want to get the hashtags from the tweets because they are really helpful to detect the events. So we create a column with the hashtags :

In [24]:
# Collect every hashtag token across all tweets. Each entry of 'text' is a
# list of tokens, so the tokens themselves have to be iterated; non-list
# entries are skipped.
test = [
    word
    for words in tweets['text']
    if isinstance(words, list)
    for word in words
    if word.startswith('#')
]
test

[]