# Preprocess the data to get a clean dataset to work on

We need two different datasets, one for each of the two tasks we want to achieve:

1 - [Mobility Pattern](#mobility)

2 - [Event Detection](#event)

In [1]:
import csv 
import os
import shutil

In [None]:
def RepresentsFloat(s):
    """Tell whether ``s`` can be converted by ``float()``.

    Returns True when ``float(s)`` succeeds and False when it raises
    ``ValueError``. Any other exception propagates to the caller.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

### 1 - <a id='mobility'>Mobility Pattern</a>

Here are the columns we want to keep for mobility pattern detection:

    id     
    userId
    createdAt
    longitude
    latitude
    placeId
    placeLatitude
    placeLongitude
    sourceName 
    userLocation

We verify that longitude, latitude, placeLatitude and placeLongitude are each either a float or '\N', because the text field sometimes contains a '\t' that shifts the values of the following columns.

We go through the chunks of the original data and keep only columns we want

In [None]:
# Directory holding the chunks of the original data (read by the next cell).
path_before = 'twitter-swisscom/twex_split/'
# Directory receiving the filtered chunks for the mobility task (written by the next cell).
path_after = 'twitter-swisscom/twex_split_mobility_processed/'

In [None]:
# Filter every chunk in `path_before` and write the selected columns of the
# valid rows to a file of the same name in `path_after`.

# Positions of the coordinate fields that must hold a float or the '\N' marker.
coordinate_columns = (4, 5, 10, 11)
# Positions of the fields copied to the output, in output order.
kept_columns = (0, 1, 2, 4, 5, 6, 10, 11, 12, 19)

# The output directory is deleted by a later clean-up cell, so recreate it
# here to keep this cell re-runnable.
os.makedirs(path_after, exist_ok=True)

for filename in os.listdir(path_before):
    # Both handles are managed by `with`, so every chunk is flushed and closed
    # as soon as it has been processed (not only the last one).
    with open(os.path.join(path_before, filename), 'r', encoding='utf8') as tsv:
        with open(os.path.join(path_after, filename), 'w', encoding='utf8') as file_out:
            # Stream the chunk line by line instead of loading it in memory.
            for line in tsv:
                row = line.strip().split('\t')
                # Rows without exactly 20 fields are dropped.
                if len(row) != 20:
                    continue
                # A stray tab in the text field shifts the following values, so
                # the coordinate positions no longer hold a float or '\N'.
                if all(RepresentsFloat(row[i]) or row[i] == '\\N'
                       for i in coordinate_columns):
                    # Every field, including the last one, is followed by a
                    # tab; a later cell strips that trailing tab.
                    file_out.write("".join(row[i] + "\t" for i in kept_columns) + "\n")
print("DONE")

We create the wanted tsv file !

In [None]:
# Concatenate every processed chunk of `path_after` into a single TSV file.
merged_path = 'twitter-swisscom/twex_mobility.tsv'
with open(merged_path, 'w', encoding='utf8') as outfile:
    for chunk_name in os.listdir(path_after):
        chunk_path = str(path_after) + str(chunk_name)
        with open(chunk_path, 'r', encoding='utf8') as infile:
            # Copy the chunk unchanged, one line at a time.
            outfile.writelines(infile)
print("DONE")

In [None]:
# Delete the directory of per-chunk files; their content was merged into twex_mobility.tsv above.
shutil.rmtree('twitter-swisscom/twex_split_mobility_processed')

In [None]:
# Remove the trailing tab that the column-selection cell leaves at the end of
# every line, and write the result to the final mobility file.
corrected_path = 'twitter-swisscom/mobility/twex_mobility_corrected.tsv'
# open(..., "w") does not create missing directories, so make sure it exists.
os.makedirs(os.path.dirname(corrected_path), exist_ok=True)
with open('twitter-swisscom/twex_mobility.tsv', 'r', encoding='utf8') as infile:
    with open(corrected_path, "w", encoding='utf8') as outfile:
        for row in infile:
            content = row.rstrip("\n")
            # Drop exactly one trailing tab, and only if it is really there, so
            # a line with an unexpected ending never loses a data character.
            if content.endswith("\t"):
                content = content[:-1]
            outfile.write(content + "\n")
print("DONE")

In [None]:
# Delete the intermediate merged file; the corrected copy was written by the previous cell.
os.remove('twitter-swisscom/twex_mobility.tsv')

### 2 - <a id='event'>Event Detection</a>

Here are the columns we want to keep for event detection:

    id     
    userId
    createdAt
    text     
    longitude
    latitude
    placeId
    inReplyTo 
    placeLatitude
    placeLongitude

We verify that longitude, latitude, placeLatitude and placeLongitude are each either a float or '\N', because the text field sometimes contains a '\t' that shifts the values of the following columns.

We go through the chunks of the original data and keep only columns we want

In [None]:
# Directory holding the chunks of the original data (read by the next cell).
path_before = 'twitter-swisscom/twex_split/'
# Directory receiving the filtered chunks for the event task (written by the next cell).
path_after = 'twitter-swisscom/twex_split_event_processed/'

In [None]:
# Filter every chunk in `path_before` and write the selected columns of the
# valid rows to a file of the same name in `path_after`.

# Positions of the coordinate fields that must hold a float or the '\N' marker.
coordinate_columns = (4, 5, 10, 11)
# Positions of the fields copied to the output, in output order.
kept_columns = (0, 1, 2, 3, 4, 5, 6, 7, 10, 11)

# The output directory is deleted by a later clean-up cell, so recreate it
# here to keep this cell re-runnable.
os.makedirs(path_after, exist_ok=True)

for filename in os.listdir(path_before):
    # Both handles are managed by `with`, so every chunk is flushed and closed
    # as soon as it has been processed (not only the last one).
    with open(os.path.join(path_before, filename), 'r', encoding='utf8') as tsv:
        with open(os.path.join(path_after, filename), 'w', encoding='utf8') as file_out:
            # Stream the chunk line by line instead of loading it in memory.
            for line in tsv:
                row = line.strip().split('\t')
                # Rows without exactly 20 fields are dropped.
                if len(row) != 20:
                    continue
                # A stray tab in the text field shifts the following values, so
                # the coordinate positions no longer hold a float or '\N'.
                if all(RepresentsFloat(row[i]) or row[i] == '\\N'
                       for i in coordinate_columns):
                    # Every field, including the last one, is followed by a
                    # tab; a later cell strips that trailing tab.
                    file_out.write("".join(row[i] + "\t" for i in kept_columns) + "\n")
print("DONE")

We create the wanted tsv file !

In [None]:
# Concatenate every processed chunk of `path_after` into a single TSV file.
merged_path = 'twitter-swisscom/twex_event.tsv'
with open(merged_path, 'w', encoding='utf8') as outfile:
    for chunk_name in os.listdir(path_after):
        chunk_path = str(path_after) + str(chunk_name)
        with open(chunk_path, 'r', encoding='utf8') as infile:
            # Copy the chunk unchanged, one line at a time.
            outfile.writelines(infile)
print("DONE")

In [None]:
# Delete the directory of per-chunk files; their content was merged into twex_event.tsv above.
shutil.rmtree('twitter-swisscom/twex_split_event_processed')

In [None]:
# Remove the trailing tab that the column-selection cell leaves at the end of
# every line, and write the result to the final event file.
with open('twitter-swisscom/twex_event.tsv', 'r', encoding='utf8') as infile:
    with open('twitter-swisscom/twex_event_corrected.tsv', "w", encoding='utf8') as outfile:
        for row in infile:
            content = row.rstrip("\n")
            # Drop exactly one trailing tab, and only if it is really there, so
            # a line with an unexpected ending never loses a data character.
            if content.endswith("\t"):
                content = content[:-1]
            outfile.write(content + "\n")
print("DONE")

In [None]:
# Delete the intermediate merged file; the corrected copy was written by the previous cell.
os.remove('twitter-swisscom/twex_event.tsv')