In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import json
import gzip
import csv
import io
from polyglot.detect import Detector
import icu
from pprint import pprint

In [2]:
!ls

'Fortunato - 2009 - Community detection in graphs.pdf'
'Gerlach, Peixoto, Altmann - 2018 - A network approach to topic models.pdf'
 read_filter_data-Copy1.ipynb
 read_filter_data-Copy2.ipynb
 read_filter_data-Copy3.ipynb
 read_filter_data.ipynb
 read_filter_data_saved.ipynb
 testoutput.txt.gz
 Twitter_exercise_solution.ipynb
 twitter_exercises_solution.ipynb
 World_171016_103536_filtered.txt.gz
 World_171016_103536.txt.gz
 World_171016_111937_filtered.txt
 World_171016_111937_filtered.txt.gz
 World_171016_111937.txt.gz



#### Source for reading in the gzipped text file:
https://stackoverflow.com/questions/23091770/how-do-you-iterate-through-a-gzipped-carriage-return-file-using-python-2-7

https://www.makeuseof.com/tag/json-python-parsing-simple-guide/

#### Only reading the file and count its rows: 

* CPU times: user 1min 8s, sys: 656 ms, total: 1min 9s.
* Wall time: <font color='red'>1min 9s</font>, approx. <font color='green'>30 MB/s</font>.

        %%time
        i=0
        with io.TextIOWrapper( io.BufferedReader(gzip.open(filename)) ) as file:
            for line in file:
                i += 1
        file.close()
        print(i)


#### Reading in compressed zip and parse lines to json

* CPU times: user 4min 36s, sys: 886 ms, total: 4min 37s.
* Wall time: <font color='red'>4min 37s</font>, approx. <font color='green'>7.5 MB/s</font>.

        %%time
        with io.TextIOWrapper( io.BufferedReader(gzip.open(filename)) ) as file:
            for line in file:
                json_parsed = json.loads(line)
        file.close()

Number of rows: 3.764.667, file size (compressed): 2.0 GB (1.989.239.557 bytes).

#### Doing the filtering process and saving back to a compressed text file
* Input: World_171016_111937.txt.gz. output: World_171016_111937_filtered.txt.gz.

* CPU times: user 6min 36s, sys: 1.12 s, total: 6min 37s.
* Wall time: <font color='red'>6min 38s</font>,  approx. <font color='green'>5 MB/s</font>.

* Input: number of rows: 3.764.667, file size (compressed): 2.0 GB (1.989.239.557 bytes).
* Output: number of rows: 1.588.596, only text kept, file size (compressed): 66.5 MB (66.549.841 bytes).

#### Doing the filtering process for all the compressed text files
* Input, output: each separate file is read, filtered and then saved to compressed txt.
* According to the estimations 1 thread works with a speed of <font color='green'>5 MB/s</font>, so the <b><font color='red'>CPU time required is about 35-40 hours</font></b>.
* <b><font color='brown'>4 jupyter kernels are launched with a given range of filenames to work on to speed up the process.</font></b>
* <b><font color='purple'>The memory consumption of each kernel is about 100 MB</font></b>!
* After the process ended, the CPU time of 1 thread (the first 1-2 hours not shown, because threads had to be rearranged):

    * CPU times: user 8h 25min 50s, sys: 1min 47s, total: 8h 27min 37s.
    * Wall time: <font color='red'>8h 35min 59s</font>.
* Output after merging (wall time: <font color='red'>1h 44min 16s</font>): number of rows: <font color='red'>492.866.843</font>, file size (compressed): 20.7 GB (20.690.310.158 bytes).

#### Number of twitter messages to build the network with: <font color='blue'>492.866.843</font>.

#### Source for language detection:

https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language

##### Package installation:
https://polyglot.readthedocs.io/en/latest/Installation.html

#### Removing urls:
https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

In [3]:
# small file for testing
filename = 'World_171016_103536.txt.gz'

In [4]:
%%time
with io.TextIOWrapper( io.BufferedReader(gzip.open(filename)) ) as file:
    for line in file:
        json_parsed = json.loads(line)
file.close()

CPU times: user 56.9 ms, sys: 375 µs, total: 57.3 ms
Wall time: 56.5 ms


In [5]:
tweet = json_parsed['text']
type(tweet), tweet

(str,
 '다 좋은데, 사무실 들어가다 갑자기 만나 시위대 행진으로 버스가 안다님. 계속 대기중. #경복궁역 https://t.co/dfAjFATB47')

#### Filtering out url links from tweets

In [6]:
tweet = re.sub(r'http\S+', '', tweet)
tweet

'다 좋은데, 사무실 들어가다 갑자기 만나 시위대 행진으로 버스가 안다님. 계속 대기중. #경복궁역 '

In [7]:
for language in Detector(tweet).languages:
        print(language)

name: Korean      code: ko       confidence:  99.0 read bytes:  3676
name: un          code: un       confidence:   0.0 read bytes:     0
name: un          code: un       confidence:   0.0 read bytes:     0


In [8]:
str(Detector(tweet).languages[0].locale ) # best guess, if 'un' then unknown

'ko'

In [9]:
json_parsed['lang'] # this info comes from the tweet

'ko'

In [10]:
Detector(tweet).languages[0].confidence # confidence of best guess

99.0

#### Removing retweets
https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
"Users can amplify the broadcast of Tweets authored by other users by retweeting . Retweets can be distinguished from typical Tweets by the existence of a <font color='red'>retweeted_status</font> attribute. This attribute contains a representation of the original Tweet that was retweeted. Note that retweets of retweets do not show representations of the intermediary retweet, but only the original Tweet. (Users can also unretweet a retweet they created by deleting their retweet.)"

In [11]:
'retweeted_status' not in list(json_parsed.keys()) # indicator if tweet is a retweet

True

### Up-scale the filtering process to be able to work with large files

In [12]:
def language_detector(text):
    '''Returns language type as string or 'un' if unknown.'''
    try:
        polyglot = Detector(text).languages[0]
        if polyglot.confidence > 50: # threshold for detecting languages
            lang = str(polyglot.locale)
        else:
            lang = 'un'
    except Exception as e:
        print("Error: " + str(e))
        lang = 'un' # if error occurs during conversion language is unknown
    return lang

In [13]:
def English_language_filtering( json_parsed ):
    if (json_parsed['lang'] == 'en'): # tweet is in English so nothing need to be done here
        return True
    else:
        text = json_parsed['text'] # get text of tweet
        text = re.sub(r'http\S+', '', text) # get rid of http links
        
        lang = language_detector(text)
        
        if( lang == 'en' ):
            return True
        else:
            return False

In [14]:
%%time
#i=0
file_out = gzip.open('testoutput.txt.gz', mode='wb')

with io.TextIOWrapper( io.BufferedReader(gzip.open(filename)) ) as file_in:
    for line in file_in: # read one line at a time from a compressed file
        json_parsed = json.loads(line) # parse only that one line to json
        lang_bool = English_language_filtering(json_parsed) # True: English, False: not English
        retweet_bool = 'retweeted_status' not in list(json_parsed.keys()) # True: not retweet, False: retweet
        if lang_bool == True and retweet_bool: # if conditions are met write to file
            file_out.write( json.dumps( {'text': json_parsed['text']} ).encode() ) # write dict to file
            file_out.write( '\n'.encode() ) # write a newline char to the end
            #print(i, json_parsed['text'], '\n')
file_in.close()
file_out.close()

CPU times: user 55.8 ms, sys: 6.6 ms, total: 62.4 ms
Wall time: 63.5 ms


In [15]:
!ls

'Fortunato - 2009 - Community detection in graphs.pdf'
'Gerlach, Peixoto, Altmann - 2018 - A network approach to topic models.pdf'
 read_filter_data-Copy1.ipynb
 read_filter_data-Copy2.ipynb
 read_filter_data-Copy3.ipynb
 read_filter_data.ipynb
 read_filter_data_saved.ipynb
 testoutput.txt.gz
 Twitter_exercise_solution.ipynb
 twitter_exercises_solution.ipynb
 World_171016_103536_filtered.txt.gz
 World_171016_103536.txt.gz
 World_171016_111937_filtered.txt
 World_171016_111937_filtered.txt.gz
 World_171016_111937.txt.gz


#### Check if file is correctly written

In [16]:
with io.TextIOWrapper( io.BufferedReader(gzip.open('testoutput.txt.gz')) ) as file_in:
    for line in file_in: # read one line at a time from a compressed file
        print(json.loads(line)) # parse only that one line to json

{'text': 'Pray for Somali'}
{'text': 'Corrupted\nAlerta Rojo Tee\nTag: Fotl Heavy Cotton\nOnly 1 Small size left at jakarta store\nIDR… https://t.co/JAoznm9MzM'}
{'text': 'temperature down 69°F -&gt; 65°F\nhumidity down 88% -&gt; 82%\nwind 7mph -&gt; 10mph'}
{'text': "With all due respect Your Holiness; don't support #GMOs. #LeptisMagna in #Libya was the breadbasket for #Europe. Lo… https://t.co/zwTYvJu0As"}
{'text': "@Miss_Sashi @EthanVanSciver Sadly, no. It's to the tune of the chant for the NHL's Arizona Coyotes"}
{'text': 'Wah, lidah please👍 https://t.co/vBMDTYWlGw'}
{'text': "I've got a hole in my life that only a dog can fill :("}
{'text': 'Lamb heart-Organ donation from my meal @alchemistcph with girlloveschefs the other night. A very… https://t.co/eyaMzYiBJL'}
{'text': 'Quality Street®, @qualitystreetuk is now trending in United Kingdom\n\nhttps://t.co/iz28l4ll2U https://t.co/NkMA7iOK50'}
{'text': '@adamziek VERY distracting!'}
{'text': "@Frank_Monkey @Tam83610981 @JORISLUIJEND

### Up-scale I/O operations

In [17]:
def do_filtering( filename_in, filename_out ):
    #filename_out = filename_in[:-7]+'_filtered.txt.gz'
    print('Input:', filename_in, '\t', 'Output:', filename_out)
    file_out = gzip.open(filename_out, mode='wb')

    with io.TextIOWrapper( io.BufferedReader(gzip.open(filename_in)) ) as file_in:
        for line in file_in: # read one line at a time from a compressed file
            json_parsed = json.loads(line) # parse only that one line to json
            lang_bool = English_language_filtering(json_parsed) # True: English, False: not English
            retweet_bool = 'retweeted_status' not in list(json_parsed.keys()) # True: not retweet, False: retweet
            if lang_bool == True and retweet_bool: # if conditions are met write to file
                file_out.write( json.dumps( {'text': json_parsed['text']} ).encode() ) # write dict to file
                file_out.write( '\n'.encode() ) # write a newline char to the end
                #print(i, json_parsed['text'], '\n')
    file_in.close()
    file_out.close()

### Getting all file names in the directory

In [18]:
from os import listdir
from os.path import isfile, join

In [19]:
dir_path = '/media/abiricz/Elements/twitter/data/'
target_path = '/media/abiricz/Elements/twitter/filtered/'
dir_files = [f for f in listdir(dir_path)]

In [20]:
len(dir_files), dir_files[0:3]

(3595,
 ['World_171016_103536.txt.gz',
  'World_171016_104514.txt.gz',
  'World_171016_104518.txt.gz'])

In [29]:
dir_files[130], dir_files[899]

('World_180111_184722.txt.gz', 'World_180818_145807.txt.gz')

In [30]:
a = 30
b = 130

In [31]:
%%time
for i in dir_files[a:b]:
    print('Currently working on:', i)
    %time do_filtering( dir_path+i, target_path+i[:-7]+'_filtered.txt.gz' )

Currently working on: World_171107_135932.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171107_135932.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171107_135932_filtered.txt.gz
CPU times: user 3min 5s, sys: 576 ms, total: 3min 6s
Wall time: 3min 7s
Currently working on: World_171107_235214.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171107_235214.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171107_235214_filtered.txt.gz
CPU times: user 7min 5s, sys: 1.49 s, total: 7min 6s
Wall time: 7min 11s
Currently working on: World_171108_235214.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171108_235214.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171108_235214_filtered.txt.gz
CPU times: user 1.62 s, sys: 7.98 ms, total: 1.63 s
Wall time: 1.67 s
Currently working on: World_171108_235720.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171108_235720.txt.gz 	 Output: /media/abiricz/Elements/twitter/fi

CPU times: user 6min 30s, sys: 1.6 s, total: 6min 31s
Wall time: 6min 41s
Currently working on: World_171206_194509.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171206_194509.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171206_194509_filtered.txt.gz
CPU times: user 60.2 ms, sys: 1e+03 ns, total: 60.2 ms
Wall time: 105 ms
Currently working on: World_171217_180727.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171217_180727.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171217_180727_filtered.txt.gz
CPU times: user 6min 41s, sys: 1.61 s, total: 6min 43s
Wall time: 6min 51s
Currently working on: World_171228_234130.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171228_234130.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171228_234130_filtered.txt.gz
CPU times: user 6min 49s, sys: 1.58 s, total: 6min 51s
Wall time: 7min 2s
Currently working on: World_180108_212729.txt.gz
Input: /media/abiricz/Elements/twi

CPU times: user 6min 25s, sys: 1.26 s, total: 6min 26s
Wall time: 6min 33s
Currently working on: World_171211_035853.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171211_035853.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171211_035853_filtered.txt.gz
CPU times: user 4min 42s, sys: 820 ms, total: 4min 43s
Wall time: 4min 48s
Currently working on: World_171211_220154.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171211_220154.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171211_220154_filtered.txt.gz
CPU times: user 5min 16s, sys: 1.01 s, total: 5min 17s
Wall time: 5min 24s
Currently working on: World_171212_180727.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171212_180727.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_171212_180727_filtered.txt.gz
CPU times: user 6min 32s, sys: 1.25 s, total: 6min 33s
Wall time: 6min 43s
Currently working on: World_171213_180727.txt.gz
Input: /media/abiricz/Elements

CPU times: user 6min 21s, sys: 1.13 s, total: 6min 22s
Wall time: 6min 24s
Currently working on: World_180107_031543.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_180107_031543.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_180107_031543_filtered.txt.gz
CPU times: user 6min 20s, sys: 1.15 s, total: 6min 21s
Wall time: 6min 23s
Currently working on: World_180108_031543.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_180108_031543.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_180108_031543_filtered.txt.gz
CPU times: user 4min 19s, sys: 792 ms, total: 4min 19s
Wall time: 4min 21s
Currently working on: World_180108_200909.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_180108_200909.txt.gz 	 Output: /media/abiricz/Elements/twitter/filtered/World_180108_200909_filtered.txt.gz
CPU times: user 19.7 s, sys: 68 ms, total: 19.8 s
Wall time: 20 s
Currently working on: World_180108_211744.txt.gz
Input: /media/abiricz/Elements/twitter/