In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
import re
import json
import gzip
import csv
import io
from polyglot.detect import Detector
import icu
from pprint import pprint
import datetime
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from os import listdir
from os.path import isfile, join

### Choose thread:

In [2]:
thread = 0

### Notes on running the notebook


#### Source for reading in the gzipped text file:
https://stackoverflow.com/questions/23091770/how-do-you-iterate-through-a-gzipped-carriage-return-file-using-python-2-7

https://www.makeuseof.com/tag/json-python-parsing-simple-guide/

### Actual:

#### Time needed for 2.0 GB file: 20 min -> around 100 CPU hours for all!

#### --

### Previous:

#### Only reading the file and count its rows: 

* CPU times: user 1min 8s, sys: 656 ms, total: 1min 9s.
* Wall time: <font color='red'>1min 9s</font>, approx. <font color='green'>30 MB/s</font>.

        %%time
        i=0
        with io.TextIOWrapper( io.BufferedReader(gzip.open(filename)) ) as file:
            for line in file:
                i += 1
        file.close()
        print(i)


#### Reading in compressed zip and parse lines to json

* CPU times: user 4min 36s, sys: 886 ms, total: 4min 37s.
* Wall time: <font color='red'>4min 37s</font>, approx. <font color='green'>7.5 MB/s</font>.

        %%time
        with io.TextIOWrapper( io.BufferedReader(gzip.open(filename)) ) as file:
            for line in file:
                json_parsed = json.loads(line)
        file.close()

Number of rows: 3.764.667, file size (compressed): 2.0 GB (1.989.239.557 bytes).

#### Doing the filtering process and saving back to a compressed text file
* Input: World_171016_111937.txt.gz. output: World_171016_111937_filtered.txt.gz.

* CPU times: user 6min 36s, sys: 1.12 s, total: 6min 37s.
* Wall time: <font color='red'>6min 38s</font>,  approx. <font color='green'>5 MB/s</font>.

* Input: number of rows: 3.764.667, file size (compressed): 2.0 GB (1.989.239.557 bytes).
* Output: number of rows: 1.588.596, only text kept, file size (compressed): 66.5 MB (66.549.841 bytes).

#### Doing the filtering process for all the compressed text files
* Input, output: each separate file is read, filtered and then saved to compressed txt.
* According to the estimations 1 thread works with a speed of <font color='green'>5 MB/s</font>, so the <b><font color='red'>CPU time required is about 35-40 hours</font></b>.
* <b><font color='brown'>4 jupyter kernels are launched with a given range of filenames to work on to speed up the process.</font></b>
* <b><font color='purple'>The memory consumption of each kernel is about 100 MB</font></b>!
* After the process ended, the CPU time of 1 thread (the first 1-2 hours not shown, because threads had to be rearranged):

    * CPU times: user 8h 25min 50s, sys: 1min 47s, total: 8h 27min 37s.
    * Wall time: <font color='red'>8h 35min 59s</font>.
* Output after merging (wall time: <font color='red'>1h 44min 16s</font>): number of rows: <font color='red'>492.866.843</font>, file size (compressed): 20.7 GB (20.690.310.158 bytes).

#### Number of twitter messages to build the network with: <font color='blue'>492.866.843</font>.

#### --

#### Source for language detection:

https://stackoverflow.com/questions/39142778/python-how-to-determine-the-language

##### Package installation:
https://polyglot.readthedocs.io/en/latest/Installation.html

#### Removing urls:
https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python

### Data folder

In [3]:
! ls -s /media/abiricz/Elements/twitter/data/ | tail

    484 World_181119_100900.txt.gz
    268 World_181119_100944.txt.gz
    332 World_181119_101008.txt.gz
    184 World_181119_101040.txt.gz
    340 World_181119_101103.txt.gz
    580 World_181119_101127.txt.gz
    584 World_181119_101224.txt.gz
   1752 World_181119_101308.txt.gz
   1604 World_181119_101536.txt.gz
    348 World_181119_101755.txt.gz


In [4]:
# small file for testing
filename = '/media/abiricz/Elements/twitter/data/World_171016_103536.txt.gz'
filename

'/media/abiricz/Elements/twitter/data/World_171016_103536.txt.gz'

### Test reading

In [5]:
%%time
with io.TextIOWrapper( io.BufferedReader(gzip.open(filename)) ) as file:
    for line in file:
        json_parsed = json.loads(line)
file.close()

CPU times: user 37.9 ms, sys: 1.16 ms, total: 39.1 ms
Wall time: 39.2 ms


### Explore the data

In [6]:
json_parsed.keys()

dict_keys(['created_at', 'id', 'id_str', 'text', 'display_text_range', 'source', 'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'quote_count', 'reply_count', 'retweet_count', 'favorite_count', 'entities', 'extended_entities', 'favorited', 'retweeted', 'possibly_sensitive', 'filter_level', 'lang', 'timestamp_ms'])

In [7]:
print(json_parsed['place']['country'])

Republic of Korea


In [8]:
json_parsed['place']['bounding_box']['coordinates']

[[[126.954584, 37.560865],
  [126.954584, 37.632942],
  [127.020042, 37.632942],
  [127.020042, 37.560865]]]

In [9]:
tweet = json_parsed['text']
type(tweet), tweet

(str,
 '다 좋은데, 사무실 들어가다 갑자기 만나 시위대 행진으로 버스가 안다님. 계속 대기중. #경복궁역 https://t.co/dfAjFATB47')

#### Filtering out url links from tweets

In [10]:
tweet = re.sub(r'http\S+', '', tweet)
tweet

'다 좋은데, 사무실 들어가다 갑자기 만나 시위대 행진으로 버스가 안다님. 계속 대기중. #경복궁역 '

#### Checking language detector

In [11]:
for language in Detector(tweet).languages:
        print(language)

name: Korean      code: ko       confidence:  99.0 read bytes:  3676
name: un          code: un       confidence:   0.0 read bytes:     0
name: un          code: un       confidence:   0.0 read bytes:     0


In [12]:
str(Detector(tweet).languages[0].locale ) # best guess, if 'un' then unknown

'ko'

In [13]:
json_parsed['lang'] # this info comes from the tweet

'ko'

In [14]:
Detector(tweet).languages[0].confidence # confidence of best guess

99.0

#### Removing retweets this way
https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/tweet-object
"Users can amplify the broadcast of Tweets authored by other users by retweeting . Retweets can be distinguished from typical Tweets by the existence of a <font color='red'>retweeted_status</font> attribute. This attribute contains a representation of the original Tweet that was retweeted. Note that retweets of retweets do not show representations of the intermediary retweet, but only the original Tweet. (Users can also unretweet a retweet they created by deleting their retweet.)"

#### Turned out, keeping them, but filtering duplicates yields better results!

In [15]:
'retweeted_status' in list(json_parsed.keys()) # indicator if tweet is a retweet

False

### Scale the filtering process to be able to work with large files -> functions

#### Language detection

In [16]:
def language_detector(text):
    '''Returns language type as string or 'un' if unknown.'''
    try:
        polyglot = Detector(text, quiet=True).languages[0]
        if polyglot.confidence > 90: # threshold for detecting languages
            lang = str(polyglot.locale)
        else:
            lang = 'un'
    except Exception as e:
        print("Error: " + str(e))
        lang = 'un' # if error occurs during conversion language is unknown
    # print('Guess: ', lang)
    return lang

In [17]:
def English_language_filtering( text ):
    lang = language_detector(text)
    if( lang == 'en' ):
        return True
    else:
        return False

#### Natural language processing

https://stackoverflow.com/questions/24647400/what-is-the-best-stemming-method-in-python

In [18]:
# Special tokenizer for tweets
tweet_tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

# Stemmer
snow = SnowballStemmer( 'english', ignore_stopwords=True )
lemma = nltk.wordnet.WordNetLemmatizer()

# Stop words
stop_words = set(stopwords.words('english')) 

In [19]:
keywords_regex = ['nfl', 'hungary', 'climate', 'election', 'physics', 'concert', 
            'market', 'data\s?mining', 'cinema', 'terror']

In [20]:
#text_tokenized input: one tweet, output: list of its normal words
def text_tokenizer( text ):
    tokenized_text = []
    keyword_match_marker = []    # will contain the exact indices of the matching keywords in the tweet
    
    tokenized = tweet_tokenizer.tokenize(text)
    for item in tokenized:
        match = re.match('#?[A-Za-z]+', item)
        if(match):
            result = match.group(0) # gives back the matching string
            # searching for keywords
            i = 0    # will mark the index of the matching keyword in the keywords_regex list
            for word in keywords_regex:
                word_match = re.match(word, result)
                if (word_match):
                    keyword_match_marker.append(i)
                i += 1
            
            url_filter = re.match('https?', result)
            if( url_filter == None ):
                real_word = re.findall('\w+', result)[0]
                tokenized_text.append(real_word)
    return keyword_match_marker, [ x for x in tokenized_text if ( len(x) > 1 ) ] # REMOVE words that are only 1 letter long

In [21]:
def text_stemmer( tokenized_text ):
    return [ snow.stem(word) for word in tokenized_text ]

In [22]:
def text_no_stopwords( tokenized_text ):
    return [ w for w in tokenized_text if not w in stop_words ] 

#### Summary: @, #, numbers, and non-alphabetic characters are filtered, then the text is tokenized

### Main function

In [23]:
def do_tweet_processing( filename_in, out_files ):
    print('Input:', filename_in)

    with io.TextIOWrapper( io.BufferedReader(gzip.open(filename_in)) ) as file_in:
        for line in file_in: # read one line at a time from a compressed file
            json_parsed = json.loads(line) # parse only that one line to json
            tweet = json_parsed['text'] # get tweet message
            
            keyword_match_marker, tweet_tokenized = text_tokenizer(tweet) # tokenize tweet message
            tweet_stemmed = text_stemmer( tweet_tokenized ) # stem tokenized words
            # tweet_filtered = text_no_stopwords( tweet_stemmed ) # remove stop words
            
            lang_bool = English_language_filtering( ' '.join(tweet_tokenized) ) # True: English, False: not English
            
            if ( lang_bool == True and json_parsed['lang'] == 'en' and 
                len(keyword_match_marker) != 0 ): # if conditions are met write to file
                
                for index in keyword_match_marker:
                    out_files[index].write( ' '.join(tweet_stemmed).encode() ) # write to file
                    out_files[index].write( '\n'.encode() ) # write a newline char to the end
            
    file_in.close()

### Scale I/O operations

#### Getting all file names in the directory

In [24]:
dir_path = '/media/abiricz/Elements/twitter/data/'
target_path = '/media/abiricz/Elements/twitter/keyfilter'+str(thread)+'/'
dir_files = [f for f in listdir(dir_path)]

### Generating output files for each keyword

keywords_regex = ['nfl', 'hungary', 'climate', 'election', 'physics', 'concert', 
            'market', 'data\s?mining', 'cinema', 'terror']

In [25]:
file_nfl = gzip.open(target_path+'nfl.txt.gz', mode='wb')
file_hungary = gzip.open(target_path+'hungary.txt.gz', mode='wb')
file_climate = gzip.open(target_path+'climate.txt.gz', mode='wb')
file_election = gzip.open(target_path+'election.txt.gz', mode='wb')
file_physics = gzip.open(target_path+'physics.txt.gz', mode='wb')
file_concert = gzip.open(target_path+'concert.txt.gz', mode='wb')
file_market = gzip.open(target_path+'market.txt.gz', mode='wb')
file_data_mining = gzip.open(target_path+'data_mining.txt.gz', mode='wb')
file_cinema = gzip.open(target_path+'cinema.txt.gz', mode='wb')
file_terror = gzip.open(target_path+'terror.txt.gz', mode='wb')

out_files = [file_nfl, file_hungary, file_climate, file_election, file_physics, file_concert,
             file_market, file_data_mining, file_cinema, file_terror]

In [26]:
len(dir_files), dir_files[0:3]

(3595,
 ['World_171016_103536.txt.gz',
  'World_171016_104514.txt.gz',
  'World_171016_104518.txt.gz'])

In [27]:
dir_files[58]

'World_171116_003527.txt.gz'

In [28]:
a = 0
b = 58

In [29]:
%%time
for i in dir_files[a:b]:
    print('Currently working on:', i)
    %time do_tweet_processing( dir_path+i, out_files )

Currently working on: World_171016_103536.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171016_103536.txt.gz
CPU times: user 146 ms, sys: 0 ns, total: 146 ms
Wall time: 145 ms
Currently working on: World_171016_104514.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171016_104514.txt.gz
CPU times: user 30.1 ms, sys: 1.55 ms, total: 31.7 ms
Wall time: 31.9 ms
Currently working on: World_171016_104518.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171016_104518.txt.gz
CPU times: user 28.7 ms, sys: 0 ns, total: 28.7 ms
Wall time: 28.5 ms
Currently working on: World_171016_111937.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171016_111937.txt.gz
CPU times: user 32min 45s, sys: 4.29 s, total: 32min 49s
Wall time: 33min 31s
Currently working on: World_171017_111937.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171017_111937.txt.gz
CPU times: user 12min 31s, sys: 1.17 s, total: 12min 33s
Wall time: 12min 43s
Currently working on: World_171017

CPU times: user 29min 18s, sys: 2.81 s, total: 29min 21s
Wall time: 29min 36s
Currently working on: World_171117_205234.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171117_205234.txt.gz
CPU times: user 8.62 s, sys: 12 ms, total: 8.63 s
Wall time: 8.8 s
Currently working on: World_171117_205801.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171117_205801.txt.gz
CPU times: user 43.3 s, sys: 95.8 ms, total: 43.4 s
Wall time: 44 s
Currently working on: World_171117_212515.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171117_212515.txt.gz
CPU times: user 33min 52s, sys: 4.13 s, total: 33min 56s
Wall time: 34min 17s
Currently working on: World_171118_212515.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171118_212515.txt.gz
CPU times: user 33min 49s, sys: 4.01 s, total: 33min 53s
Wall time: 34min 12s
Currently working on: World_171119_212515.txt.gz
Input: /media/abiricz/Elements/twitter/data/World_171119_212515.txt.gz
CPU times: user 33min 42s, sys:

In [30]:
for file in out_files:
    file.close()