In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords 
import re
import json
import gzip
import csv
import io
from polyglot.detect import Detector
import icu
from pprint import pprint
import datetime

NLTK tokenizer API reference: https://www.nltk.org/api/nltk.tokenize.html

In [2]:
# Tweet-aware tokenizer: case is not preserved and @handles are stripped;
# reduce_len is switched on as well.
tweet_tokenizer = TweetTokenizer(
    reduce_len=True,
    strip_handles=True,
    preserve_case=False,
)

# English Snowball stemmer (configured to ignore stop words) and a WordNet
# lemmatizer, both compared on sample tokens further down.
snow = SnowballStemmer(language='english', ignore_stopwords=True)
lemma = nltk.wordnet.WordNetLemmatizer()

# English stop words, held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [3]:
def text_tokenizer( text ):
    '''Tokenize a tweet and keep only its alphabetic words.

    The text is split with the module-level ``tweet_tokenizer``. A token is
    kept when it starts with letters, optionally preceded by ``#``; only that
    leading run of letters is kept and the ``#`` is dropped. Tokens whose
    leading letters start with ``http`` (URLs) and words shorter than two
    characters are discarded.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    list of str
        The surviving words, in their original order.
    '''
    words = []
    for token in tweet_tokenizer.tokenize(text):
        # Raw strings keep the regex escapes from being read as (invalid)
        # Python string escapes.
        match = re.match(r'#?[A-Za-z]+', token)
        if match is None:
            continue  # token does not start with a letter or '#letter'
        candidate = match.group(0)
        if re.match(r'https?', candidate) is not None:
            continue  # leading part of a URL
        word = re.findall(r'\w+', candidate)[0]  # strips the leading '#'
        if len(word) > 1:  # drop one-letter words
            words.append(word)
    return words

In [4]:
def text_stemmer( tokenized_text ):
    '''Stem every word of ``tokenized_text`` with the module-level ``snow`` stemmer.

    Returns a new list with one stem per input word, in the same order.
    '''
    stems = []
    for token in tokenized_text:
        stems.append(snow.stem(token))
    return stems

In [5]:
def text_no_stopwords( tokenized_text ):
    '''Return the words of ``tokenized_text`` that are not in ``stop_words``.

    The order of the remaining words is preserved.
    '''
    kept = []
    for token in tokenized_text:
        if token in stop_words:
            continue
        kept.append(token)
    return kept

In [6]:
def language_detector(text, min_confidence=90):
    '''Returns language type as string or 'un' if unknown.

    The first (top) language reported by polyglot's ``Detector`` is used.

    Parameters
    ----------
    text : str
        Text whose language should be guessed.
    min_confidence : float, optional
        The top guess is accepted only when its confidence is strictly
        greater than this value (presumably a percentage -- confirm against
        the polyglot documentation). Defaults to 90.

    Returns
    -------
    str
        The locale of the top guess converted to a string (e.g. 'en'), or
        'un' when the confidence is too low or detection raised an error.
    '''
    try:
        best_guess = Detector(text, quiet=True).languages[0]
        if best_guess.confidence > min_confidence:
            return str(best_guess.locale)
        return 'un'
    except Exception as e:
        # Best effort on purpose: a tweet that cannot be analysed is reported
        # and treated as "unknown language" instead of aborting the run.
        print("Error: " + str(e))
        return 'un'

In [7]:
def English_language_filtering( text ):
    '''Return True when ``language_detector`` labels ``text`` as 'en', else False.'''
    return language_detector(text) == 'en'

#### The text is tokenized first; @handles, the leading # of hashtags, numbers, URLs, other non-alphabetic tokens and one-letter words are then filtered out

In [8]:
# Sample tweet with digits, one-letter words, hashtags, an apostrophe,
# a truncation ellipsis and a URL.
test_tokens = text_tokenizer(
    text="3 class classes With f u i all due respect Your Holiness; don't support #GMOs. #LeptisMagna in #Libya was the breadbasket for #Europe. Lo… https://t.co/zwTYvJu0As"
)
test_tokens

['class',
 'classes',
 'with',
 'all',
 'due',
 'respect',
 'your',
 'holiness',
 'don',
 'support',
 'gmos',
 'leptismagna',
 'in',
 'libya',
 'was',
 'the',
 'breadbasket',
 'for',
 'europe',
 'lo']

#### Language detection works better after text is filtered

In [9]:
# The language check runs on the filtered tokens joined back into one
# space-separated string, not on the raw tweet.
English_language_filtering(text=' '.join(test_tokens))

True

#### Stemming
https://stackoverflow.com/questions/24647400/what-is-the-best-stemming-method-in-python

In [10]:
# Show what the Snowball stemmer does to each sample token. `snow` is the
# stemmer already created in the setup cell with the same arguments, so it is
# not instantiated a second time here.
for word in test_tokens:
    print( word,'->',snow.stem(word) )

class -> class
classes -> class
with -> with
all -> all
due -> due
respect -> respect
your -> your
holiness -> holi
don -> don
support -> support
gmos -> gmos
leptismagna -> leptismagna
in -> in
libya -> libya
was -> was
the -> the
breadbasket -> breadbasket
for -> for
europe -> europ
lo -> lo


In [11]:
# Download the WordNet corpus for the WordNet lemmatizer used in the next cell
# (the recorded output shows it is a no-op when the package is up to date).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/abiricz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
# Same comparison with the WordNet lemmatizer. `lemma` is the instance already
# created in the setup cell, so it is not built a second time here.
for word in test_tokens:
    print( word,'->', lemma.lemmatize(word) )

class -> class
classes -> class
with -> with
all -> all
due -> due
respect -> respect
your -> your
holiness -> holiness
don -> don
support -> support
gmos -> gmos
leptismagna -> leptismagna
in -> in
libya -> libya
was -> wa
the -> the
breadbasket -> breadbasket
for -> for
europe -> europe
lo -> lo


#### Filter stop words

In [13]:
# Stop-word removal applied to the sample tokens.
text_no_stopwords(tokenized_text=test_tokens)

['class',
 'classes',
 'due',
 'respect',
 'holiness',
 'support',
 'gmos',
 'leptismagna',
 'libya',
 'breadbasket',
 'europe',
 'lo']

In [14]:
def do_tweet_processing( filename_in, filename_out, total_lines=10**8, report_every=1000000 ):
    '''Clean a gzip file of JSON tweets and write the English ones to a gzip file.

    Every input line is parsed as JSON and its ``'text'`` field is tokenized.
    When the re-joined tokens are detected as English, the tokens are stemmed,
    stop words are removed and the remaining words are written as one
    space-separated, newline-terminated, UTF-8 encoded line. Non-English
    tweets produce no output line.

    Parameters
    ----------
    filename_in : str
        Path of the gzip-compressed input, one JSON object per line.
    filename_out : str
        Path of the gzip-compressed output file (overwritten).
    total_lines : int, optional
        Expected number of input lines; only used for the "time to finish"
        estimate in the progress report. Defaults to 10**8.
    report_every : int, optional
        A progress line is printed every ``report_every`` input lines.
        Defaults to 1000000.

    Raises
    ------
    json.JSONDecodeError
        If a line is not valid JSON.
    KeyError
        If a parsed line has no ``'text'`` field.
    '''
    print('Input:', filename_in, '\t', 'Output:', filename_out)
    acctime = 0.0  # accumulated per-tweet processing time, in milliseconds

    # Both files are managed by the `with` statement, so the output is flushed
    # and closed even if a line fails to parse. The input is decoded as UTF-8
    # explicitly to match the UTF-8 output instead of depending on the locale.
    with gzip.open(filename_in, mode='rt', encoding='utf-8') as file_in, \
         gzip.open(filename_out, mode='wb') as file_out:
        for i, line in enumerate(file_in): # read one line at a time from a compressed file
            t1 = datetime.datetime.now()
            tweet = json.loads(line)['text'] # parse only that one line and get the tweet message
            tweet_tokenized = text_tokenizer(tweet) # tokenize tweet message

            # check process status
            if i % report_every == 0:
                remaining = max(total_lines - i, 0)
                print('status:', i, ', elapsed time:', np.round(acctime*0.001/60/60, 3), 'hour(s)', ', time to finish:', np.round(acctime*remaining/(i+1)*0.001/60/60, 1), 'hour(s)' )

            # Stemming and stop-word removal are only needed for tweets that
            # are kept, so they run after the language check.
            if English_language_filtering( ' '.join(tweet_tokenized) ):
                tweet_stemmed = text_stemmer( tweet_tokenized ) # stem tokenized words
                tweet_filtered = text_no_stopwords( tweet_stemmed ) # remove stop words
                file_out.write( (' '.join(tweet_filtered) + '\n').encode() )

            t2 = datetime.datetime.now()
            # total_seconds() covers the whole interval; `.microseconds` alone
            # is only the sub-second component and drops full seconds.
            acctime += (t2 - t1).total_seconds() * 1000

In [15]:
%%time
# Process part 1 of the tweet dump: the output file receives one line per
# tweet detected as English, tokenized, stemmed and stripped of stop words.
do_tweet_processing( 'World_twitterdata_part1.txt.gz', 'World_twitterdata_ready_part1.txt.gz' )

Input: World_twitterdata_part1.txt.gz 	 Output: World_twitterdata_ready_part1.txt.gz
status: 0 , elapsed time: 0.0 hour(s) , time to finish: 0.0 hour(s)
status: 1000000 , elapsed time: 0.091 hour(s) , time to finish: 9.0 hour(s)
status: 2000000 , elapsed time: 0.179 hour(s) , time to finish: 8.8 hour(s)
status: 3000000 , elapsed time: 0.268 hour(s) , time to finish: 8.7 hour(s)
status: 4000000 , elapsed time: 0.361 hour(s) , time to finish: 8.7 hour(s)
status: 5000000 , elapsed time: 0.444 hour(s) , time to finish: 8.4 hour(s)
status: 6000000 , elapsed time: 0.53 hour(s) , time to finish: 8.3 hour(s)
status: 7000000 , elapsed time: 0.617 hour(s) , time to finish: 8.2 hour(s)
status: 8000000 , elapsed time: 0.7 hour(s) , time to finish: 8.1 hour(s)
status: 9000000 , elapsed time: 0.789 hour(s) , time to finish: 8.0 hour(s)
status: 10000000 , elapsed time: 0.874 hour(s) , time to finish: 7.9 hour(s)
status: 11000000 , elapsed time: 0.957 hour(s) , time to finish: 7.7 hour(s)
status: 1200