In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
import re
import gzip

### Read the data

In [2]:
!ls ../Data_process

all_nodupl.txt.gz   each_file_no_dupl.zip  keyfilter2  market.txt.gz
all.txt.gz	    each_file.zip	   keyfilter3  nfl.txt.gz
cinema.txt.gz	    election.txt.gz	   keyfilter4  physics.txt.gz
climate.txt.gz	    hungary.txt.gz	   keyfilter5  terror.txt.gz
concert.txt.gz	    keyfilter0		   keyfilter6
data_mining.txt.gz  keyfilter1		   keyfilter7


In [3]:
%%time
data = pd.read_csv( '../Data_process/all_nodupl.txt.gz', names=['Tweets'], compression='gzip' )

CPU times: user 3.33 s, sys: 170 ms, total: 3.5 s
Wall time: 3.5 s


In [4]:
data.head(), data.tail()

(                                              Tweets
 0  aaaaaaaaaaaaaa reveng of the sith palaba sa ci...
 1  aaaaaaaaaahhhhhh can hide my excit be see you ...
 2  aaaaaaaa could paus stan ikon don wanna be sad...
 3                                aaaaaaa miss physic
 4  aaaaaa grabe thank you lord afford po ticket p...,
                                                     Tweets
 2289719  zzzquil is benadryl diphenhydramin the same ex...
 2289720  zzz select data use to promulg climat antarct ...
 2289721    zzzz and peopl rip on mlb and nfl replay review
 2289722  zzzz como on mike for bcash to do wel you don ...
 2289723  zzzz the climat is primarili dictat by solar c...)

### Data processing

In [5]:
data_list = data.values.tolist()

In [6]:
%%time
list_of_words = []
for i in range( len(data_list) ):
    list_of_words.append( data_list[i][0].split() )

CPU times: user 3.73 s, sys: 500 ms, total: 4.23 s
Wall time: 4.23 s


In [7]:
np.shape( list_of_words )

(2289724,)

#### Filtering other kind of duplicates (duplicates are already filtered with uniq after sort)

In [8]:
%%time
list_of_words_filtered = []
for i in range( 1, len(list_of_words) ):
    if len(list_of_words[i]) == len(list_of_words[i-1])+1: # extra word at the end
        if ' '.join(list_of_words[i][:-1]) == ' '.join(list_of_words[i-1]): # difference only in last word
            next # skip this line
    else:
        list_of_words_filtered.append( list_of_words[i] )

CPU times: user 982 ms, sys: 0 ns, total: 982 ms
Wall time: 983 ms


In [9]:
np.shape( list_of_words_filtered )

(2124173,)

In [10]:
#file = open( '../Data/All_filtered.txt', mode='w' )

In [11]:
#%%time
#for i in range( len(list_of_words_filtered) ):
#    file.write( ' '.join(list_of_words_filtered[i]) )
#    file.write( '\n' )

### Create suitable sized dataset from parts

#### Locate and load

In [12]:
filenames = [ 'cinema', 'climate', 'concert', 'data_mining', 'election', 
             'hungary', 'market', 'nfl', 'physics', 'terror' ]
dirnames = [ 'keyfilter0', 'keyfilter1', 'keyfilter2', 'keyfilter3', 'keyfilter4', 
            'keyfilter5', 'keyfilter6', 'keyfilter7']

In [13]:
data = pd.read_csv( '../Data_process/'+dirnames[0]+'/'+filenames[0]+'.txt', names=['Tweets'] )

In [14]:
data.head()

Unnamed: 0,Tweets
0,good morn yeah had great weekend got day off a...
1,victoria and muhammad sf world cinema in pathu...
2,at golden screen cinema gsc in melaka
3,youth team at work earli this morn on set at l...
4,woman just came and sat right next to me in th...


#### Filter exact duplicate lines

In [15]:
np.shape(data['Tweets']), np.shape( data['Tweets'].unique() ), data['Tweets'].unique()[0]

((19681,),
 (17458,),
 'good morn yeah had great weekend got day off and enjoy the beuti weather and went to the cinemar')

In [16]:
%%time
list_of_words = []
data_unique = data['Tweets'].unique()
for i in range( len(data_unique) ):
    list_of_words.append( data_unique[i].split() )
print( list_of_words[0] )

['good', 'morn', 'yeah', 'had', 'great', 'weekend', 'got', 'day', 'off', 'and', 'enjoy', 'the', 'beuti', 'weather', 'and', 'went', 'to', 'the', 'cinemar']
CPU times: user 227 ms, sys: 0 ns, total: 227 ms
Wall time: 225 ms


#### Filter same tweets differing only in the last word

In [17]:
%%time
list_of_words_filtered = []
list_of_words_filtered.append( list_of_words[0] ) # add first row by hand
for i in range( 1, len(list_of_words) ):
    if len(list_of_words[i]) == len(list_of_words[i-1])+1: # extra word at the end
        if ' '.join(list_of_words[i][:-1]) == ' '.join(list_of_words[i-1]): # difference only in last word
            next # skip this line
    else:
        list_of_words_filtered.append( list_of_words[i] )

CPU times: user 560 ms, sys: 116 ms, total: 676 ms
Wall time: 673 ms


In [18]:
np.shape( list_of_words_filtered ), list_of_words_filtered[0]

((16445,),
 ['good',
  'morn',
  'yeah',
  'had',
  'great',
  'weekend',
  'got',
  'day',
  'off',
  'and',
  'enjoy',
  'the',
  'beuti',
  'weather',
  'and',
  'went',
  'to',
  'the',
  'cinemar'])

#### Filter stopwords

In [20]:
# Stop words
stop_words = set(stopwords.words('english')) 

In [21]:
def text_no_stopwords( tokenized_text ):
    return [ w for w in tokenized_text if not w in stop_words ] 

In [22]:
text_no_stopwords( list_of_words_filtered[0] ) # remove stop words

['good',
 'morn',
 'yeah',
 'great',
 'weekend',
 'got',
 'day',
 'enjoy',
 'beuti',
 'weather',
 'went',
 'cinemar']

### Construct functions

In [24]:
def load_process_save( file_input, file_output, topic, every=1 ):
    data = pd.read_csv( file_input, names=['Tweets'] )
    
    # Filter exact duplicate lines
    list_of_words = []
    data_unique = data['Tweets'].unique()
    for i in range( len(data_unique) ):
        list_of_words.append( data_unique[i].split() )
    
    # Filter same tweets differing only in the last word
    list_of_words_filtered = []
    list_of_words_filtered.append( list_of_words[0] ) # add first row by hand
    for i in range( 1, len(list_of_words) ):
        if len(list_of_words[i]) == len(list_of_words[i-1])+1: # extra word at the end
            if ' '.join(list_of_words[i][:-1]) == ' '.join(list_of_words[i-1]): # difference only in last word
                next # skip this line
        else:
            list_of_words_filtered.append( list_of_words[i] )
    
    # Remove stopwords
    for i in range( len(list_of_words_filtered) ):
        list_of_words_filtered[i] = text_no_stopwords( list_of_words_filtered[i] )
    
    # Write to file
    for i in range( len(list_of_words_filtered) ):
        if i % every == 0:
            file_output.write( ' '.join(list_of_words_filtered[i]).encode() ) # tweet
            file_output.write( ('\t'+topic).encode() ) # target
            file_output.write( '\n'.encode() ) # newline

#### Write each topic to separate files gzipped

In [25]:
%%time
# open all output files
file_output_list = []
for i in filenames: 
    file_output_list.append( gzip.open('../Data_process/'+i+'_nostop.txt.gz', mode='wb') )

# do the process
for k in range( len(dirnames) ):
    for l in range( len(filenames) ):
        load_process_save( file_input='../Data_process/'+dirnames[k]+'/'+filenames[l]+'.txt', 
                          file_output=file_output_list[l], 
                          topic=filenames[l],
                          every=1 )
# close all output files
for i in file_output_list:
    i.close()

CPU times: user 49.1 s, sys: 62.4 ms, total: 49.2 s
Wall time: 49.6 s


#### Write each topic to one file (only EVERY 10 tweet is written to file: $2 \cdot 10^5$ documents kept)

In [53]:
%%time
# open the output file
file_output = gzip.open('../Data/Few_nostop.txt.gz', mode='wb')

# do the process
for k in range( len(dirnames) ):
    for l in range( len(filenames) ):
        load_process_save( file_input='../Data_process/'+dirnames[k]+'/'+filenames[l]+'.txt', 
                          file_output=file_output, 
                          topic=filenames[l],
                          every=100 )
# close the output file
file_output.close()

CPU times: user 14.1 s, sys: 76.6 ms, total: 14.2 s
Wall time: 14.3 s


#### After uniting the files it is needed to filter duplicates again

Unzipping

In [54]:
!gunzip ../Data/Few_nostop.txt.gz

In [55]:
!wc -l ../Data/Few_nostop.txt

22113 ../Data/Few_nostop.txt


In [56]:
!ls ../Data/Few*

../Data/Few_filtered.txt.gz  ../Data/Few_nostop.txt


Printing the last 20 tweets to console

In [57]:
!sort ../Data/Few_nostop.txt | uniq | cat | tail -n 20

zac brown band extrem generous perform benefit concert gatlinb	concert
zack snyder cinematograph though work larri fong except cinematograph	cinema
zanfona nois concert atlanta ashevill negativeland insid proyect nigthmar noi	concert
zanu primari elect figur drop signific term number voter particip co	election
zavala world zavala world elect time excel	election
zeke could process clear name expos nfl	nfl
zero desir watch nfl hand ne anoth sb	nfl
zero emot invest marvel cinemat univers super conveni would recommend	cinema
zero evid outcom elect	election
zero nra member commit terrorist event mass shoot terrorist attack	terror
zilker media digit market team confer week follow	market
zimbabw armi bloodless coup armi pave way democrat fraud free peac elect	election
zimbo master trade onlin elect analysi constitut law cholera epidermiolog	election
zim elect joint iri ndi elect observ mission statement constitut court decis	election
zoe duck watch parti ducksthehallstmp edward 

#### Finally save the file

In [58]:
!sort ../Data/Few_nostop.txt | uniq | cat >> ../Data/Few_filtered_nostop.txt

In [59]:
!wc -l ../Data/Few_nostop.txt

22113 ../Data/Few_nostop.txt


In [60]:
!wc -l ../Data/Few_filtered_nostop.txt

22052 ../Data/Few_filtered_nostop.txt


In [61]:
!rm ../Data/Few_nostop.txt

In [62]:
!gzip ../Data/Few_filtered_nostop.txt

#### Check if everything is done correctly

In [63]:
data = pd.read_csv( '../Data/Few_filtered_nostop.txt.gz', names=['Tweets', 'Targets'], 
                   compression='gzip', delimiter='\t' )

In [64]:
data.head()

Unnamed: 0,Tweets,Targets
0,aaaand back twittertimeout funni nazi like wan...,hungary
1,aadhar link mobil bank mandatori aadhar link v...,election
2,aa japan new kenyan market robust partnership ...,market
3,aam jan win elect elect,election
4,aaog aaog anyday market realis drill nail cert...,market


In [52]:
np.shape(data)

(218640, 2)