In [1]:
import json
import glob, os
import bz2
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np



In [2]:
 keywords = ["will donate", 
             "'ll donate", 
             "’ll donate", 
            "donate $",
            "#donate"]

In [3]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', -1)
    print(x)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')

In [4]:
fullArchiveData = pd.read_csv('./fullArchiveTweets.csv', sep = '\t')

In [5]:
filenames = []

for root, dirs, files in os.walk("./"):
    for file in files:
        if file.endswith(".json.bz2"):
            filename = os.path.join(root, file) 
            filenames.append(filename)


In [6]:
def get_json (filename):
    newfilename = filename[:-4] # get rid of .bz3 file extension
    def load_json (filepath):
        data = []
        with open(filepath) as f:
            for line in f:
                data.append(json.loads(line))
        return data
    
    if os.path.exists(newfilename):
        return json_normalize(load_json(newfilename))
    else:
        zipfile = bz2.BZ2File(filename)
        data = zipfile.read()
        open(newfilename, 'wb').write(data)
        return json_normalize(load_json(newfilename))
        

In [7]:
def filter_df (df):
    df = df[df.lang == 'en'] # English tweets only
    df = df[df['retweeted_status.id'].isnull()] # Non retweets only
    df.reset_index(inplace=True)
    return df


In [8]:
def manage_text(df):
    copy = df
    for index, row in df.iterrows():
        if pd.isnull(row['extended_tweet.full_text']):
            copy.at[index,'extended_tweet.full_text'] = row['text']
    copy =  df[pd.notnull(copy['extended_tweet.full_text'])]
    return copy

In [9]:
def search_df (df):
    print ("start search_df: ", len(df))
    result  = pd.DataFrame()
    for keyword in keywords:
        copy = df[df['extended_tweet.full_text'].str.contains(r'\s*\b'+keyword+r'\W\s*', case=False, regex=True)]
        copy = copy[copy['extended_tweet.full_text'].str.contains('retweet|RT|like|each|every|re-tweet|follow', case=False)]
        result = pd.concat([copy, result])
    print ("search_df: ", len(result))
    return result



In [10]:
def process_df_from_filename(filename):
    df = get_json(filename)
    df = filter_df(df)
    df = manage_text(df)
    return search_df(df)

In [11]:
totalResult = pd.DataFrame()
for filename in filenames:
    print (filename)
    result = process_df_from_filename(filename)
    totalResult = pd.concat([result, totalResult])
    print (len(totalResult))
totalResult = totalResult.drop_duplicates(subset='id', keep="first")

./2017/07/01/03/23.json.bz2
start search_df:  230
search_df:  0
0
./2017/07/01/03/22.json.bz2
start search_df:  222
search_df:  0
0
./2017/07/01/03/54.json.bz2


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """


start search_df:  194
search_df:  0
0
./2017/07/01/03/55.json.bz2
start search_df:  231
search_df:  0
0
./2017/07/01/03/29.json.bz2
start search_df:  248
search_df:  0
0
./2017/07/01/03/28.json.bz2
start search_df:  246
search_df:  0
0
./2017/07/01/03/17.json.bz2
start search_df:  219
search_df:  0
0
./2017/07/01/03/16.json.bz2
start search_df:  208
search_df:  0
0
./2017/07/01/03/34.json.bz2
start search_df:  219
search_df:  0
0
./2017/07/01/03/35.json.bz2
start search_df:  219
search_df:  0
0
./2017/07/01/03/43.json.bz2
start search_df:  225
search_df:  0
0
./2017/07/01/03/42.json.bz2
start search_df:  205
search_df:  0
0
./2017/07/01/03/49.json.bz2
start search_df:  202
search_df:  0
0
./2017/07/01/03/48.json.bz2
start search_df:  182
search_df:  0
0
./2017/07/01/03/00.json.bz2
start search_df:  401
search_df:  0
0
./2017/07/01/03/01.json.bz2
start search_df:  332
search_df:  0
0
./2017/07/01/03/33.json.bz2
start search_df:  251
search_df:  0
0
./2017/07/01/03/32.json.bz2
start sear

start search_df:  259
search_df:  0
0
./2017/07/01/05/34.json.bz2
start search_df:  251
search_df:  0
0
./2017/07/01/05/35.json.bz2
start search_df:  244
search_df:  0
0
./2017/07/01/05/43.json.bz2
start search_df:  256
search_df:  0
0
./2017/07/01/05/42.json.bz2
start search_df:  265
search_df:  0
0
./2017/07/01/05/49.json.bz2
start search_df:  244
search_df:  0
0
./2017/07/01/05/48.json.bz2
start search_df:  263
search_df:  0
0
./2017/07/01/05/00.json.bz2
start search_df:  356
search_df:  0
0
./2017/07/01/05/01.json.bz2
start search_df:  236
search_df:  0
0
./2017/07/01/05/33.json.bz2
start search_df:  285
search_df:  0
0
./2017/07/01/05/32.json.bz2
start search_df:  226
search_df:  0
0
./2017/07/01/05/44.json.bz2
start search_df:  231
search_df:  0
0
./2017/07/01/05/45.json.bz2
start search_df:  311
search_df:  0
0
./2017/07/01/05/39.json.bz2
start search_df:  246
search_df:  0
0
./2017/07/01/05/38.json.bz2
start search_df:  230
search_df:  0
0
./2017/07/01/05/07.json.bz2
start sear

start search_df:  373
search_df:  0
1
./2017/07/01/20/48.json.bz2
start search_df:  355
search_df:  0
1
./2017/07/01/20/00.json.bz2
start search_df:  459
search_df:  0
1
./2017/07/01/20/01.json.bz2
start search_df:  329
search_df:  0
1
./2017/07/01/20/33.json.bz2
start search_df:  344
search_df:  0
1
./2017/07/01/20/32.json.bz2
start search_df:  322
search_df:  0
1
./2017/07/01/20/44.json.bz2
start search_df:  317
search_df:  0
1
./2017/07/01/20/45.json.bz2
start search_df:  391
search_df:  0
1
./2017/07/01/20/39.json.bz2
start search_df:  327
search_df:  0
1
./2017/07/01/20/38.json.bz2
start search_df:  327
search_df:  0
1
./2017/07/01/20/07.json.bz2
start search_df:  327
search_df:  0
1
./2017/07/01/20/06.json.bz2
start search_df:  319
search_df:  0
1
./2017/07/01/20/24.json.bz2
start search_df:  311
search_df:  0
1
./2017/07/01/20/25.json.bz2
start search_df:  310
search_df:  0
1
./2017/07/01/20/53.json.bz2
start search_df:  347
search_df:  0
1
./2017/07/01/20/52.json.bz2
start sear

start search_df:  398
search_df:  0
1
./2017/07/01/11/44.json.bz2
start search_df:  420
search_df:  0
1
./2017/07/01/11/45.json.bz2
start search_df:  434
search_df:  0
1
./2017/07/01/11/39.json.bz2
start search_df:  370
search_df:  0
1
./2017/07/01/11/38.json.bz2
start search_df:  392
search_df:  0
1
./2017/07/01/11/07.json.bz2
start search_df:  406
search_df:  0
1
./2017/07/01/11/06.json.bz2
start search_df:  367
search_df:  0
1
./2017/07/01/11/24.json.bz2
start search_df:  400
search_df:  0
1
./2017/07/01/11/25.json.bz2
start search_df:  410
search_df:  0
1
./2017/07/01/11/53.json.bz2
start search_df:  364
search_df:  0
1
./2017/07/01/11/52.json.bz2
start search_df:  371
search_df:  0
1
./2017/07/01/11/10.json.bz2
start search_df:  438
search_df:  0
1
./2017/07/01/11/11.json.bz2
start search_df:  369
search_df:  0
1
./2017/07/01/11/59.json.bz2
start search_df:  376
search_df:  0
1
./2017/07/01/11/58.json.bz2
start search_df:  340
search_df:  0
1
./2017/07/01/11/03.json.bz2
start sear

1
./2017/07/01/17/07.json.bz2
start search_df:  314
search_df:  0
1
./2017/07/01/17/06.json.bz2
start search_df:  343
search_df:  0
1
./2017/07/01/17/24.json.bz2
start search_df:  289
search_df:  0
1
./2017/07/01/17/25.json.bz2
start search_df:  324
search_df:  0
1
./2017/07/01/17/53.json.bz2
start search_df:  315
search_df:  0
1
./2017/07/01/17/52.json.bz2
start search_df:  315
search_df:  0
1
./2017/07/01/17/10.json.bz2
start search_df:  345
search_df:  0
1
./2017/07/01/17/11.json.bz2
start search_df:  348
search_df:  0
1
./2017/07/01/17/59.json.bz2
start search_df:  304
search_df:  0
1
./2017/07/01/17/58.json.bz2
start search_df:  327
search_df:  0
1
./2017/07/01/17/03.json.bz2
start search_df:  363
search_df:  0
1
./2017/07/01/17/02.json.bz2
start search_df:  341
search_df:  0
1
./2017/07/01/17/37.json.bz2
start search_df:  312
search_df:  0
1
./2017/07/01/17/36.json.bz2
start search_df:  312
search_df:  0
1
./2017/07/01/17/09.json.bz2
start search_df:  308
search_df:  0
1
./2017/0

start search_df:  345
search_df:  0
1
./2017/07/01/19/52.json.bz2
start search_df:  348
search_df:  0
1
./2017/07/01/19/10.json.bz2
start search_df:  318
search_df:  0
1
./2017/07/01/19/11.json.bz2
start search_df:  334
search_df:  0
1
./2017/07/01/19/59.json.bz2
start search_df:  328
search_df:  0
1
./2017/07/01/19/58.json.bz2
start search_df:  297
search_df:  0
1
./2017/07/01/19/03.json.bz2
start search_df:  343
search_df:  0
1
./2017/07/01/19/02.json.bz2
start search_df:  319
search_df:  0
1
./2017/07/01/19/37.json.bz2
start search_df:  303
search_df:  0
1
./2017/07/01/19/36.json.bz2
start search_df:  339
search_df:  0
1
./2017/07/01/19/09.json.bz2
start search_df:  328
search_df:  0
1
./2017/07/01/19/08.json.bz2
start search_df:  342
search_df:  0
1
./2017/07/01/19/40.json.bz2
start search_df:  361
search_df:  0
1
./2017/07/01/19/41.json.bz2
start search_df:  307
search_df:  0
1
./2017/07/01/19/14.json.bz2
start search_df:  315
search_df:  0
1
./2017/07/01/19/15.json.bz2
start sear

start search_df:  306
search_df:  0
2
./2017/07/01/07/03.json.bz2
start search_df:  313
search_df:  0
2
./2017/07/01/07/02.json.bz2
start search_df:  327
search_df:  0
2
./2017/07/01/07/37.json.bz2
start search_df:  341
search_df:  0
2
./2017/07/01/07/36.json.bz2
start search_df:  345
search_df:  0
2
./2017/07/01/07/09.json.bz2
start search_df:  335
search_df:  0
2
./2017/07/01/07/08.json.bz2
start search_df:  333
search_df:  0
2
./2017/07/01/07/40.json.bz2
start search_df:  368
search_df:  0
2
./2017/07/01/07/41.json.bz2
start search_df:  342
search_df:  0
2
./2017/07/01/07/14.json.bz2
start search_df:  311
search_df:  0
2
./2017/07/01/07/15.json.bz2
start search_df:  378
search_df:  0
2
./2017/07/01/07/20.json.bz2
start search_df:  333
search_df:  0
2
./2017/07/01/07/21.json.bz2
start search_df:  311
search_df:  0
2
./2017/07/01/07/57.json.bz2
start search_df:  326
search_df:  0
2
./2017/07/01/07/56.json.bz2
start search_df:  355
search_df:  0
2
./2017/07/01/07/13.json.bz2
start sear

2
./2017/07/01/08/17.json.bz2
start search_df:  404
search_df:  0
2
./2017/07/01/08/16.json.bz2
start search_df:  399
search_df:  0
2
./2017/07/01/08/34.json.bz2
start search_df:  370
search_df:  0
2
./2017/07/01/08/35.json.bz2
start search_df:  363
search_df:  0
2
./2017/07/01/08/43.json.bz2
start search_df:  359
search_df:  0
2
./2017/07/01/08/42.json.bz2
start search_df:  357
search_df:  0
2
./2017/07/01/08/49.json.bz2
start search_df:  346
search_df:  0
2
./2017/07/01/08/48.json.bz2
start search_df:  404
search_df:  0
2
./2017/07/01/08/00.json.bz2
start search_df:  547
search_df:  0
2
./2017/07/01/08/01.json.bz2
start search_df:  436
search_df:  0
2
./2017/07/01/08/33.json.bz2
start search_df:  379
search_df:  0
2
./2017/07/01/08/32.json.bz2
start search_df:  377
search_df:  0
2
./2017/07/01/08/44.json.bz2
start search_df:  343
search_df:  0
2
./2017/07/01/08/45.json.bz2
start search_df:  418
search_df:  0
2
./2017/07/01/08/39.json.bz2
start search_df:  365
search_df:  0
2
./2017/0

start search_df:  243
search_df:  0
3
./2017/07/01/06/42.json.bz2
start search_df:  276
search_df:  0
3
./2017/07/01/06/49.json.bz2
start search_df:  286
search_df:  0
3
./2017/07/01/06/48.json.bz2
start search_df:  283
search_df:  0
3
./2017/07/01/06/00.json.bz2
start search_df:  475
search_df:  0
3
./2017/07/01/06/01.json.bz2
start search_df:  332
search_df:  0
3
./2017/07/01/06/33.json.bz2
start search_df:  270
search_df:  0
3
./2017/07/01/06/32.json.bz2
start search_df:  279
search_df:  0
3
./2017/07/01/06/44.json.bz2
start search_df:  286
search_df:  0
3
./2017/07/01/06/45.json.bz2
start search_df:  313
search_df:  0
3
./2017/07/01/06/39.json.bz2
start search_df:  276
search_df:  0
3
./2017/07/01/06/38.json.bz2
start search_df:  281
search_df:  0
3
./2017/07/01/06/07.json.bz2
start search_df:  280
search_df:  0
3
./2017/07/01/06/06.json.bz2
start search_df:  286
search_df:  0
3
./2017/07/01/06/24.json.bz2
start search_df:  271
search_df:  0
3
./2017/07/01/06/25.json.bz2
start sear

start search_df:  399
search_df:  0
3
./2017/07/01/15/33.json.bz2
start search_df:  360
search_df:  0
3
./2017/07/01/15/32.json.bz2
start search_df:  359
search_df:  0
3
./2017/07/01/15/44.json.bz2
start search_df:  338
search_df:  0
3
./2017/07/01/15/45.json.bz2
start search_df:  385
search_df:  0
3
./2017/07/01/15/39.json.bz2
start search_df:  356
search_df:  0
3
./2017/07/01/15/38.json.bz2
start search_df:  335
search_df:  0
3
./2017/07/01/15/07.json.bz2
start search_df:  359
search_df:  0
3
./2017/07/01/15/06.json.bz2
start search_df:  375
search_df:  0
3
./2017/07/01/15/24.json.bz2
start search_df:  322
search_df:  0
3
./2017/07/01/15/25.json.bz2
start search_df:  376
search_df:  0
3
./2017/07/01/15/53.json.bz2
start search_df:  325
search_df:  0
3
./2017/07/01/15/52.json.bz2
start search_df:  313
search_df:  0
3
./2017/07/01/15/10.json.bz2
start search_df:  330
search_df:  0
3
./2017/07/01/15/11.json.bz2
start search_df:  346
search_df:  0
3
./2017/07/01/15/59.json.bz2
start sear

./2017/07/01/13/39.json.bz2
start search_df:  330
search_df:  0
3
./2017/07/01/13/38.json.bz2
start search_df:  414
search_df:  0
3
./2017/07/01/13/07.json.bz2
start search_df:  402
search_df:  0
3
./2017/07/01/13/06.json.bz2
start search_df:  362
search_df:  0
3
./2017/07/01/13/24.json.bz2
start search_df:  343
search_df:  0
3
./2017/07/01/13/25.json.bz2
start search_df:  380
search_df:  0
3
./2017/07/01/13/53.json.bz2
start search_df:  400
search_df:  0
3
./2017/07/01/13/52.json.bz2
start search_df:  344
search_df:  0
3
./2017/07/01/13/10.json.bz2
start search_df:  399
search_df:  0
3
./2017/07/01/13/11.json.bz2
start search_df:  354
search_df:  0
3
./2017/07/01/13/59.json.bz2
start search_df:  374
search_df:  0
3
./2017/07/01/13/58.json.bz2
start search_df:  347
search_df:  0
3
./2017/07/01/13/03.json.bz2
start search_df:  387
search_df:  0
3
./2017/07/01/13/02.json.bz2
start search_df:  378
search_df:  0
3
./2017/07/01/13/37.json.bz2
start search_df:  368
search_df:  0
3
./2017/07/

start search_df:  325
search_df:  0
3
./2017/07/01/22/25.json.bz2
start search_df:  295
search_df:  0
3
./2017/07/01/22/53.json.bz2
start search_df:  282
search_df:  0
3
./2017/07/01/22/52.json.bz2
start search_df:  283
search_df:  0
3
./2017/07/01/22/10.json.bz2
start search_df:  314
search_df:  0
3
./2017/07/01/22/11.json.bz2
start search_df:  342
search_df:  0
3
./2017/07/01/22/59.json.bz2
start search_df:  519
search_df:  0
3
./2017/07/01/22/58.json.bz2
start search_df:  502
search_df:  0
3
./2017/07/01/22/03.json.bz2
start search_df:  316
search_df:  0
3
./2017/07/01/22/02.json.bz2
start search_df:  321
search_df:  0
3
./2017/07/01/22/37.json.bz2
start search_df:  323
search_df:  0
3
./2017/07/01/22/36.json.bz2
start search_df:  304
search_df:  0
3
./2017/07/01/22/09.json.bz2
start search_df:  286
search_df:  0
3
./2017/07/01/22/08.json.bz2
start search_df:  351
search_df:  0
3
./2017/07/01/22/40.json.bz2
start search_df:  306
search_df:  0
3
./2017/07/01/22/41.json.bz2
start sear

In [14]:
print_full (totalResult['id'])

212   881,054,479,029,747,712.00
26    881,135,563,310,592,000.00
192   881,117,112,596,537,344.00
Name: id, dtype: float64


In [15]:
print_full( totalResult['extended_tweet.full_text'])

212    @LBC No way ! I will donate to the uk patreon;) get us out of the horrid EU cartel                                                          
26     Help Mission 22 earn up to $1,000 by liking this post!\n\nAs part of #AuchinachieCares, we'll donate $1 for every... https://t.co/PS98jkWfwX
192    I WILL DONATE MONEY TO GET UNDERSHIRTS FOR MONSTA X\n\none retweet=one dollar                                                               
Name: extended_tweet.full_text, dtype: object


In [16]:
totalResult.to_csv("fullArchive2017.csv", sep='\t')