# Curating TweetJSON with Python

### Packages

In [1]:
import json
from tkinter import filedialog as fd
import codecs
import io

### Step 1: Simplify TweetJSON

The <code>simplify()</code> function calls <code>remove()</code> and <code>qtd_twts()</code>. The first deletes unnecessary elements from each tweet and writes the simplified JSON into a new file with the suffix <code>_simple.jsonl</code>; the second writes all quoted tweets (tweet objects) into a new file (<code>_quoted.jsonl</code>), which can be reprocessed with <code>simplify()</code> until <code>qtd_twts()</code> returns an empty .jsonl file.

#### Open input file

In [None]:
# Pops up a GUI file dialog (fd.askopenfilename()) to pick the input file and
# stores the selected path in `inf`. The file is not opened here; the
# processing functions open it themselves with UTF-8 encoding.
inf = fd.askopenfilename(title='Select a *.JSON or *.JSONL (Tweet JSON) file to simplify')

In [None]:
def simplify(f):
    """Run both simplification passes over the Tweet JSON file at path `f`.

    First remove() writes a slimmed-down copy of every tweet, then
    qtd_twts() extracts the quoted tweets. Each pass prompts for its own
    output file name. Nothing is returned.
    """
    remove(f)
    qtd_twts(f)

In [None]:
def remove(file):
    """Write a slimmed-down copy of every tweet in a Tweet JSON Lines file.

    Prompts for an output name and appends one simplified tweet per line to
    ``<name>_simple.jsonl``. The output is opened in append mode, so reusing
    a name adds to the existing file.

    Args:
        file: Path of the input file holding one tweet object per line.

    Returns:
        None. Progress (the running tweet count) is printed every 1000 tweets.
    """
    # Prompts user to enter an output file name, which is saved in the current directory.
    out = input("How should I call the simplified version of this file? ([filename]_simple.jsonl): ")
    # Superfluous keys, grouped by the object they are removed from:
    # the tweet itself, its user, its entities, and the user's entities.
    t_del = (
        'id', 'truncated', 'in_reply_to_status_id', 'in_reply_to_user_id',
        'quoted_status_id', 'extended_entities', 'favorited', 'retweeted',
        'possibly_sensitive', 'filter_level', 'matching_rules',
        'current_user_retweet', 'scopes', 'withheld_copyright',
        'withheld_in_countries', 'withheld_scope', 'geo',
    )
    u_del = (
        'id', 'name', 'derived', 'url', 'protected', 'profile_banner_url',
        'profile_image_url_https', 'default_profile', 'default_profile_image',
        'withheld_in_countries', 'withheld_scope', 'utc_offset', 'time_zone',
        'lang', 'geo_enabled', 'following', 'follow_request_sent',
        'has_extended_profile', 'notifications', 'profile_location',
        'contributors_enabled', 'profile_image_url', 'profile_background_color',
        'profile_background_image_url', 'profile_background_image_url_https',
        'profile_background_tile', 'profile_link_color',
        'profile_sidebar_border_color', 'profile_sidebar_fill_color',
        'profile_text_color', 'profile_use_background_image', 'is_translator',
        'is_translation_enabled', 'translator_type',
    )
    e_del = ('media', 'url', 'urls', 'symbols', 'polls', 'description')
    u_e_del = ('description', 'media', 'url', 'urls')
    # The built-in open() is used for reading because iterating a
    # codecs.open() stream also breaks lines at characters such as U+2028,
    # which can occur unescaped inside tweet text and would cut a record in two.
    with open(file, encoding='utf-8') as f, \
            open(out + "_simple.jsonl", "a+", encoding="utf-8") as o:
        counter = 0
        for line in f:
            if not line.strip():
                # Blank lines (e.g. a trailing empty line) carry no tweet.
                continue
            counter += 1
            # json.loads() takes no `encoding` argument on Python 3.9+;
            # the text was already decoded as UTF-8 when the file was read.
            tweet = json.loads(line)
            # Missing or null sub-objects are treated as empty so that one
            # incomplete tweet does not abort the whole run.
            user = tweet.get('user') or {}
            targets = (
                (tweet, t_del),
                (user, u_del),
                (tweet.get('entities') or {}, e_del),
                (user.get('entities') or {}, u_e_del),
            )
            for obj, keys in targets:
                for key in keys:
                    obj.pop(key, None)
            # Re-encode keeping non-ASCII characters, one tweet per line.
            o.write(json.dumps(tweet, ensure_ascii=False) + '\n')
            if counter % 1000 == 0:
                print(counter)

In [None]:
def qtd_twts(file):
    """Extract every quoted tweet from a Tweet JSON Lines file.

    Prompts for an output name and appends to ``<name>_quoted.jsonl`` one
    line per tweet that carries a ``quoted_status`` object. Each line is the
    quoted tweet itself, extended with a ``quoted_by_id_str`` key holding the
    ``id_str`` of the quoting tweet. Every line is therefore a valid JSON
    tweet object that can be read back with json.loads().

    Args:
        file: Path of the input file holding one tweet object per line.

    Returns:
        None. Progress (the running tweet count) is printed every 1000 tweets.
    """
    out = input("How should I call the output file containing QUOTED TWEETS? ([filename]_quoted.jsonl) ")
    # The built-in open() only splits on real newlines; a codecs.open()
    # stream would also break records at characters such as U+2028.
    with open(file, encoding='utf-8') as f, \
            open(out + "_quoted.jsonl", "a+", encoding="utf-8") as o:
        counter = 0
        for line in f:
            if not line.strip():
                continue
            counter += 1
            # json.loads() takes no `encoding` argument on Python 3.9+.
            tweet = json.loads(line)
            quoted = tweet.get('quoted_status')
            if isinstance(quoted, dict):
                # Keep the link back to the quoting tweet inside the object,
                # so the line stays one well-formed JSON document.
                quoted['quoted_by_id_str'] = tweet['id_str']
                o.write(json.dumps(quoted, ensure_ascii=False) + '\n')
            if counter % 1000 == 0:
                print(counter)

In [None]:
simplify(inf)  # Runs the main function, which calls the two sub-functions remove() and qtd_twts() on the selected file.

## Step 2: Create Separate Files for Each Object Type

In [None]:
# Pops up a GUI file dialog and stores the path of the chosen input file in `file_in`.
file_in = fd.askopenfilename(title='Select a *_simple.JSONL file as input')
def objects(infile):
    """Split the tweet file at path `infile` into one file per object type.

    Runs objtweets(), objusers(), objmentions() and objhashtags() in that
    order on the same input; each of them prompts for its own output file
    name. Nothing is returned.
    """
    objtweets(infile)
    objusers(infile)
    objmentions(infile)
    objhashtags(infile)
def objtweets(inf):
    """Write the bare tweet objects of a Tweet JSON Lines file to a new file.

    Prompts for an output name and appends to ``<name>_tweets.jsonl`` one
    line per input tweet, with the nested ``user``, ``entities`` and
    ``quoted_status`` objects removed.

    Args:
        inf: Path of the input file holding one tweet object per line.

    Returns:
        None. Progress (the running tweet count) is printed every 1000 tweets.
    """
    out = input("How should we call the output file containing TWEETS ([filename]_tweets.jsonl): ")
    tw_del = ('user', 'entities', 'quoted_status')
    # The built-in open() only splits on real newlines; a codecs.open()
    # stream would also break records at characters such as U+2028.
    with open(inf, encoding='utf-8') as f, \
            open(out + "_tweets.jsonl", "a+", encoding="utf-8") as o:
        counter = 0
        for line in f:
            if not line.strip():
                continue
            counter += 1
            # json.loads() takes no `encoding` argument on Python 3.9+.
            tweet = json.loads(line)
            for key in tw_del:
                # pop() with a default: most tweets have no 'quoted_status'.
                tweet.pop(key, None)
            # Written once per tweet, after all nested objects are removed.
            o.write(json.dumps(tweet, ensure_ascii=False) + '\n')
            if counter % 1000 == 0:
                print(counter)
def objusers(inf):
    """Write each distinct user of a Tweet JSON Lines file to a new file.

    Prompts for an output name and appends to ``<name>_users.jsonl`` the
    ``user`` object of every tweet whose user ``id_str`` has not been seen
    earlier in the same run (first occurrence wins). Tweets without a
    ``user`` object are skipped.

    Args:
        inf: Path of the input file holding one tweet object per line.

    Returns:
        None. Progress (the running tweet count) is printed every 1000 tweets.
    """
    out = input("How should we call the output file containing USERS ([filename]_users.jsonl): ")
    # The built-in open() only splits on real newlines; a codecs.open()
    # stream would also break records at characters such as U+2028.
    with open(inf, encoding='utf-8') as f, \
            open(out + "_users.jsonl", "a+", encoding="utf-8") as o:
        counter = 0
        # A set keeps the "already written?" check constant-time.
        seen_ids = set()
        for line in f:
            if not line.strip():
                continue
            counter += 1
            # json.loads() takes no `encoding` argument on Python 3.9+.
            tweet = json.loads(line)
            user = tweet.get('user')
            if isinstance(user, dict):
                user_id = user.get('id_str')
                if user_id not in seen_ids:
                    seen_ids.add(user_id)
                    o.write(json.dumps(user, ensure_ascii=False) + '\n')
            if counter % 1000 == 0:
                print(counter)
def objmentions(file):
    """Write the user mentions of every tweet to a new JSON Lines file.

    Prompts for an output name and appends to ``<name>_mentions.jsonl`` one
    JSON object per input tweet of the form
    ``{"id_str": <tweet id>, "user_mentions": [...]}``. A tweet without
    ``entities`` or without mentions gets an empty list.

    Args:
        file: Path of the input file holding one tweet object per line.

    Returns:
        None. Progress (the running tweet count) is printed every 1000 tweets.
    """
    out = input("How should I call the MENTIONS file? ([filename]_mentions.jsonl): ")
    # The built-in open() only splits on real newlines; a codecs.open()
    # stream would also break records at characters such as U+2028.
    with open(file, encoding='utf-8') as f, \
            open(out + "_mentions.jsonl", "a+", encoding="utf-8") as o:
        counter = 0
        for line in f:
            if not line.strip():
                continue
            counter += 1
            # json.loads() takes no `encoding` argument on Python 3.9+.
            tweet = json.loads(line)
            entities = tweet.get('entities') or {}
            # Serialise one dict so that the line is well-formed JSON and
            # the mentions stay tied to their tweet's ID.
            record = {
                'id_str': tweet['id_str'],
                'user_mentions': entities.get('user_mentions', []),
            }
            o.write(json.dumps(record, ensure_ascii=False) + '\n')
            if counter % 1000 == 0:
                print(counter)
def objhashtags(file):
    """Write the hashtags of every tweet to a new JSON Lines file.

    Prompts for an output name and appends to ``<name>_hashtags.jsonl`` one
    JSON object per input tweet of the form
    ``{"id_str": <tweet id>, "hashtags": [...]}``. A tweet without
    ``entities`` or without hashtags gets an empty list.

    Args:
        file: Path of the input file holding one tweet object per line.

    Returns:
        None. Progress (the running tweet count) is printed every 1000 tweets.
    """
    out = input("How should I call the HASHTAGS file? ([filename]_hashtags.jsonl) ")
    # The built-in open() only splits on real newlines; a codecs.open()
    # stream would also break records at characters such as U+2028.
    with open(file, encoding='utf-8') as f, \
            open(out + "_hashtags.jsonl", "a+", encoding="utf-8") as o:
        counter = 0
        for line in f:
            if not line.strip():
                continue
            counter += 1
            # json.loads() takes no `encoding` argument on Python 3.9+.
            tweet = json.loads(line)
            entities = tweet.get('entities') or {}
            # Serialise one dict so that the line is well-formed JSON and
            # the hashtags stay tied to their tweet's ID.
            record = {
                'id_str': tweet['id_str'],
                'hashtags': entities.get('hashtags', []),
            }
            o.write(json.dumps(record, ensure_ascii=False) + '\n')
            if counter % 1000 == 0:
                print(counter)
# Runs the four per-object extraction steps on the selected file.
objects(file_in)

## Step 3: Merge and Deduplicate Files Containing the Same Object Types

In [None]:
# Three GUI file dialogs, one per file to merge. Each variable holds the
# selected path (not file contents), whatever its name suggests.
hashtags = fd.askopenfilename(title='Select the 1st (of 3) JSON file to concatenate')
tweets_at = fd.askopenfilename(title='Select the 2nd (of 3) JSON file to concatenate')
tweets_from = fd.askopenfilename(title='Select the third (of 3) JSON file to concatenate')

# The files are concatenated in this order.
filenames = [hashtags,tweets_at,tweets_from]

def one_file(files):
    """Concatenate several JSON Lines files into a single file.

    Prompts for an output name and appends the lines of every file in
    `files`, in order, to ``<name>_onefile.jsonl``.

    Args:
        files: Iterable of input file paths. Empty paths (what the file
            dialog returns when it is cancelled) are skipped.

    Returns:
        None.
    """
    out = input("How should I call the file to unite them all? ([filename]_onefile.jsonl) ")
    with open(out + '_onefile.jsonl', 'a+', encoding='utf-8') as f:
        for name in files:
            if not name:
                continue
            with open(name, 'r', encoding='utf-8') as infile:
                for line in infile:
                    # A file whose last line lacks a newline would otherwise
                    # fuse that record with the first record of the next file.
                    if not line.endswith('\n'):
                        line += '\n'
                    f.write(line)

# Merges the three selected files into one output file.
one_file(filenames)

In [None]:
# Pops up a GUI file dialog and stores the path of the file to deduplicate in `dupefile`.
dupefile = fd.askopenfilename(title='Select a JSONL file that may have duplicate lines')

def dedupe(dupe):
    """Copy a JSON Lines file, dropping every repeated line.

    Prompts for an output name and appends to ``<name>_deduped.jsonl`` the
    first occurrence of each distinct line of the input, in input order.
    Lines are compared as exact text, without their line ending.

    Args:
        dupe: Path of the input file that may contain duplicate lines.

    Returns:
        None.
    """
    out = input("How should I call the DEDUPED file? ([filename]_deduped.jsonl): ")
    # The built-in open() only splits on real newlines; a codecs.open()
    # stream would also break records at characters such as U+2028, so
    # fragments of records would be deduplicated instead of whole lines.
    with open(dupe, 'r', encoding='utf-8') as d, \
            open(out + '_deduped.jsonl', 'a+', encoding='utf-8') as o:
        seen = set()
        for raw in d:
            # Compare without the line ending so that an unterminated last
            # line still matches an earlier, newline-terminated duplicate.
            record = raw.rstrip('\n')
            if record in seen:
                continue
            seen.add(record)
            o.write(record + '\n')
# Writes a deduplicated copy of the selected file.
dedupe(dupefile)