In [1]:
import json
def dump_jsonl(data, output_path, append=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))

def load_jsonl(input_path) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        for line in f:
            if line != '\n':
                data.append(json.loads(line.rstrip('\n')))
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

In [2]:
fname = r"C:\Users\gbao\Google 云端硬盘\BUDT758X\Final project\stream__realDonaldTrump.jsonl"
webpage_data = load_jsonl(fname)

Loaded 138859 records from C:\Users\gbao\Google 云端硬盘\BUDT758X\Final project\stream__realDonaldTrump.jsonl


In [4]:
import pandas as pd
db_data = []
# first level keys
l1_cols = ['id','created_at','text','source','geo','coordinates',
           'is_quote_status','quoted_status','quoted_status_id',
           'in_reply_to_status_id','in_reply_to_user_id',
           'quote_count','reply_count','retweet_count','favorite_count']
# keys under user
us_cols = ['id','screen_name','created_at','friends_count','follow_count','verified']
# keys under entities
en_cols = ['hashtags']
for d in webpage_data:
    db_data.append([])
    for i in l1_cols:
        db_data[-1].append(d.get(i, float('nan')))
    temp = d.get('user',True)
    for i in us_cols:
        db_data[-1].append(temp.get(i, float('nan')))
    temp = d.get('entities',float('nan'))
    for i in en_cols:
        db_data[-1].append(temp.get(i, float('nan')))
columns = l1_cols+us_cols+en_cols
df = pd.DataFrame(db_data, columns=columns)

In [5]:
df.head()

Unnamed: 0,id,created_at,text,source,geo,coordinates,is_quote_status,quoted_status,quoted_status_id,in_reply_to_status_id,...,reply_count,retweet_count,favorite_count,id.1,screen_name,created_at.1,friends_count,follow_count,verified,hashtags
0,1255068528849215489,Tue Apr 28 09:37:09 +0000 2020,@realDonaldTrump why should the American peopl...,"<a href=""http://twitter.com/download/iphone"" r...",,,False,,,1.254783e+18,...,0,0,0,938787819580923904,505abq503pdx,Thu Dec 07 15:10:32 +0000 2017,115,,False,[]
1,1255068533555212288,Tue Apr 28 09:37:11 +0000 2020,@realDonaldTrump @GOP @SpeakerPelosi \n@FoxNe...,"<a href=""http://twitter.com/download/android"" ...",,,False,,,,...,0,0,0,1206767177841295360,Levelhead14,Tue Dec 17 02:45:08 +0000 2019,97,,False,[]
2,1255068535551737859,Tue Apr 28 09:37:11 +0000 2020,RT @realDonaldTrump: Why should the people and...,"<a href=""http://twitter.com/download/android"" ...",,,False,,,,...,0,0,0,1243960919618498562,pYx0eiPwAJeQpWF,Sat Mar 28 17:59:49 +0000 2020,23,,False,[]
3,1255068537372246019,Tue Apr 28 09:37:12 +0000 2020,"RT @realDonaldTrump: FAKE NEWS, THE ENEMY OF T...","<a href=""http://twitter.com/download/iphone"" r...",,,False,,,,...,0,0,0,553805586,MassiveMatthew,Sat Apr 14 18:04:33 +0000 2012,1416,,False,[]
4,1255068537850335232,Tue Apr 28 09:37:12 +0000 2020,@realDonaldTrump Old dirty Donnie will be taki...,"<a href=""http://twitter.com/download/android"" ...",,,False,,,1.254967e+18,...,0,0,0,1219699627806613504,Timothy70078813,Tue Jan 21 19:15:07 +0000 2020,1,,False,[]


In [6]:
df.columns

Index(['id', 'created_at', 'text', 'source', 'geo', 'coordinates',
       'is_quote_status', 'quoted_status', 'quoted_status_id',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'quote_count',
       'reply_count', 'retweet_count', 'favorite_count', 'id', 'screen_name',
       'created_at', 'friends_count', 'follow_count', 'verified', 'hashtags'],
      dtype='object')