In [1]:
import json
import os

path = './SP-500-Twitter/'
json_files = [f for f in os.listdir(path) if f.endswith('.json')]
print(json_files)
print(len(json_files))

['CSX.json', 'ADI_News.json', 'AMETEKInc.json', 'CabotOG.json', 'grainger.json', 'dish.json', 'cardinalhealth.json', 'digitalrealty.json', 'BNYMellon.json', 'HelmerichPayne.json', 'citrix.json', 'Ford.json', 'Garmin.json', 'Ecolab.json', 'HiltonHotels.json', 'Gap.json', 'HersheyCompany.json', '3M.json', 'BDandCo.json', 'Centene.json', 'FBHS_News.json', 'CapitalOne.json', 'Hologic.json', 'bakerhughesco.json', 'bmsnews.json', 'FastenalCompany.json', 'CrownCastle.json', 'BestBuy.json', 'eBay.json', 'Fiserv.json', 'Accenture.json', 'GlobeLife.json', 'comcast.json', 'extraspace.json', 'CampbellSoupCo.json', 'Chevron.json', 'Fortinet.json', 'BorgWarner.json', 'HenrySchein.json', 'AmericanAir.json', 'GM.json', 'bostonsci.json', 'Boeing.json', 'Citi.json', 'GoldmanSachs.json', 'CFGCommercial.json', 'DRHorton.json', 'DXCTechnology.json', 'AristaNetworks.json', 'CDWCorp.json', 'eatoncorp.json', 'blackrock.json', 'DentsplySirona.json', 'BankofAmerica.json', 'EdwardsLifesci.json', 'CBRE.json', 'Co

## Sample data

In [6]:
data = []
with open(path + 'eBay.json') as f:
    for line in f:
        data.append(json.loads(line))
print("total twitts of %s: %d" %(json_files[0], len(data)))

total twitts of CSX.json: 3244


In [11]:
top20 = [i['full_text'] for i in sorted(data, key=lambda x: (x['favorite_count'],x['retweet_count']), reverse=True)]
for i in top20[:20]:
    print(i)

Help sellers like Marcel support the end of childhood cancer. When you like, reply to, or RT this post, we'll donate $1 to @StJude, up to $10k total. https://t.co/XAOcNd2QpD
“They say superheroes never die.” Today we lost one. RIP Stan Lee. https://t.co/thYgCb8Pg9
We've got two exclusive signed guitars by @Harry_Styles and @NiallOfficial up for grabs, including an iconic piece from the Gibson custom shop. #GRAMMYs https://t.co/ykCQSjIVH3 https://t.co/6iuVYFsfOw
Bid on this exclusive hooded stuffed gorilla signed by @TheEllenShow and @portiaderossi! 100% of funds go toward @TheEllenFund, an org that benefits global conservation efforts for endangered species. 🦍💚 

Bid now: https://t.co/xHLHEPWbZN https://t.co/wqWtcSGRdu
Last chance to bid to meet @StephenCurry30 and own signed @warriors memorabila. 100% to @GSWfoundation: https://t.co/x48bj5LPGC https://t.co/s6uGgqj6P0
Get your claws on an exclusive variant cover of @Marvel’s new #BlackPanther comic book on eBay. Designed by @sanfordgre

## parse data function

In [55]:
import numpy as np
import pandas as pd
from typing import TypeVar, List, Any
T = TypeVar('T', str, List[str])

def parse(line: dict, fields: List[T]=['created_at', 'full_text', 'lang', ['user', 'name']]) -> List[Any]:
    """
    Parse one twitter data.
    -------------
    Input:
        line(dict): dictionary of that twitter
        fields(list): a list of fields to extract. If it's a first-level field, 
      input str, if it's a nested field, input list of strs.
      Default fields = ['created_at', 'full_text', 'lang', ['user', 'name']]
    -------------
    Return:
        a list of values of extracted fields.
    """
    ret = []
    for key in fields:
        if type(key) == str:
            ret.append(line[key])
        else:
            value = line
            for subkey in key:
                value = value[subkey]
            ret.append(value)
    return ret

def json2np(json_file: str, fields: List[T]=['created_at', 'full_text', 'lang', ['user', 'name']]) -> np.ndarray:
    """
    Parse json file to np.ndarray.
    --------------
    Input:
        json_file(str): path of input json file
        fields(list): fields(list): a list of fields to extract. 
      If it's a first-level field, input str, if it's a nested field, input list of strs.
      Default fields = ['created_at', 'full_text', 'lang', ['user', 'name']]
    --------------
    Return:
        An NxM matrix, where N = lines of json file, M = number of fields
    """
    data = []
    with open(json_file) as f:
        for line in f:
            data.append(json.loads(line))
    return np.array([parse(line, fields) for line in data])

def json2pd(json_file: str, fields: List[T]=['created_at', 'full_text', 'lang', ['user', 'name']]) -> pd.DataFrame:
    """
    Parse json file to pd.DataFrame.
    --------------
    Input:
        json_file(str): path of input json file
        fields(list): fields(list): a list of fields to extract. 
      If it's a first-level field, input str, if it's a nested field, input list of strs.
      Default fields = ['created_at', 'full_text', 'lang', ['user', 'name']]
    --------------
    Return:
        An NxM matrix, where N = lines of json file, M = number of fields
    """
    data = []
    with open(json_file) as f:
        for line in f:
            data.append(json.loads(line))
    df = pd.DataFrame([parse(line, fields) for line in data])
    col_names = []
    for name in fields:
        if type(name) == str:
            col_names.append(name)
        else:
            col_names.append('_'.join(name))
    df.columns = [col_names]
    return df

def json2csv(json_file: str, fields: List[T]=['created_at', 'full_text', 'lang', ['user', 'name']], csv_file: str=None):
    """
    Parse json file and write parsed data to csv file.
    --------------
    Input:
        json_file(str): path of input json file
        fields(list): fields(list): a list of fields to extract. 
      If it's a first-level field, input str, if it's a nested field, input list of strs.
      Default fields = ['created_at', 'full_text', 'lang', ['user', 'name']].
        csv_file(str): path of output csv fiel. 
      Default would be the path of json file changing '.json' to '.csv'
    """
    if csv_file == None:
        csv_file = json_file[:-5] + '.csv'
    df = json2pd(json_file, fields)
    df.to_csv(csv_file, index=False)
    return

## Test parsing functions

In [52]:
parse_data = json2np(path+json_files[0])
parse_data

array([['Fri Dec 20 00:00:46 +0000 2019',
        'CSX CEO Jim Foote joined Governor Ralph Northam today to announce a new landmark agreement to expand reliability and service along Virginia’s rail lines, creating a pathway to separate passenger and freight operations along the Richmond to D.C. corridor. https://t.co/TZo3CGG12k https://t.co/phvUrJCnQX',
        'en', 'CSX'],
       ['Thu Dec 19 21:48:16 +0000 2019',
        "CSX announces the appointment of retired Maj. Gen. Suzanne Vautrinot to the company’s board of directors. Bringing exceptional cybersecurity expertise and leadership skills, Vautrinot will serve on the company's Audit Committee and Governance Committee. \nhttps://t.co/Iq4zJSGjyY https://t.co/V3qsv3s3a1",
        'en', 'CSX'],
       ['Thu Dec 19 01:33:33 +0000 2019',
        'CSX participated in the groundbreaking of the rail spur segment at the @nfmipark. Located on a CSX Select Site, the park benefits from our offering of development-ready property. We’re proud t

In [53]:
parse_data = json2pd(path+json_files[0])
parse_data

Unnamed: 0,created_at,full_text,lang,user_name
0,Fri Dec 20 00:00:46 +0000 2019,CSX CEO Jim Foote joined Governor Ralph Northa...,en,CSX
1,Thu Dec 19 21:48:16 +0000 2019,CSX announces the appointment of retired Maj. ...,en,CSX
2,Thu Dec 19 01:33:33 +0000 2019,CSX participated in the groundbreaking of the ...,en,CSX
3,Tue Dec 17 22:27:34 +0000 2019,Automakers trust CSX to get vehicles to market...,en,CSX
4,Mon Dec 16 20:42:18 +0000 2019,CSX announces the appointment of Jeffery D. Wa...,en,CSX
...,...,...,...,...
3229,Sat Sep 20 14:55:46 +0000 2014,RT @uk1fan: Up early getting ready for the #Bi...,en,CSX
3230,Sat Sep 20 14:54:24 +0000 2014,RT @bomalley: Good luck to #TeamCSX today as t...,en,CSX
3231,Sat Sep 20 14:53:41 +0000 2014,#TeamCSX is just making it through Daytona Bea...,en,CSX
3232,Sat Sep 20 14:05:20 +0000 2014,Who's ready to plant some trees in the Windy C...,en,CSX


## Parse all json files to csv

In [58]:
json_path = './SP-500-Twitter/'
json_files = [f for f in os.listdir(json_path) if f.endswith('.json')]
csv_path = './SP-500-Twitter-csv/'
if not os.path.exists(csv_path):
    os.makedirs(csv_path)
for json_file in json_files:
    json2csv(json_path+json_file, csv_file=csv_path+json_file[:-5]+'.csv')