# Raw JSON Data to Time Series DataFrame
#### Load Json Data


In [2]:
# Imports
from datetime import datetime
import json
import pandas as pd
import re

In [2]:
# Load JSON Data and merge posts into one list of dicts
json_data = list()
for i in range(4):
    with open(f'.\..\..\Data\Pushshift_IO\Take_2\wsb_data_{i+1}.json') as d:
        json_data += json.load(d)['data']

#### Create Data From Data

In [3]:
df = pd.DataFrame.from_dict(json_data)

In [4]:
df.head()

Unnamed: 0,title,created_utc
0,Fed funds 4.6% from Summary of Economics Proje...,1666265085
1,Me with my TSLA 10/21 220 C I held through ear...,1666265075
2,A new tactic from bagholders - Set up a gofund...,1666264841
3,Fed funds 4.6% from Summary of Economics Proje...,1666264781
4,Can someone give me some knowledge on why 366 ...,1666264287


In [5]:
start_date = datetime.fromtimestamp(df.created_utc.min())
end_date = datetime.fromtimestamp(df.created_utc.max())
print(f"The dataset is from {start_date} to {end_date} and has {df.shape[0]} datapoints.")

The dataset is from 2020-12-01 06:41:14 to 2022-10-20 13:24:45 and has 1459699 datapoints.


#### Create Time Series

In [6]:
# Puts datapoints into daily bins
df['date'] = df['created_utc'].map(
    lambda timestamp: datetime.fromtimestamp(timestamp).date()
)

In [7]:
# Aggregate all Posts form the same day (string concat)
aggregation_dict = {'title':'sum'}
df = df.groupby(df['date']).aggregate(aggregation_dict)

#### Text Processing

In [8]:
target_words = ['GME', 'GameStop', 'Game Stop', 'gamestop', 'game stop']

In [9]:
# Compute accumulated target word frequency as feature
df['word_count'] = df['title'].map(
    lambda title: sum([title.count(word) for word in target_words])
)

#### Save Data

In [9]:
df.head()

Unnamed: 0,date,title,word_count
0,2020-12-01,Diamond hands on Chinese EVsSomeone give me a ...,90
1,2020-12-02,Ford v TeslaTesla shares rise as Goldman initi...,64
2,2020-12-03,You like Palantir? You'll love NSH!Analysis of...,27
3,2020-12-04,Who’s gonna be the Tesla of cannabis?CL 10 – D...,47
4,2020-12-05,Most anticipated earnings of the week (fixed i...,31


In [5]:
df['word_count'].to_csv(f'./../../Data/Time_Series/timeseries_2.csv')