In [1]:
from striprtf.striprtf import rtf_to_text
import json
import pandas as pd

### Convert the RTF file into a JSON file

In [2]:
# Read the RTF export and strip its markup so that only plain text remains.
with open('raw_data.rtf', 'r', encoding='utf-8') as rtf_file:
    rtf_content = rtf_file.read()
plain_text = rtf_to_text(rtf_content)

# Parse the plain text as JSON. A decode failure is reported and replaced by
# an empty list, so the branch below takes the "No data found." path.
try:
    data = json.loads(plain_text)
except json.JSONDecodeError as err:
    print("Failed to decode JSON:", err)
    data = []

# Persist the parsed records. Nothing is written when the payload is empty,
# which means any existing raw_data.json is left as it was.
if not data:
    print("No data found.")
else:
    # errors='replace' substitutes characters that cannot be encoded as UTF-8
    # instead of raising while writing.
    with open('raw_data.json', 'w', encoding='utf-8', errors='replace') as json_file:
        json.dump(data, json_file, indent=4, ensure_ascii=False)
    print(f"Saved {len(data)} records into raw_data.json")

Saved 1723 records into raw_data.json


### Data cleaning

In [3]:
# Load the JSON file produced by the conversion step into a DataFrame.
raw_data = pd.read_json(path_or_buf="raw_data.json")

In [4]:
# Order the posts by their 'date' column, ascending; raw_data itself is not modified.
data_sorted = raw_data.sort_values('date', ascending=True)

In [5]:
# Keep only the three columns used from here on: date, id and text.
data = data_sorted.loc[:, ['date', 'id', 'text']]

In [6]:
# Report how many posts (rows) the cleaned dataset contains.
print(f"There are a total of {data.shape[0]} posts in the downloaded dataset (without retweets).")

There are a total of 1723 posts in the downloaded dataset (without retweets).


### Systematic random sampling

In [7]:
# Fractional step between consecutive sampled positions: number of posts
# divided by the target sample size of 1200 (1723 / 1200 ≈ 1.4358).
interval = data.shape[0] / 1200

In [8]:
# One row position per sample slot: slot k maps to int(interval * k), i.e. the
# fractional position truncated to an integer.
# NOTE: slot 0 always maps to position 0 — there is no random starting offset.
sample_indexes = [int(interval * slot) for slot in range(1200)]

In [9]:
# Safety net: discard any position that would fall past the last row of `data`.
sample_indexes = list(filter(lambda position: position < len(data), sample_indexes))

In [10]:
# Pick the rows at the computed positions and renumber the index from 0.
sampled_data = (
    data
    .iloc[sample_indexes]
    .reset_index(drop=True)
)

In [11]:
# Show the sampled frame as this cell's output for a visual check.
sampled_data

Unnamed: 0,date,id,text
0,2016-07-19 04:14:11,755254384062263296,"It was truly an honor to introduce my wife, Me..."
1,2016-07-19 13:36:43,755395950089211904,"""@RoxaneTancredi: Democrats are coming to TRU..."
2,2016-07-19 19:34:40,755486029742641152,#MakeAmericaWorkAgain\n#TrumpPence16 #RNCinCLE...
3,2016-07-19 22:12:56,755525857456844800,The ROLL CALL is beginning at the Republican N...
4,2016-07-19 23:52:59,755551039244341248,Such a great honor to be the Republican Nomine...
...,...,...,...
1195,2017-01-18 12:44:03,821699672687448064,"to the U.S., but had nothing to do with TRUMP,..."
1196,2017-01-18 13:06:58,821705440178348032,No wonder the Today Show on biased @NBC is doi...
1197,2017-01-18 14:03:53,821719763214880768,.@TheAlabamaBand was great last night in D.C. ...
1198,2017-01-18 22:21:02,821844875268255744,Looking forward to a speedy recovery for Georg...


### Prepare the Excel file

In [12]:
# Keep only the post text, exported under the capitalised header 'Text'.
# Renaming first and then selecting 'Text' makes this cell safe to re-run:
# rename() ignores a missing 'text' key by default, whereas selecting
# [['text']] a second time would raise KeyError because the column has
# already been renamed. The first-run result is identical.
sampled_data = sampled_data.rename(columns={'text': 'Text'})[['Text']]

In [13]:
# Number the sampled posts 1..N in a leading 'ID' column.
# DataFrame.insert raises ValueError when the column already exists, so any
# 'ID' left over from a previous run of this cell is dropped first. On a first
# run the drop is a no-op (errors='ignore') and the result is unchanged.
sampled_data = sampled_data.drop(columns='ID', errors='ignore')
sampled_data.insert(0, 'ID', range(1, len(sampled_data) + 1))

In [14]:
# Write the prepared sample to an Excel workbook; index=False leaves the
# DataFrame index out of the sheet.
sampled_data.to_excel(excel_writer='sampled_1200_posts.xlsx', index=False)