In [1]:
from striprtf.striprtf import rtf_to_text
import json
import pandas as pd

### Convert the RTF files into JSON files

Data **before** Trump's first inauguration

In [2]:
# Step 1: Load the raw RTF export as text
with open('raw_data_before.rtf', 'r', encoding='utf-8') as rtf_file:
    rtf_content = rtf_file.read()

# Step 2: Strip the RTF markup, leaving the plain-text payload
plain_text = rtf_to_text(rtf_content)

# Step 3: Parse the payload as JSON; on a decode error, report it and fall
# back to an empty list so the branch below takes the "no data" path
try:
    data_1 = json.loads(plain_text)
except json.JSONDecodeError as decode_error:
    print("Failed to decode JSON:", decode_error)
    data_1 = []

# Step 4: Write the parsed records to a JSON file (skipped when nothing was parsed)
if not data_1:
    print("No data found.")
else:
    with open('raw_data_before.json', 'w', encoding='utf-8', errors='replace') as json_file:
        json.dump(data_1, json_file, indent=4, ensure_ascii=False)
    print(f"Saved {len(data_1)} records into raw_data_before.json")

Saved 1730 records into raw_data_before.json


Data **after** Trump's first inauguration

In [3]:
# Step 1: Load the raw RTF export as text
with open('raw_data_after.rtf', 'r', encoding='utf-8') as rtf_file:
    rtf_content = rtf_file.read()

# Step 2: Strip the RTF markup, leaving the plain-text payload
plain_text = rtf_to_text(rtf_content)

# Step 3: Parse the payload as JSON; on a decode error, report it and fall
# back to an empty list so the branch below takes the "no data" path
try:
    data_2 = json.loads(plain_text)
except json.JSONDecodeError as decode_error:
    print("Failed to decode JSON:", decode_error)
    data_2 = []

# Step 4: Write the parsed records to a JSON file (skipped when nothing was parsed)
if not data_2:
    print("No data found.")
else:
    with open('raw_data_after.json', 'w', encoding='utf-8', errors='replace') as json_file:
        json.dump(data_2, json_file, indent=4, ensure_ascii=False)
    print(f"Saved {len(data_2)} records into raw_data_after.json")

Saved 943 records into raw_data_after.json


### Data cleaning & merging

In [4]:
# Load the JSON file written from the "before" RTF export into a DataFrame
raw_data_1 = pd.read_json(path_or_buf="raw_data_before.json")

In [5]:
# Load the JSON file written from the "after" RTF export into a DataFrame
raw_data_2 = pd.read_json(path_or_buf="raw_data_after.json")

In [6]:
# Stack both frames row-wise and renumber the index from 0
merged_raw_data = pd.concat(
    objs=(raw_data_1, raw_data_2),
    axis=0,
    ignore_index=True,
)

In [7]:
# Order the merged posts by their 'date' value, earliest first
sorted_raw_data = merged_raw_data.sort_values('date', ascending=True)

In [8]:
# Drop every post whose 'date' falls on July 24, 2017 (CET) - only 2 posts
# in the dataset are from that day
filtered_raw_data = sorted_raw_data.loc[
    sorted_raw_data['date'].dt.date != pd.Timestamp('2017-07-24').date()
]

In [9]:
# Keep only the three columns used from here on: timestamp, post id, and post text
data = filtered_raw_data.loc[:, ['date', 'id', 'text']]

In [10]:
# Report how many posts remain after merging, filtering, and column selection
print(f"There are a total of {len(data)} posts in the downloaded dataset (without retweets).")

There are a total of 2671 posts in the downloaded dataset (without retweets).


### Systematic random sampling

In [11]:
# Sampling step: number of available posts divided by the 1200 positions to draw
# (true division, so the step is generally fractional)
interval = data.shape[0] / 1200

In [12]:
# Build 1200 row positions spaced `interval` apart, truncating each to an integer.
# The sequence starts at position 0 (step 0 * interval); no random offset is applied.
sample_indexes = list(map(lambda step: int(step * interval), range(1200)))

In [13]:
# Defensive bounds check: discard any position that is not a valid row of `data`
sample_indexes = list(filter(lambda idx: idx < len(data), sample_indexes))

In [14]:
# Pick the rows at the computed positions and renumber the index from 0
sampled_data = (
    data
    .iloc[sample_indexes, :]
    .reset_index(drop=True)
)

In [15]:
# Add a 1-based running number as the first column.
# DataFrame.insert mutates the frame in place and raises ValueError when the
# column already exists, so any 'ID' column left over from a previous run of
# this cell is dropped first; this keeps the cell safe to re-run.
sampled_data = sampled_data.drop(columns='ID', errors='ignore')
sampled_data.insert(0, 'ID', range(1, len(sampled_data) + 1))

In [16]:
# Display the sampled posts (the cell's last expression is rendered as its output)
sampled_data

Unnamed: 0,ID,date,id,text
0,1,2016-07-19 04:14:11,755254384062263296,"It was truly an honor to introduce my wife, Me..."
1,2,2016-07-19 19:34:40,755486029742641152,#MakeAmericaWorkAgain\n#TrumpPence16 #RNCinCLE...
2,3,2016-07-19 22:12:56,755525857456844800,The ROLL CALL is beginning at the Republican N...
3,4,2016-07-20 10:18:17,755708398537154560,"Congratulations to my children, Don and Tiffan..."
4,5,2016-07-20 12:51:58,755747074939949056,"In November, I think the people of Ohio will r..."
...,...,...,...,...
1195,1196,2017-07-22 11:47:53,888727293346156544,...What about all of the Clinton ties to Russi...
1196,1197,2017-07-22 12:17:54,888734848550043648,The Republican Senators must step up to the pl...
1197,1198,2017-07-22 14:43:18,888771440014749696,Join me live for the commissioning ceremony of...
1198,1199,2017-07-22 22:57:01,888895687865380864,"American steel &amp, American hands have const..."


### Prepare the Excel file

In [17]:
# Reduce the sample to a single column named 'Text' for the Excel export.
# Renaming before selecting keeps the cell re-runnable: rename() ignores a
# missing 'text' key by default, so a second run (when the column is already
# called 'Text') selects the same column instead of raising KeyError.
sampled_data = sampled_data.rename(columns={'text': 'Text'})[['Text']]

In [18]:
# Write the prepared sample to an Excel workbook, omitting the DataFrame index
sampled_data.to_excel(excel_writer='sampled_1200_posts.xlsx', index=False)