In [1]:
from striprtf.striprtf import rtf_to_text
import json
import pandas as pd
import string
import random

### Convert the RTF file into a JSON file

Data **before** Trump's first inauguration

In [2]:
# Step 1: Load the raw RTF export as text
with open('raw_data_before.rtf', 'r', encoding='utf-8') as rtf_file:
    rtf_content = rtf_file.read()

# Step 2: Convert the RTF content to plain text
plain_text = rtf_to_text(rtf_content)

# Step 3: Parse the plain text as JSON; on a parse error, report it and
# continue with an empty list so that Step 4 writes nothing
try:
    data_1 = json.loads(plain_text)
except json.JSONDecodeError as err:
    print("Failed to decode JSON:", err)
    data_1 = []

# Step 4: Write the parsed records to a JSON file (skipped when nothing was parsed)
if not data_1:
    print("No data found.")
else:
    # errors='replace' substitutes '?' for any character that cannot be encoded as UTF-8
    with open('raw_data_before.json', 'w', encoding='utf-8', errors='replace') as json_file:
        json.dump(data_1, json_file, indent=4, ensure_ascii=False)
    print(f"Saved {len(data_1)} records into raw_data_before.json")

Saved 1730 records into raw_data_before.json


Data **after** Trump's first inauguration

In [3]:
# Step 1: Load the raw RTF export as text
with open('raw_data_after.rtf', 'r', encoding='utf-8') as rtf_file:
    rtf_content = rtf_file.read()

# Step 2: Convert the RTF content to plain text
plain_text = rtf_to_text(rtf_content)

# Step 3: Parse the plain text as JSON; on a parse error, report it and
# continue with an empty list so that Step 4 writes nothing
try:
    data_2 = json.loads(plain_text)
except json.JSONDecodeError as err:
    print("Failed to decode JSON:", err)
    data_2 = []

# Step 4: Write the parsed records to a JSON file (skipped when nothing was parsed)
if not data_2:
    print("No data found.")
else:
    # errors='replace' substitutes '?' for any character that cannot be encoded as UTF-8
    with open('raw_data_after.json', 'w', encoding='utf-8', errors='replace') as json_file:
        json.dump(data_2, json_file, indent=4, ensure_ascii=False)
    print(f"Saved {len(data_2)} records into raw_data_after.json")

Saved 943 records into raw_data_after.json


### Data cleaning & merging

In [4]:
# Load the "before" records from the JSON file written earlier in this notebook.
# NOTE(review): pd.read_json's default date handling is expected to parse the
# 'date' column into datetimes (the later `.dt` access relies on it) — confirm.
raw_data_1 = pd.read_json("raw_data_before.json")

In [5]:
# Load the "after" records from the JSON file written earlier in this notebook.
# NOTE(review): pd.read_json's default date handling is expected to parse the
# 'date' column into datetimes (the later `.dt` access relies on it) — confirm.
raw_data_2 = pd.read_json("raw_data_after.json")

In [6]:
# Stack the "before" and "after" frames into one frame with a fresh 0..n-1 index
frames_to_merge = [raw_data_1, raw_data_2]
merged_raw_data = pd.concat(frames_to_merge, ignore_index=True)

In [7]:
# Order the merged posts by ascending 'date'. The row labels from the merge are
# kept as-is (no reset_index here); the later sampling selects by position.
sorted_raw_data = merged_raw_data.sort_values(by='date')

In [8]:
# Drop every post whose calendar date is July 24, 2017 (CET) - only 2 posts in the dataset
# NOTE(review): the comparison uses the date part of the stored timestamps as-is;
# no timezone conversion happens in this cell — confirm 'date' is already in CET.
excluded_day = pd.to_datetime('2017-07-24').date()
keep_mask = sorted_raw_data['date'].dt.date != excluded_day
filtered_raw_data = sorted_raw_data[keep_mask]

In [9]:
# Keep only the timestamp and the post text for the sampling steps
columns_to_keep = ['date', 'text']
data = filtered_raw_data[columns_to_keep]

In [10]:
# Report how many posts remain after merging and filtering
print(f"There are a total of {len(data)} posts in the downloaded dataset (without retweets).")

There are a total of 2671 posts in the downloaded dataset (without retweets).


### Systematic random sampling

In [11]:
# Sampling step: number of available posts per sampled post (fractional)
target_sample_size = 1200
interval = len(data) / target_sample_size

In [12]:
# Evenly spaced row positions: pick number k maps to int(k * interval),
# so the first pick is always position 0
pick_numbers = range(1200)
sample_indexes = [int(pick * interval) for pick in pick_numbers]

In [13]:
# Guard: discard any position that would fall past the last row of `data`
row_count = len(data)
sample_indexes = [position for position in sample_indexes if position < row_count]

In [14]:
# Pull the chosen rows by position, then renumber them 0..n-1
picked_rows = data.iloc[sample_indexes]
sampled_data = picked_rows.reset_index(drop=True)

### Pseudonymization

In [15]:
# Fixed seed so that re-running the notebook in the same environment regenerates
# the same pseudonyms; change the value to draw a different set.
PSEUDONYM_SEED = 42


# Function to generate random pseudonym
def generate_pseudonym(rng=random):
    """Return a random 6-character pseudonym of uppercase letters and digits.

    Parameters
    ----------
    rng : module or random.Random, optional
        Source of randomness exposing ``choices``. Defaults to the global
        ``random`` module, so existing zero-argument calls keep working.

    Returns
    -------
    str
        Six characters drawn with replacement from ``A-Z`` and ``0-9``.
    """
    return ''.join(rng.choices(string.ascii_uppercase + string.digits, k=6))


# Create pseudonym list
# A dedicated generator keeps the draw reproducible without touching the global
# random state used elsewhere.
pseudonym_rng = random.Random(PSEUDONYM_SEED)
unique_ids = set()  # fast membership check to guarantee uniqueness
pseudonyms = []     # kept in generation order: iterating a set of strings can
                    # yield a different order in every Python process
while len(pseudonyms) < len(sampled_data):
    candidate = generate_pseudonym(pseudonym_rng)
    if candidate not in unique_ids:
        unique_ids.add(candidate)
        pseudonyms.append(candidate)

In [16]:
# Attach one pseudonym per sampled row; the list was built to contain exactly
# len(sampled_data) unique entries
sampled_data['pseudID'] = pseudonyms

In [17]:
# Save the pseudonymized sample. At this point the frame holds 'date', 'text'
# and 'pseudID'; the 'coder' and 'postID' columns are only added afterwards.
sampled_data.to_excel('1200_sampled_data.xlsx', index=False)

### Prepare the Excel files for each coder

In [18]:
# Coder names. The list order drives the row-by-row assignment below, and each
# name is used for the coder's output filename and postID suffix.
coders = ['Luke', 'Ada', 'Mare', 'Francesco']

In [19]:
# Assign coders round-robin by row number: row 0 -> first coder, row 1 -> second, ...
# wrapping around after the last one. Using len(coders) instead of a literal keeps
# the assignment correct if the coder list is ever changed.
# Relies on sampled_data having a 0..n-1 integer index (set by reset_index earlier).
sampled_data['coder'] = sampled_data.index.map(lambda x: coders[x % len(coders)])

In [20]:
# Running post number 1..n in the sample's row order.
# (The per-coder export loop later replaces it with coder-specific IDs such as "1L".)
sampled_data['postID'] = range(1, len(sampled_data) + 1) # post ID starts from 1

In [21]:
for coder in coders:
    # Select this coder's rows, keeping only the columns that are exported
    is_assigned = sampled_data['coder'] == coder
    subset = sampled_data.loc[is_assigned, ['postID', 'text', 'pseudID']].copy()

    # Randomise the row order (fixed seed) and renumber the index from 0
    shuffled = subset.sample(frac=1, random_state=42)
    coder_data = shuffled.reset_index(drop=True)

    # Coder-specific post IDs: running number followed by the uppercase first
    # letter of the coder's name ('Luke' -> 1L, 2L, 3L, ...)
    suffix = coder[0].upper()
    coder_data['postID'] = [f"{number}{suffix}" for number in range(1, len(coder_data) + 1)]

    # One Excel workbook per coder; spaces in the name become underscores
    workbook_name = f"{coder.replace(' ', '_')}_posts.xlsx"
    coder_data.to_excel(workbook_name, index=False)