## 1. Govinfo CPD API Wrapper

In [3]:
import requests
import time
from bs4 import BeautifulSoup
import json
import pdfplumber
import io

In [7]:
'''
Get CPD package-level results from govinfo API
'''

API_KEY = "RUmC1dWEeWmCEJR0vWuhIxWdhNeojampNrUL4B1f"  
BASE_URL = "https://api.govinfo.gov"

def search_cpd_packages(
    start_date="2009-01-20",# When Obama took office
    end_date="2025-11-30",
    page_size=1000
):
    """
    return a list of CPD package-level results from govinfo API, each item is a dict.
    """
    search_url = f"{BASE_URL}/search"

    query = f"collection:cpd publishdate:range({start_date},{end_date})"

    offset_mark = "*" 
    all_results = []

    while True:
        payload = {
            "query": query,
            "pageSize": str(page_size),  
            "offsetMark": offset_mark,
            "sorts": [
                {
                    "field": "publishdate",  
                    "sortOrder": "ASC"      
                }
            ],
          
            "resultLevel": "package"        
        }

        # historical=true :contentReference[oaicite:7]{index=7}
        params = {"api_key": API_KEY, "historical": "true"}

        resp = requests.post(search_url, params=params, json=payload)
        resp.raise_for_status()
        data = resp.json()

        results = data.get("results", [])
        if not results:
            break

        for item in results:
            all_results.append({
                "packageId": item.get("packageId"),
                "title": item.get("title"),
                "dateIssued": item.get("dateIssued"),
                "collectionCode": item.get("collectionCode"),
                "download": item.get("download", {}),
                "resultLink": item.get("resultLink"),  
            })


        new_offset = data.get("offsetMark")
     
        if not new_offset or new_offset == offset_mark:
            break

        offset_mark = new_offset

        time.sleep(0.2)

    return all_results

In [8]:
cpd_results = search_cpd_packages()

with open("cpd_results.txt", "w", encoding="utf-8") as f:
    for item in cpd_results:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")


print(len(cpd_results),"got.")
print(cpd_results[0])

16929 got.
{'packageId': 'DCPD-200900002', 'title': 'Proclamation 8343-National Day of Renewal and Reconciliation, 2009', 'dateIssued': '2009-01-20', 'collectionCode': 'CPD', 'download': {'premisLink': 'https://api.govinfo.gov/packages/DCPD-200900002/premis', 'txtLink': 'https://api.govinfo.gov/packages/DCPD-200900002/htm', 'zipLink': 'https://api.govinfo.gov/packages/DCPD-200900002/zip', 'modsLink': 'https://api.govinfo.gov/packages/DCPD-200900002/mods', 'pdfLink': 'https://api.govinfo.gov/packages/DCPD-200900002/pdf'}, 'resultLink': 'https://api.govinfo.gov/packages/DCPD-200900002/summary'}


In [14]:
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/118.0.0.0 Safari/537.36"
    )
}

def html_to_text(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    main = soup.find("main") or soup
    return main.get_text("\n", strip=True)

def pdf_to_text(pdf_bytes: bytes) -> str:
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        pages = [page.extract_text() or "" for page in pdf.pages]
    return "\n\n".join(pages)

def fetch_text_from_item(item: dict) -> str:
    """
    Use the 'download' item from the results:
    1. If there is a txtLink → download HTML → convert to text
    2. Else if there is a pdfLink → download PDF → convert to text
    3. If neither exists, throw an error and skip
    """
    dl = item.get("download", {}) or {}
    txt_link = dl.get("txtLink")
    pdf_link = dl.get("pdfLink")

    if txt_link:
        resp = requests.get(
            txt_link,
            headers=headers,
            params={"api_key": API_KEY},
            timeout=60
        )
        resp.raise_for_status()
        return html_to_text(resp.text)
    
    if pdf_link:
        resp = requests.get(
            pdf_link,
            headers=headers,
            params={"api_key": API_KEY},  
            timeout=120
        )
        resp.raise_for_status()
        text = pdf_to_text(resp.content)
        return text

    raise ValueError(f"No txtLink/pdfLink for packageId={item.get('packageId')}")


def build_cpd_corpus(cpd_results, max_docs):
    """
    max_docs: None menas no limitation. 
    Need subset to test if the code works.
    """
    corpus = []
    total = len(cpd_results) if max_docs is None else min(max_docs, len(cpd_results))

    for i, item in enumerate(cpd_results[:total], start=1):
        pid = item["packageId"]
        print(f"[{i}/{total}] {pid}")

        try:
            text = fetch_text_from_item(item)
        except Exception as e:
            print(f"[WARN] {pid} skipped: {e}")
            continue

        corpus.append({
            "packageId": pid,
            "title": item.get("title"),
            "dateIssued": item.get("dateIssued"),
            "text": text
        })


        time.sleep(0.1) # 0.2 takes too long, 0.1 should be ok.

    return corpus



In [17]:
corpus_results = build_cpd_corpus(cpd_results, max_docs=None)

print("Got docs:", len(corpus_results))

with open("corpus_results.txt", "w", encoding="utf-8") as out:
    for doc in corpus_results:
        out.write(json.dumps({
            "packageId": doc.get("packageId"),
            "title": doc.get("title"),
            "dateIssued": doc.get("dateIssued"),
            "text": doc.get("text")
        }, ensure_ascii=False) + "\n")

[1/16929] DCPD-200900002
[2/16929] DCPD-200900001
[3/16929] DCPD-200900054
[4/16929] DCPD-200900012
[5/16929] DCPD-200900010
[6/16929] DCPD-200900009
[7/16929] DCPD-200900008
[8/16929] DCPD-200900004
[9/16929] DCPD-200900003
[10/16929] DCPD-200900055
[11/16929] DCPD-200900014
[12/16929] DCPD-200900013
[13/16929] DCPD-200900011
[14/16929] DCPD-200900007
[15/16929] DCPD-200900006
[16/16929] DCPD-200900005
[17/16929] DCPD-200900016
[18/16929] DCPD-200900015
[19/16929] DCPD-200900018
[20/16929] DCPD-200900017
[21/16929] WCPD-2009-01-26
[22/16929] DCPD-200900025
[23/16929] DCPD-200900024
[24/16929] DCPD-200900023
[25/16929] DCPD-200900022
[26/16929] DCPD-200900021
[27/16929] DCPD-200900020
[28/16929] DCPD-200900019
[29/16929] DCPD-200900051
[30/16929] DCPD-200900026
[31/16929] DCPD-200900032
[32/16929] DCPD-200900031
[33/16929] DCPD-200900030
[34/16929] DCPD-200900036
[35/16929] DCPD-200900035
[36/16929] DCPD-200900034
[37/16929] DCPD-200900033
[38/16929] DCPD-200900287
[39/16929] DCPD-2009

Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats
Could get FontBBox from font descriptor because None cannot be parsed as 4 floats


[3908/16929] DCPD-201200803
[3909/16929] DCPD-201200802
[3910/16929] DCPD-201200801
[3911/16929] DCPD-201200800
[3912/16929] DCPD-201200804
[3913/16929] DCPD-201200805
[3914/16929] DCPD-201200806
[3915/16929] DCPD-201200808
[3916/16929] DCPD-201200807
[3917/16929] DCPD-201200809
[3918/16929] DCPD-201200813
[3919/16929] DCPD-201200812
[3920/16929] DCPD-201200811
[3921/16929] DCPD-201200810
[3922/16929] DCPD-201200815
[3923/16929] DCPD-201200814
[3924/16929] DCPD-201200818
[3925/16929] DCPD-201200817
[3926/16929] DCPD-201200816
[3927/16929] DCPD-201200819
[3928/16929] DCPD-201200820
[3929/16929] DCPD-201200821
[3930/16929] DCPD-201200823
[3931/16929] DCPD-201200822
[3932/16929] DCPD-201200829
[3933/16929] DCPD-201200828
[3934/16929] DCPD-201200827
[3935/16929] DCPD-201200826
[3936/16929] DCPD-201200825
[3937/16929] DCPD-201200824
[3938/16929] DCPD-201200832
[3939/16929] DCPD-201200831
[3940/16929] DCPD-201200830
[3941/16929] DCPD-201200834
[3942/16929] DCPD-201200833
[3943/16929] DCPD-20

In [8]:
# Convert corpus_results.txt to DataFrame
# The file is in JSONL format (one JSON object per line)
cpd_data = []

with open('corpus_results.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if line:  # Skip empty lines
            cpd_data.append(json.loads(line))

cpd_df = pd.DataFrame(cpd_data)

print(f"CPD Corpus DataFrame shape: {cpd_df.shape}")
print(f"\nColumns: {cpd_df.columns.tolist()}")
print(f"\nFirst few rows:")
cpd_df.head()


CPD Corpus DataFrame shape: (16926, 4)

Columns: ['packageId', 'title', 'dateIssued', 'text']

First few rows:


Unnamed: 0,packageId,title,dateIssued,text
0,DCPD-200900002,Proclamation 8343-National Day of Renewal and ...,2009-01-20,"Administration of Barack H. Obama, 2009\nProcl..."
1,DCPD-200900001,Inaugural Address,2009-01-20,"Administration of Barack H. Obama, 2009\nInaug..."
2,DCPD-200900054,Statement on a Meeting on the Situation in Iraq,2009-01-21,"Administration of Barack H. Obama, 2009\nState..."
3,DCPD-200900012,Remarks to White House Senior Staff,2009-01-21,"Administration of Barack H. Obama, 2009\nRemar..."
4,DCPD-200900010,Memorandum on Transparency and Open Government,2009-01-21,"Administration of Barack H. Obama, 2009\nMemor..."


## 2. Department of State corpus

In [4]:
import pandas as pd
import json


In [3]:
with open('all_biden_press_statements_sec_of_state_v2.json', 'r', encoding='utf-8') as f:
    biden_data = json.load(f)

with open('all_obama_press_statements_sec_of_state_v2.json', 'r', encoding='utf-8') as f:
    obama_data = json.load(f)

with open('all_trump_press_statements_sec_of_state.json', 'r', encoding='utf-8') as f:
    trump_data = json.load(f)

print(f"Biden: {len(biden_data)}")
print(f"Obama: {len(obama_data)}")
print(f"Trump: {len(trump_data)}")


Biden: 7737
Obama: 15712
Trump: 7464


In [5]:
biden_df = pd.DataFrame(biden_data)
biden_df['president'] = 'Biden'

obama_df = pd.DataFrame(obama_data)
obama_df['president'] = 'Obama'

trump_df = pd.DataFrame(trump_data)
trump_df['president'] = 'Trump'

print("Biden DataFrame shape:", biden_df.shape)
print("Obama DataFrame shape:", obama_df.shape)
print("Trump DataFrame shape:", trump_df.shape)
print("\ncolumn names:", biden_df.columns.tolist())


Biden DataFrame shape: (7737, 12)
Obama DataFrame shape: (15712, 11)
Trump DataFrame shape: (7464, 12)

column names: ['type_of_release', 'title_of_release', 'link', 'date', 'page_url', 'title', 'document_type', 'document_author', 'publish_date', 'text', 'tags', 'president']


In [7]:
# merge three presidents' data
merged_df = pd.concat([biden_df, obama_df, trump_df], ignore_index=True)
print(f"merged df shape: {merged_df.shape}")
merged_df.head()


merged df shape: (30913, 14)


Unnamed: 0,type_of_release,title_of_release,link,date,page_url,title,document_type,document_author,publish_date,text,tags,president,document_author_name,document_author_title
0,Media Note,"Under Secretary Fernandez’s Travel to Chile, U...",https://www.state.gov/under-secretary-fernande...,"November 4, 2023",https://www.state.gov/press-releases/?results=...,"Under Secretary Fernandez’s Travel to Chile, U...",Media Note,Office of the Spokesperson,"November 4, 2023","[Under Secretary of State for Economic Growth,...","[{'tag_name': 'Bureau of Energy Resources', 't...",Biden,,
1,Media Note,Assistant Secretary of State for the Bureau of...,https://www.state.gov/assistant-secretary-of-s...,"November 4, 2023",https://www.state.gov/press-releases/page/2/?r...,Assistant Secretary of State for the Bureau of...,Media Note,Office of the Spokesperson,"November 4, 2023",[Assistant Secretary of State for South and Ce...,"[{'tag_name': 'Bureau of Democracy, Human Righ...",Biden,,
2,Press Statement,"Secretary Blinken’s Travel to Tel Aviv, Amman,...",https://www.state.gov/secretary-blinkens-trave...,"November 4, 2023",https://www.state.gov/press-releases/page/2/?r...,"Secretary Blinken’s Travel to Tel Aviv, Amman,...",Press Statement,"Matthew Miller, Department Spokesperson","November 4, 2023",[Secretary of State Antony J. Blinken will tra...,[{'tag_name': 'Bureau of East Asian and Pacifi...,Biden,,
3,Readout,Secretary Blinken’s Meeting with Lebanese Care...,https://www.state.gov/secretary-blinkens-meeti...,"November 4, 2023",https://www.state.gov/press-releases/page/2/?r...,Secretary Blinken’s Meeting with Lebanese Care...,Readout,Office of the Spokesperson,"November 4, 2023",[The below is attributable to Spokesperson Mat...,[{'tag_name': 'Bureau of Near Eastern Affairs'...,Biden,,
4,Media Note,Assistant Secretary Phee’s Travel to Namibia,https://www.state.gov/assistant-secretary-phee...,"November 4, 2023",https://www.state.gov/press-releases/page/2/?r...,Assistant Secretary Phee’s Travel to Namibia,Media Note,Office of the Spokesperson,"November 4, 2023",[Assistant Secretary of State for African Affa...,"[{'tag_name': 'Bureau of African Affairs', 'ta...",Biden,,


## 3. Processing

In [5]:
import re
import numpy as np


In [None]:

# ----- 1) Standardize columns -----

# CPD
df_cpd_std = cpd_df.rename(columns={
    "dateIssued": "date"
})
df_cpd_std["source"] = "cpd"

# State
df_state_std = merged_df
df_state_std["source"] = "state"

# Ensure both have: source, title, date, text
df_cpd_std = df_cpd_std[["source", "packageId", "title", "date", "text"]]
df_state_std["packageId"] = np.nan  # no packageId for State
df_state_std = df_state_std[["source", "packageId", "title", "date", "text"]]

# Combine
df_all = pd.concat([df_cpd_std, df_state_std], ignore_index=True)

# Parse dates - handle mixed formats (ISO format from CPD and text format from State Dept)
df_all["date"] = pd.to_datetime(df_all["date"], format='mixed', errors='coerce')

print("Total docs:", len(df_all))

In [12]:
print(df_all.tail())

      source packageId                                              title  \
47834  state       NaN                           Betico Croes Day Message   
47835  state       NaN  The Bureau of Overseas Buildings Operations An...   
47836  state       NaN  The Bureau of Overseas Buildings Operations An...   
47837  state       NaN  Assistant Secretary Robinson Travels to the Un...   
47838  state       NaN  United States Welcomes Transition of Power in ...   

            date                                               text  
47834 2017-01-25  [Press Statement, Thomas A. Shannon, Jr., Acti...  
47835 2017-01-24  [Media Note, Office of the Spokesperson, Washi...  
47836 2017-01-24  [Media Note, Office of the Spokesperson, Washi...  
47837 2017-01-23  [Media Note, Office of the Spokesperson, Washi...  
47838 2017-01-22  [Press Statement, Mark C. Toner, Acting Spokes...  


In [13]:
'''
cleaning text
'''
def clean_cpd_text(raw: str) -> str:
    if not isinstance(raw, str):
        return ""
    # Cut trailing metadata
    cut_markers = ["\nCategories:\n", "\nDCPD Number:\n"]
    for m in cut_markers:
        if m in raw:
            raw = raw.split(m)[0]

    lines = raw.splitlines()

    # Remove header "Administration of ..."
    if lines and lines[0].startswith("Administration of"):
        lines = lines[3:] if len(lines) > 3 else []

    # Remove weird all-caps chunks
    cleaned_lines = []
    for ln in lines:
        if re.fullmatch(r"[A-Z\n ]{3,}", ln.strip()):
            continue
        cleaned_lines.append(ln)

    text = " ".join(cleaned_lines)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def clean_state_text(raw: str) -> str:
    if not isinstance(raw, str):
        return ""
    # basic whitespace cleanup; you can add more rules if needed
    text = re.sub(r"\s+", " ", raw).strip()
    return text

def clean_row(row):
    if row["source"] == "cpd":
        return clean_cpd_text(row["text"])
    else:
        return clean_state_text(row["text"])


In [23]:
df_all = df_all.sort_values(by="date").reset_index(drop=True)

In [24]:
df_all.to_csv("all_documents.csv", index=False)

In [25]:
df_all_cleaned = df_all[df_all["date"].notna() & (df_all["date"].astype(str).str.strip() != "")]
df_all_cleaned.to_csv("all_documents_cleaned.csv", index=False)

In [7]:
import pandas as pd
df_all_cleaned = pd.read_csv("all_documents_cleaned.csv")

In [8]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download('all', halt_on_error=False)

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/ziyu/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /Users/ziyu/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/ziyu/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /Users/ziyu/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/ziyu/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]   

True

In [9]:
# stopwords
from nltk.corpus import stopwords

# tokenizer
from nltk.tokenize import word_tokenize

# lemmatizer
from nltk.stem import WordNetLemmatizer

# stemming
from nltk.stem.porter import PorterStemmer

import re

from nltk import sent_tokenize, pos_tag, ne_chunk

from nltk.sentiment import SentimentIntensityAnalyzer


In [10]:
def preprocess_text(text):
    
    # increase stop words
    stop_words = stopwords.words('english')
    stop_words = stop_words + ["https", "rt", "amp"]
    
    # tokenization 
    tokens_ = word_tokenize(text)
    
    # Generate a list of tokens after preprocessing
 
    # normalize
    tokens_ = [word.lower() for word in tokens_ if word.isalpha()]

    # stem and stopwords
    
    # instatiate the stemmer
    porter = PorterStemmer()

    tokens_ =  [porter.stem(word) for word in tokens_ if word not in stop_words]
    # Return the preprocessed tokens as a string
    return tokens_

In [41]:
# apply preprocessing to clean 
df_all_cleaned["tokens"] = df_all_cleaned["clean_text"].apply(preprocess_text)
df_all_cleaned.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_all_cleaned["tokens"] = df_all_cleaned["clean_text"].apply(preprocess_text)


Unnamed: 0,source,packageId,title,date,text,clean_text,tokens
0,cpd,DCPD-200900002,Proclamation 8343-National Day of Renewal and ...,2009-01-20,"Administration of Barack H. Obama, 2009\nProcl...",By the President of the United States of Ameri...,"[presid, unit, state, america, proclam, take, ..."
1,cpd,DCPD-200900001,Inaugural Address,2009-01-20,"Administration of Barack H. Obama, 2009\nInaug...","My fellow citizens, I stand here today humbled...","[fellow, citizen, stand, today, humbl, task, u..."
2,cpd,DCPD-200900054,Statement on a Meeting on the Situation in Iraq,2009-01-21,"Administration of Barack H. Obama, 2009\nState...",This afternoon I met with our Ambassador to Ir...,"[afternoon, met, ambassador, iraq, command, ir..."
3,cpd,DCPD-200900012,Remarks to White House Senior Staff,2009-01-21,"Administration of Barack H. Obama, 2009\nRemar...","The President . Hello, everybody. Please be se...","[presid, hello, everybodi, pleas, seat, still,..."
4,cpd,DCPD-200900010,Memorandum on Transparency and Open Government,2009-01-21,"Administration of Barack H. Obama, 2009\nMemor...",Memorandum for the Heads of Executive Departme...,"[memorandum, head, execut, depart, agenc, subj..."


In [11]:
sia = SentimentIntensityAnalyzer()

# China terms & patterns
china_terms = [
    "china", "chinese", "beijing", "prc",
    "people's republic of china", "xi jinping","hu jintao",
    "u.s.-china", "us-china", "u.s. china", "us china",
    "sino-american", "sino-us", "sino-u.s.","taiwan"
]
china_pattern = re.compile("|".join(re.escape(t) for t in china_terms), re.IGNORECASE)

# Hedge & assertive wordlists
hedge_words = {"may", "might", "could", "perhaps", "possibly","i think", "we think", "we believe", "likely","not sure", "uncertain", "it seems"}
assertive_words = {"will", "must", "shall", "certainly", "clearly","unequivocal", "committed", "we will", "we must"}

def extract_doc_features(text: str):
    '''
    build the document level features as X for the forecasting
    '''
    if not isinstance(text, str) or not text.strip():
        return {
            "n_tokens": 0,
            "n_sentences": 0,
            "num_person": 0,
            "num_org": 0,
            "num_gpe": 0,
            "hedge_density": 0.0,
            "assertive_density": 0.0,
            "sentiment_polarity": 0.0,
            "sentiment_intensity": 0.0,
            "has_china": 0,
            "china_mentions": 0
        }

    # Sentence + tokenization
    sentences = sent_tokenize(text)
    tokens = word_tokenize(text)
    n_sents = len(sentences)
    n_tokens = len(tokens) if tokens else 1

    # POS tag + NER (nltk.ne_chunk)
    tagged = pos_tag(tokens)
    chunks = ne_chunk(tagged, binary=False)

    num_person = 0
    num_org = 0
    num_gpe = 0
    for chunk in chunks:
        if hasattr(chunk, "label"):
            label = chunk.label()
            if label == "PERSON":
                num_person += 1
            elif label == "ORGANIZATION":
                num_org += 1
            elif label == "GPE":
                num_gpe += 1

    # Hedge / assertive density
    text_lower = text.lower()
    hedge_count = sum(text_lower.count(w) for w in hedge_words)
    assertive_count = sum(text_lower.count(w) for w in assertive_words)

    hedge_density = hedge_count / n_tokens
    assertive_density = assertive_count / n_tokens

    # China intensity
    china_matches = china_pattern.findall(text)
    has_china = int(len(china_matches) > 0)
    china_mentions = len(china_matches)

    # Sentiment via VADER
    scores = sia.polarity_scores(text)
    pol = scores["compound"]           # ∈ [-1, 1]
    intensity = abs(pol)

    return {
        "n_tokens": n_tokens,
        "n_sentences": n_sents,
        "num_person": num_person,
        "num_org": num_org,
        "num_gpe": num_gpe,
        "hedge_density": hedge_density,
        "assertive_density": assertive_density,
        "sentiment_polarity": pol,
        "sentiment_intensity": intensity,
        "has_china": has_china,
        "china_mentions": china_mentions
    }


In [14]:
feature_rows = df_all_cleaned["clean_text"].apply(extract_doc_features)

feat_df = pd.DataFrame(list(feature_rows))

df_features = pd.concat([df_all_cleaned.reset_index(drop=True), 
                         feat_df.reset_index(drop=True)], axis=1)

df_features.head()


Unnamed: 0,source,packageId,title,date,text,clean_text,n_tokens,n_sentences,num_person,num_org,num_gpe,hedge_density,assertive_density,sentiment_polarity,sentiment_intensity,has_china,china_mentions
0,cpd,DCPD-200900002,Proclamation 8343-National Day of Renewal and ...,2009-01-20,"Administration of Barack H. Obama, 2009\nProcl...",By the President of the United States of Ameri...,385,10,3,7,11,0.002597,0.005195,0.9833,0.9833,0,0
1,cpd,DCPD-200900001,Inaugural Address,2009-01-20,"Administration of Barack H. Obama, 2009\nInaug...","My fellow citizens, I stand here today humbled...",2731,124,9,6,22,0.002563,0.017942,0.9997,0.9997,0,0
2,cpd,DCPD-200900054,Statement on a Meeting on the Situation in Iraq,2009-01-21,"Administration of Barack H. Obama, 2009\nState...",This afternoon I met with our Ambassador to Ir...,176,6,1,4,7,0.0,0.017045,0.9346,0.9346,0,0
3,cpd,DCPD-200900012,Remarks to White House Senior Staff,2009-01-21,"Administration of Barack H. Obama, 2009\nRemar...","The President . Hello, everybody. Please be se...",1810,89,9,7,19,0.00221,0.01326,0.9998,0.9998,0,0
4,cpd,DCPD-200900010,Memorandum on Transparency and Open Government,2009-01-21,"Administration of Barack H. Obama, 2009\nMemor...",Memorandum for the Heads of Executive Departme...,540,23,4,10,3,0.0,0.011111,0.9966,0.9966,0,0


In [None]:
df_features.to_csv("document_features.csv", index=False)

In [1]:
import pandas as pd

In [2]:
df_features = pd.read_csv("document_features.csv")

In [3]:
df_features_china = df_features[(df_features["has_china"] == 1)]

len(df_features_china)

2459

In [4]:
df_features_china.to_csv("features_china.csv")