In [19]:
import pandas as pd
import requests
import datetime
import zipfile

## Programmatically downloading zipped file of Individual Contributions
For now, I am storing all the files locally but, in production I will store these files in an S3 bucket.

### Step 1. Download the zipped file from FEC website and place it in the /zipped folder.

In [2]:
url = "http://www.fec.gov/files/bulk-downloads/2024/indiv24.zip" #Link to zip file containing all individual contritibutions for the 2024 election cycle
current_date = datetime.datetime.now()
current_date_fmt = current_date.strftime("%Y-%m-%d")

output_path = f"./zipped/ic24_{current_date_fmt}.zip"
file_req = requests.get(url)

with open(output_path,'wb') as output_file:
    output_file.write(file_req.content)

print(f"File succefully added to {output_path} ")

File succefully added to ./zipped/ic24_2024-02-01.zip 


### Step 2. Extract all the contents of the zipped file into the "/unzipped/ic24_{Current Date}" folder.

In [3]:
extract_path = f"./unzipped/ic24_{current_date_fmt}/"
with zipfile.ZipFile(output_path) as extract_zip:
    extract_zip.extractall(extract_path)

print(f"Files extracted to path: {extract_path}")

Files extracted to path: ./unzipped/ic24_2024-02-01/


### Step 3. Add the column headers to the individual contribution file and convert the .txt file to a .csv.

In [7]:
header_df = pd.read_csv("./indiv_header_file.csv")
# print(header_df)
df = pd.read_csv(extract_path + "itcont.txt", sep="|", names=header_df.columns)
display(df.head(n=10))
print(f"Total number of individual contributions recorded: {len(df)}")

  df = pd.read_csv(extract_path + "itcont.txt", sep="|", names=header_df.columns)


Unnamed: 0,CMTE_ID,AMNDT_IND,RPT_TP,TRANSACTION_PGI,IMAGE_NUM,TRANSACTION_TP,ENTITY_TP,NAME,CITY,STATE,...,EMPLOYER,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,OTHER_ID,TRAN_ID,FILE_NUM,MEMO_CD,MEMO_TEXT,SUB_ID
0,C00817940,T,TER,P,202301239574900260,22Y,IND,"DUFF, JAMES",LOS ANGELES,CA,...,,,1192023,800,,500241283,1678288,,,4012320231720334166
1,C00817940,T,TER,P,202301239574900260,22Y,IND,"ROTHMAN, ANDREW",CULVER CITY,CA,...,,,1192023,250,,500241284,1678288,,,4012320231720334167
2,C00817940,T,TER,P,202301239574900260,22Y,IND,"PELLETT, CLARK",CHICAGO,IL,...,,,1202023,345,,500241285,1678288,,,4012320231720334168
3,C00817940,T,TER,P,202301239574900261,22Y,IND,"DISNEY, TIMOTHY",ENCINO,CA,...,,,1192023,615,,500241286,1678288,,,4012320231720334169
4,C00817940,T,TER,P,202301239574900261,22Y,IND,"TRONE, DAVID",POTOMAC,MD,...,,,1192023,5000,,500241287,1678288,,,4012320231720334170
5,C00817940,T,TER,P,202301239574900261,22Y,IND,"MISCIKOWSKI, CYNTHIA",LOS ANGELES,CA,...,,,1192023,5000,,500241288,1678288,,,4012320231720334171
6,C00817940,T,TER,P,202301239574900262,22Y,IND,"HOOD, JOHN",DEL MAR,CA,...,,,1202023,5000,,500241289,1678288,,,4012320231720334172
7,C00815316,T,TER,P,202301239574918787,20Y,IND,"DANFORTH, JOHN",NEW YORK,NY,...,,,1232023,379827,,SB28A.4281,1678556,,,4012420231720406200
8,C00806752,T,TER,P2022,202211189546828121,15,CAN,"STEEL, DIANE",LAS VEGAS,NV,...,NONE,RETIRED,6302022,7709,,SA11AI.4337,1661961,,,4120820221633698139
9,C00806752,T,TER,P2022,202211189546828120,15,ORG,AURELIA ARNOLD ROBERTS,LAS VEGAS,NV,...,,,4052022,580,,SA11AI.4276,1661961,,,4120820221633698136


Total number of individual contributions recorded: 9705390


## Minor data cleaning

In [8]:
print(df.dtypes)

CMTE_ID            object
AMNDT_IND          object
RPT_TP             object
TRANSACTION_PGI    object
IMAGE_NUM           int64
TRANSACTION_TP     object
ENTITY_TP          object
NAME               object
CITY               object
STATE              object
ZIP_CODE           object
EMPLOYER           object
OCCUPATION         object
TRANSACTION_DT      int64
TRANSACTION_AMT     int64
OTHER_ID           object
TRAN_ID            object
FILE_NUM            int64
MEMO_CD            object
MEMO_TEXT          object
SUB_ID              int64
dtype: object


The "TRANSATION_DT" is meant to have a DATETIME data type but, Pandas is reading that column as a integer. Let's fix that!

In [16]:
try:
    df["TRANSACTION_DT"] = pd.to_datetime(df["TRANSACTION_DT"], format="%m%d%Y", errors='raise')
    display(df["TRANSACTION_DT"].head(n=10))
except ValueError:
    pass

0   2023-11-09
1   2023-11-09
2   2023-01-20
3   2023-11-09
4   2023-11-09
5   2023-11-09
6   2023-01-20
7   2023-12-03
8   2022-06-30
9   2022-04-05
Name: TRANSACTION_DT, dtype: datetime64[ns]

In [17]:
print(df.dtypes)

CMTE_ID                    object
AMNDT_IND                  object
RPT_TP                     object
TRANSACTION_PGI            object
IMAGE_NUM                   int64
TRANSACTION_TP             object
ENTITY_TP                  object
NAME                       object
CITY                       object
STATE                      object
ZIP_CODE                   object
EMPLOYER                   object
OCCUPATION                 object
TRANSACTION_DT     datetime64[ns]
TRANSACTION_AMT             int64
OTHER_ID                   object
TRAN_ID                    object
FILE_NUM                    int64
MEMO_CD                    object
MEMO_TEXT                  object
SUB_ID                      int64
dtype: object


That is all I had to change, on to making the final source file.

## Creating a finalized source .csv

In [22]:
source_path = f"./source/ic_2024_{current_date_fmt}.csv"
df.to_csv(source_path, sep=",", index=False)
print(f"Source file for Individual Contributions has been created in {source_path}.")

Source file for Individual Contributions has been created in ./source/ic_2024_2024-02-01.csv.


In [23]:
final_csv = pd.read_csv("./source/ic_2024_2024-02-01.csv", sep=",")
display(final_csv.head(n=20))

  final_csv = pd.read_csv("./source/ic_2024_2024-02-01.csv", sep=",")


Unnamed: 0,CMTE_ID,AMNDT_IND,RPT_TP,TRANSACTION_PGI,IMAGE_NUM,TRANSACTION_TP,ENTITY_TP,NAME,CITY,STATE,...,EMPLOYER,OCCUPATION,TRANSACTION_DT,TRANSACTION_AMT,OTHER_ID,TRAN_ID,FILE_NUM,MEMO_CD,MEMO_TEXT,SUB_ID
0,C00817940,T,TER,P,202301239574900260,22Y,IND,"DUFF, JAMES",LOS ANGELES,CA,...,,,2023-11-09,800,,500241283,1678288,,,4012320231720334166
1,C00817940,T,TER,P,202301239574900260,22Y,IND,"ROTHMAN, ANDREW",CULVER CITY,CA,...,,,2023-11-09,250,,500241284,1678288,,,4012320231720334167
2,C00817940,T,TER,P,202301239574900260,22Y,IND,"PELLETT, CLARK",CHICAGO,IL,...,,,2023-01-20,345,,500241285,1678288,,,4012320231720334168
3,C00817940,T,TER,P,202301239574900261,22Y,IND,"DISNEY, TIMOTHY",ENCINO,CA,...,,,2023-11-09,615,,500241286,1678288,,,4012320231720334169
4,C00817940,T,TER,P,202301239574900261,22Y,IND,"TRONE, DAVID",POTOMAC,MD,...,,,2023-11-09,5000,,500241287,1678288,,,4012320231720334170
5,C00817940,T,TER,P,202301239574900261,22Y,IND,"MISCIKOWSKI, CYNTHIA",LOS ANGELES,CA,...,,,2023-11-09,5000,,500241288,1678288,,,4012320231720334171
6,C00817940,T,TER,P,202301239574900262,22Y,IND,"HOOD, JOHN",DEL MAR,CA,...,,,2023-01-20,5000,,500241289,1678288,,,4012320231720334172
7,C00815316,T,TER,P,202301239574918787,20Y,IND,"DANFORTH, JOHN",NEW YORK,NY,...,,,2023-12-03,379827,,SB28A.4281,1678556,,,4012420231720406200
8,C00806752,T,TER,P2022,202211189546828121,15,CAN,"STEEL, DIANE",LAS VEGAS,NV,...,NONE,RETIRED,2022-06-30,7709,,SA11AI.4337,1661961,,,4120820221633698139
9,C00806752,T,TER,P2022,202211189546828120,15,ORG,AURELIA ARNOLD ROBERTS,LAS VEGAS,NV,...,,,2022-04-05,580,,SA11AI.4276,1661961,,,4120820221633698136
