In [1]:
# Dependencies
import pandas as pd

In [2]:
# Relative path to the raw (uncleaned) donor CSV that this notebook reads
file = "Resources/donors2021_unclean.csv"

In [3]:
# Load the raw CSV into a DataFrame. The encoding is passed explicitly as
# ISO-8859-1 (Latin-1) instead of relying on pandas' default.
df = pd.read_csv(
    filepath_or_buffer=file,
    encoding="ISO-8859-1",
)

In [4]:
# Show the first five rows to get a feel for the columns.
# Memo_CD is empty in every previewed row, so it is probably not useful.
df.head(n=5)

Unnamed: 0,Name,Employer,City,State,Zip,Amount,Memo_CD
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618.0,500,
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250,
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250,
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000,
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250,


In [5]:
# Remove the extraneous Memo_CD column.
# drop(..., errors="ignore") is used instead of `del df['Memo_CD']` so the cell
# is idempotent: re-running it after the column is gone no longer raises KeyError.
df = df.drop(columns=["Memo_CD"], errors="ignore")
df.head()

Unnamed: 0,Name,Employer,City,State,Zip,Amount
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618.0,500
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250


In [6]:
# Count the non-missing values per column; any column whose count is below the
# row total has incomplete rows.
df.notna().sum()

Name        2000
Employer    1820
City        1999
State       1999
Zip         1996
Amount      2000
dtype: int64

In [7]:
# Keep only complete rows: a row is discarded if any of its columns is missing
df = df.dropna(axis="index", how="any")

In [8]:
# Confirm the drop: every column should now report the same non-null count
df.count(axis="index")

Name        1818
Employer    1818
City        1818
State       1818
Zip         1818
Amount      1818
dtype: int64

In [9]:
# Inspect the dtype of every column.
# Zip was parsed as a number (float64), but ZIP codes are identifiers rather than
# quantities, so the column should be stored as a string (object).
df.dtypes

Name         object
Employer     object
City         object
State        object
Zip         float64
Amount        int64
dtype: object

In [10]:
# Convert the Zip column from float64 to string (object).
# Casting float -> str directly would keep the float formatting ("14221.0"),
# so go through int64 first to get plain digits ("14221").
# NaN cannot be cast to int64; rows with a missing Zip were already dropped above,
# and if any remain this raises, as the original errors='raise' intended.
zip_digits = df["Zip"].astype("int64").astype(str)

# Reading the column as a number also stripped leading zeros (e.g. "07030" -> 7030).
# Restore them: values longer than 5 digits are treated as ZIP+4 (9 digits),
# everything else as a 5-digit ZIP.
is_zip_plus4 = zip_digits.str.len() > 5
df = df.assign(
    Zip=zip_digits.str.zfill(5).where(~is_zip_plus4, zip_digits.str.zfill(9))
)

In [11]:
# Confirm that Zip is now stored with the object dtype
df.dtypes["Zip"]

dtype('O')

In [12]:
# Frequency table of the Employer column, most common values first
df["Employer"].value_counts(sort=True, ascending=False)

NOT EMPLOYED           609
NONE                   321
SELF-EMPLOYED          132
SELF                    33
RETIRED                 32
                      ... 
INTEL CORPORATION        1
SLOCUM & SONS            1
OCPS                     1
HEALTHCARE PARTNERS      1
CARBON FIVE              1
Name: Employer, Length: 519, dtype: int64

In [13]:
# Normalise the Employer column: the spelling variants 'SELF' and 'SELF EMPLOYED'
# are both folded into the single label 'SELF-EMPLOYED'.
df['Employer'] = df['Employer'].replace(['SELF', 'SELF EMPLOYED'], 'SELF-EMPLOYED')

In [14]:
# Re-check the frequency table: 'SELF' and 'SELF EMPLOYED' should no longer
# appear as separate labels.
df["Employer"].value_counts(dropna=True)

NOT EMPLOYED            609
NONE                    321
SELF-EMPLOYED           180
RETIRED                  32
INGRAM BARGE COMPANY     30
                       ... 
SLOCUM & SONS             1
OCPS                      1
HEALTHCARE PARTNERS       1
SEATTLE CITY LIGHT        1
CARBON FIVE               1
Name: Employer, Length: 517, dtype: int64

In [15]:
# Fold 'NOT EMPLOYED' into the 'UNEMPLOYED' label, then show the updated
# frequency table of the Employer column.
df['Employer'] = df['Employer'].replace('NOT EMPLOYED', 'UNEMPLOYED')
df["Employer"].value_counts(sort=True)

UNEMPLOYED              611
NONE                    321
SELF-EMPLOYED           180
RETIRED                  32
INGRAM BARGE COMPANY     30
                       ... 
OCPS                      1
HEALTHCARE PARTNERS       1
ICARE MGT                 1
SEATTLE CITY LIGHT        1
CARBON FIVE               1
Name: Employer, Length: 516, dtype: int64

In [16]:
# Display a statistical overview (only the Amount column appears in the summary).
# NOTE(review): the intent was to infer the maximum allowable individual
# contribution from 'max', but the recorded summary shows max = 400,000 against a
# 75th percentile of 200, and min = -1,000. 'max' therefore looks like an outlier
# and the negative amounts look like refunds/corrections -- verify before treating
# 'max' as the contribution limit.
df.describe()

Unnamed: 0,Amount
count,1818.0
mean,752.127613
std,11601.791128
min,-1000.0
25%,25.0
50%,50.0
75%,200.0
max,400000.0


In [17]:
# Write the cleaned data to a new CSV, without the index column and in the same
# ISO-8859-1 encoding that was used to read the raw file.
df.to_csv(
    path_or_buf="Resources/donors2021.csv",
    encoding="ISO-8859-1",
    index=False,
)