In [1]:
# Dependencies
import pandas as pd

In [2]:
# Name of the CSV file
file = 'Resources/donors2021_unclean.csv'

In [14]:
x = pd.read_csv(file)
x=x.dropna(how="any")
x.count()
x

Unnamed: 0,Name,Employer,City,State,Zip,Amount,Memo_CD


In [38]:
x = pd.read_csv(file)
x=x.fillna(x['Memo_CD'],0)
x.count()


ValueError: Cannot specify both 'value' and 'method'.

In [3]:
# The correct encoding must be used to read the CSV in pandas
df = pd.read_csv(file)

In [4]:
# Preview of the DataFrame
# Note that Memo_CD is likely a meaningless column
df.head()

Unnamed: 0,Name,Employer,City,State,Zip,Amount,Memo_CD
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618.0,500,
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250,
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250,
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000,
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250,


Series([], Name: Memo_CD, dtype: int64)

In [6]:
# Delete extraneous column
del df['Memo_CD']
df.head()

Unnamed: 0,Name,Employer,City,State,Zip,Amount
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618.0,500
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286.0,250
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221.0,250
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023.0,1000
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214.0,250


In [7]:
# Identify incomplete rows
df.count()

Name        2000
Employer    1820
City        1999
State       1999
Zip         1996
Amount      2000
dtype: int64

In [9]:
# Drop all rows with missing information
df = df.dropna(how='any')
df.count()

Name        1818
Employer    1818
City        1818
State       1818
Zip         1818
Amount      1818
dtype: int64

In [10]:
# Verify dropped rows
df.count()

Name        1818
Employer    1818
City        1818
State       1818
Zip         1818
Amount      1818
dtype: int64

In [17]:
# The Zip column is the wrong data type. It should be a string (object).
df.dtypes

Name         object
Employer     object
City         object
State        object
Zip         float64
Amount        int64
dtype: object

In [21]:
# Use df.astype() method to convert the datatype of the Zip column
df = df.astype({"Zip": int})
df = df.astype({"Zip": str})
df

Unnamed: 0,Name,Employer,City,State,Zip,Amount
0,"CAREY, JAMES",NOT EMPLOYED,HOCKESSIN,DE,197071618,500
1,"OBICI, SILVANA",STONY BROOK,PORT JEFFERSON STATION,NY,117764286,250
2,"MAISLIN, KAREN",RETIRED,WILLIAMSVILLE,NY,14221,250
3,"MCCLELLAND, CARTER AND STEPHANIE",UNION SQUARE ADVISORS,NEW YORK,NY,10023,1000
4,"MCCLUSKEY, MARTHA",STATE UNIVERSITY OF NEW YORK,BUFFALO,NY,14214,250
...,...,...,...,...,...,...
1995,"KROMHOUT, WILEEN",UCLA,PORTER RANCH,CA,913263018,57
1996,"KRONHEIM, NANCY",NOT EMPLOYED,ARLINGTON,VA,222041332,100
1997,"KROOP, STEVE","FLYTEC, USA",HIGH SPRINGS,FL,326431608,39
1998,"KRSEK, PAUL","5T WEALTH, LLC",NAPA,CA,945581818,20


In [24]:
# Verify that the Zip column datatype has been made an object
df.dtypes

Name        object
Employer    object
City        object
State       object
Zip         object
Amount       int64
dtype: object

In [25]:
# Display an overview of the Employers column
df['Employer'].value_counts()

NOT EMPLOYED            609
NONE                    321
SELF-EMPLOYED           132
SELF                     33
RETIRED                  32
                       ... 
CORNELL UNIVERSITY        1
URBAN PHILANTHROPIES      1
UNITY CONSULTING          1
MOEBIUS MODELS            1
FORM4 ARCHITECTURE        1
Name: Employer, Length: 519, dtype: int64

In [26]:
# Clean up Employer category. Replace 'SELF' and 'SELF EMPLOYED' with 'SELF-EMPLOYED'
df['Employer'] = df['Employer'].replace({'SELF': 'SELF-EMPLOYED', 'SELF EMPLOYED': 'SELF-EMPLOYED'})

In [27]:
# Verify clean-up.
df['Employer'].value_counts()

NOT EMPLOYED            609
NONE                    321
SELF-EMPLOYED           180
RETIRED                  32
INGRAM BARGE COMPANY     30
                       ... 
CORNELL UNIVERSITY        1
URBAN PHILANTHROPIES      1
UNITY CONSULTING          1
MOEBIUS MODELS            1
FORM4 ARCHITECTURE        1
Name: Employer, Length: 517, dtype: int64

In [28]:
df['Employer'] = df['Employer'].replace({'NOT EMPLOYED': 'UNEMPLOYED'})
df['Employer'].value_counts()

UNEMPLOYED              611
NONE                    321
SELF-EMPLOYED           180
RETIRED                  32
INGRAM BARGE COMPANY     30
                       ... 
CORNELL UNIVERSITY        1
URBAN PHILANTHROPIES      1
UNITY CONSULTING          1
MOEBIUS MODELS            1
FORM4 ARCHITECTURE        1
Name: Employer, Length: 516, dtype: int64

In [29]:
# Display a statistical overview
# We can infer the maximum allowable individual contribution from 'max'
df.describe()

Unnamed: 0,Amount
count,1818.0
mean,752.127613
std,11601.791128
min,-1000.0
25%,25.0
50%,50.0
75%,200.0
max,400000.0


In [30]:
df.to_csv("Resources/donors2021.csv", index=False, encoding="ISO-8859-1")