In [1]:
# Dependencies
import pandas as pd

In [2]:
# Name of the CSV file
file = 'Population.csv'

In [3]:
# The correct encoding must be used to read the CSV in pandas
df = pd.read_csv(file, encoding="ISO-8859-1")

In [4]:
# Preview of the DataFrame
# Note that FIELD8 is likely a meaningless column
df.head()

Unnamed: 0,"ï»¿""State""",Density,Pop,LandArea
0,New Jersey,1215.1985,8936570,7354
1,Rhode Island,1021.4313,1056160,1034
2,Massachusetts,894.4359,6976600,7800
3,Connecticut,735.8695,3563080,4842
4,Maryland,626.6735,6083120,9707


In [5]:
df.columns

Index(['ï»¿"State"', 'Density', 'Pop', 'LandArea'], dtype='object')

In [6]:
df.columns = ["state", "Population_Density(p/mi^2)", "Population", "LandArea(mi^2)"]
df.head()

Unnamed: 0,state,Population_Density(p/mi^2),Population,LandArea(mi^2)
0,New Jersey,1215.1985,8936570,7354
1,Rhode Island,1021.4313,1056160,1034
2,Massachusetts,894.4359,6976600,7800
3,Connecticut,735.8695,3563080,4842
4,Maryland,626.6735,6083120,9707


In [7]:
# Delete extraneous column
del df['LandArea(mi^2)']
df.head()

Unnamed: 0,state,Population_Density(p/mi^2),Population
0,New Jersey,1215.1985,8936570
1,Rhode Island,1021.4313,1056160
2,Massachusetts,894.4359,6976600
3,Connecticut,735.8695,3563080
4,Maryland,626.6735,6083120


In [8]:
# Identify incomplete rows
df.count()

state                         50
Population_Density(p/mi^2)    50
Population                    50
dtype: int64

In [9]:
# Drop all rows with missing information
df = df.dropna(how='any')

In [10]:
# Verify dropped rows
df.count()

state                         50
Population_Density(p/mi^2)    50
Population                    50
dtype: int64

In [11]:
# Check data type.
df.dtypes

state                          object
Population_Density(p/mi^2)    float64
Population                      int64
dtype: object

In [12]:
# Display an overview of the Location column
df['state'].value_counts()

Tennessee         1
Vermont           1
Connecticut       1
Hawaii            1
Ohio              1
Wyoming           1
South Carolina    1
Oklahoma          1
Delaware          1
Maine             1
Illinois          1
West Virginia     1
California        1
Louisiana         1
Missouri          1
Minnesota         1
Florida           1
Nevada            1
Mississippi       1
Kansas            1
Arkansas          1
Texas             1
Montana           1
New York          1
North Dakota      1
Michigan          1
Indiana           1
Georgia           1
South Dakota      1
New Hampshire     1
Utah              1
Colorado          1
Massachusetts     1
Pennsylvania      1
Alabama           1
North Carolina    1
Wisconsin         1
Iowa              1
Oregon            1
Washington        1
New Jersey        1
Alaska            1
Idaho             1
Arizona           1
Nebraska          1
Rhode Island      1
Kentucky          1
New Mexico        1
Virginia          1
Maryland          1


In [13]:
# Display a statistical overview
# We can infer the maximum allowable individual contribution from 'max'
df.describe()

Unnamed: 0,Population_Density(p/mi^2),Population
count,50.0,50.0
mean,203.90098,6611970.0
std,267.412813,7480029.0
min,1.2863,567025.0
25%,47.70675,1857762.0
50%,107.7835,4572435.0
75%,219.5669,7692448.0
max,1215.1985,39937500.0


In [14]:
df.to_csv('Population_clean.csv', index = False)
