### Importing required libraries

In [1]:
import pandas as pd
import plotly.express as px

### Reading given text file into a dataframe

In [2]:
# The file is tab-separated (sep="\t"). skiprows=1 skips the file's first line before
# the header row is read -- presumably a title/description line; TODO confirm against the file.
census_cost = pd.read_csv("census_costs.txt", sep="\t", skiprows=1)

In [3]:
# Quick sanity check that the file parsed into the expected columns
# (head() with no argument shows the first five rows).
census_cost.head()

Unnamed: 0,Census Year,Total Population,Census Cost,Average Cost Per Person
0,1790,3929214,"$44,377",1.13 cents
1,1800,5308483,"$66,109",1.24 cents
2,1810,7239881,"$178,445",2.46 cents
3,1820,9633822,"$208,526",2.16 cents
4,1830,12866020,"$378,545",2.94 cents


### Finding out how many rows and columns it has using "shape"

In [4]:
# shape is a (number_of_rows, number_of_columns) tuple.
census_cost.shape

(23, 4)

### Displaying first 10 rows from the dataframe using "head()"

In [5]:
# Passing n=10 widens the preview from the default five rows to the first ten.
census_cost.head(10)

Unnamed: 0,Census Year,Total Population,Census Cost,Average Cost Per Person
0,1790,3929214,"$44,377",1.13 cents
1,1800,5308483,"$66,109",1.24 cents
2,1810,7239881,"$178,445",2.46 cents
3,1820,9633822,"$208,526",2.16 cents
4,1830,12866020,"$378,545",2.94 cents
5,1840,17069458,"$833,371",4.88 cents
6,1850,23191876,"$1,423,351",6.14 cents
7,1860,31443321,"$1,969,377",6.26 cents
8,1870,38558371,"$3,421,198",8.87 cents
9,1880,50155783,"$5,790,678",11.54 cents


### Displaying last 10 rows of the dataframe using "tail()"

In [6]:
# The last ten rows are where the irregular raw values show up
# (e.g. "2010*", "$4.5 Billion", and dollar amounts instead of cents).
census_cost.tail(10)

Unnamed: 0,Census Year,Total Population,Census Cost,Average Cost Per Person
13,1920,105710620,"$25,117,000",23.76 cents
14,1930,122775046,"$40,156,000",32.71 cents
15,1940,131669275,"$67,527,000",51.29 cents
16,1950,151325798,"$91,462,000",60.44 cents
17,1960,179323175,"$127,934,000",71.34 cents
18,1970,203302031,"$247,653,000",$1.22
19,1980,226542199,"$1,078,488,000",$4.76
20,1990,248718301,"$2,492,830,000",$10.02
21,2000,281421906,$4.5 Billion,$15.99
22,2010*,308745538,$13 Billion,$42.11


### Displaying 10 random rows from the dataframe using "sample()"

In [7]:
# Fix the seed so the ten sampled rows are the same on every Restart & Run All;
# without random_state the displayed output changes on each execution.
census_cost.sample(10, random_state=42)

Unnamed: 0,Census Year,Total Population,Census Cost,Average Cost Per Person
17,1960,179323175,"$127,934,000",71.34 cents
16,1950,151325798,"$91,462,000",60.44 cents
1,1800,5308483,"$66,109",1.24 cents
19,1980,226542199,"$1,078,488,000",$4.76
21,2000,281421906,$4.5 Billion,$15.99
3,1820,9633822,"$208,526",2.16 cents
22,2010*,308745538,$13 Billion,$42.11
0,1790,3929214,"$44,377",1.13 cents
4,1830,12866020,"$378,545",2.94 cents
9,1880,50155783,"$5,790,678",11.54 cents


### Finding out the datatypes of the columns in the dataframe using "info()"

In [8]:
# info() prints the index range plus each column's non-null count and dtype;
# used here to see which columns still need converting before numeric analysis.
census_cost.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23 entries, 0 to 22
Data columns (total 4 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Census Year              23 non-null     object
 1   Total Population         23 non-null     object
 2   Census Cost              23 non-null     object
 3   Average Cost Per Person  23 non-null     object
dtypes: object(4)
memory usage: 864.0+ bytes


### Summary statistics of the columns using "describe()"

In [9]:
# At this point every column is still object (string) dtype, so describe()
# reports count/unique/top/freq rather than numeric statistics.
census_cost.describe()

Unnamed: 0,Census Year,Total Population,Census Cost,Average Cost Per Person
count,23,23,23,23
unique,23,23,23,23
top,1820,226542199,"$127,934,000",4.88 cents
freq,1,1,1,1


### Displaying the whole dataframe

In [10]:
# A bare expression on the last line of a cell is rendered as the cell's output;
# this shows the raw (uncleansed) frame.
census_cost

Unnamed: 0,Census Year,Total Population,Census Cost,Average Cost Per Person
0,1790,3929214,"$44,377",1.13 cents
1,1800,5308483,"$66,109",1.24 cents
2,1810,7239881,"$178,445",2.46 cents
3,1820,9633822,"$208,526",2.16 cents
4,1830,12866020,"$378,545",2.94 cents
5,1840,17069458,"$833,371",4.88 cents
6,1850,23191876,"$1,423,351",6.14 cents
7,1860,31443321,"$1,969,377",6.26 cents
8,1870,38558371,"$3,421,198",8.87 cents
9,1880,50155783,"$5,790,678",11.54 cents


### Cleansing the data to convert the year, population, cost, and cost-per-person columns to numeric types

In [11]:
#Defined a function to cleanse the column "Census Year"
def Data_Cleansing_year(Year_string):
    """Convert a census-year label such as "2010*" into an integer year.

    Parameters
    ----------
    Year_string : str or int
        The raw year value. Any "*" footnote markers are removed before
        conversion. Values that are already integers are returned unchanged,
        so the function can safely be applied to an already-cleansed column.

    Returns
    -------
    int
        The year as an integer.

    Raises
    ------
    ValueError
        If the value is not a valid integer once "*" has been removed.
    """
    # str() lets non-string input through; replace() is a no-op when there is
    # no "*", so no explicit membership test is needed.
    return int(str(Year_string).replace("*", ""))

#Defined a function to cleanse the column "Total Population"
def Data_Cleansing_Population(Pop):
    """Convert a population figure such as "3,929,214" into an integer.

    Parameters
    ----------
    Pop : str or int
        The raw population value. Thousands separators (",") are removed
        before conversion. Values that are already integers are returned
        unchanged, so the function can safely be applied to an
        already-cleansed column.

    Returns
    -------
    int
        The population as an integer.

    Raises
    ------
    ValueError
        If the value is not a valid integer once "," has been removed.
    """
    # str() lets non-string input through; replace() is a no-op when there is
    # no ",", so no explicit membership test is needed.
    return int(str(Pop).replace(",", ""))

#Defined a function to cleanse the column "Census Cost"
def Data_Cleanse_Cost(Cost):
    """Convert a dollar amount such as "$44,377" or "$4.5 Billion" to whole dollars.

    Parameters
    ----------
    Cost : str or int
        The raw cost. "$" signs and "," separators are removed. A trailing
        magnitude word (Thousand, Million, Billion, Trillion; case-insensitive)
        scales the number in front of it. Values that are already integers
        are returned unchanged.

    Returns
    -------
    int
        The cost in whole dollars.

    Raises
    ------
    ValueError
        If the remaining text is neither an integer nor a
        "<number> <magnitude word>" pair.
    """
    multipliers = {
        "thousand": 10**3,
        "million": 10**6,
        "billion": 10**9,
        "trillion": 10**12,
    }

    text = str(Cost).replace("$", "").replace(",", "").strip()
    parts = text.split()

    if len(parts) == 2 and parts[1].lower() in multipliers:
        # round() rather than int(): the float product can land a hair below
        # the intended whole number, and int() would truncate it downwards.
        return round(float(parts[0]) * multipliers[parts[1].lower()])

    return int(text)

#Defined a function to cleanse the column "Average Cost Per Person"
def Cleanse_Avg_Cost(Avg_Cost):
    """Convert a per-person cost such as "1.13 cents" or "$1.22" to dollars.

    Parameters
    ----------
    Avg_Cost : str or float
        The raw value. "$" signs and newline characters are removed. A value
        ending in "cent" or "cents" (case-insensitive, with or without a
        preceding space) is divided by 100 to express it in dollars; anything
        else is read as a dollar amount.

    Returns
    -------
    float
        The cost in dollars, rounded to 5 decimal places.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed as a float.
    """
    text = str(Avg_Cost).replace("$", "").replace("\n", "").strip()
    lowered = text.lower()

    # Detect and strip the unit with the same suffix, so the check and the
    # removal can never disagree (e.g. "5cents" or "1 cent").
    if lowered.endswith("cents"):
        dollars = float(text[:-len("cents")]) / 100
    elif lowered.endswith("cent"):
        dollars = float(text[:-len("cent")]) / 100
    else:
        dollars = float(text)

    return round(dollars, 5)

### Using apply() method of data frame to apply the functions to the columns for data cleansing

In [12]:
# Map each raw text column to the function that converts it to a number.
column_cleaners = {
    "Census Year": Data_Cleansing_year,
    "Total Population": Data_Cleansing_Population,
    "Census Cost": Data_Cleanse_Cost,
    "Average Cost Per Person": Cleanse_Avg_Cost,
}

for column, cleaner in column_cleaners.items():
    # Skip columns that are already numeric so that re-running this cell is a
    # no-op instead of feeding numbers to the text-based cleaners.
    if not pd.api.types.is_numeric_dtype(census_cost[column]):
        census_cost[column] = census_cost[column].apply(cleaner)

### Verifying that the datatypes have changed to numeric types

In [13]:
# dtypes lists the dtype of every column; used here to confirm the cleansing
# step turned the text columns into numeric ones.
census_cost.dtypes

Census Year                  int64
Total Population             int64
Census Cost                  int64
Average Cost Per Person    float64
dtype: object

In [14]:
# Display the cleansed frame to inspect the converted values
# (per-person costs are now expressed in dollars, e.g. 1.13 cents -> 0.0113).
census_cost

Unnamed: 0,Census Year,Total Population,Census Cost,Average Cost Per Person
0,1790,3929214,44377,0.0113
1,1800,5308483,66109,0.0124
2,1810,7239881,178445,0.0246
3,1820,9633822,208526,0.0216
4,1830,12866020,378545,0.0294
5,1840,17069458,833371,0.0488
6,1850,23191876,1423351,0.0614
7,1860,31443321,1969377,0.0626
8,1870,38558371,3421198,0.0887
9,1880,50155783,5790678,0.1154


### Find out the summary statistics of the numerical columns - use describe() function

In [15]:
# With numeric columns, describe() reports count, mean, std, min, quartiles and max.
census_cost.describe()

Unnamed: 0,Census Year,Total Population,Census Cost,Average Cost Per Person
count,23.0,23.0,23.0,23.0
mean,1900.0,103921100.0,944558700.0,3.368739
std,67.8233,96226290.0,2830037000.0,9.278241
min,1790.0,3929214.0,44377.0,0.0113
25%,1845.0,20130670.0,1128361.0,0.0551
50%,1900.0,76303390.0,11854000.0,0.1707
75%,1955.0,165324500.0,109698000.0,0.6589
max,2010.0,308745500.0,13000000000.0,42.11


### Use Plotly to plot the average cost per person over time.

In [19]:
# Plot the per-person cost (not the total census cost) against the census year,
# as the section heading asks.
fig = px.line(
    census_cost,
    x="Census Year",
    y="Average Cost Per Person",
    title="Average census cost per person over time",
)
fig.show()

### Use Plotly to draw a scatter plot of Total Population vs. Census Cost.

In [20]:
# Population goes on the x-axis (not the census year) so the plot shows how
# the total cost varies with the number of people counted.
fig = px.scatter(
    census_cost,
    x="Total Population",
    y="Census Cost",
    title="Total population vs. census cost",
)
fig.show()