In [None]:
# Install the following modules
!pip install pandas_datareader

In [None]:
# Install the openpyxl package to handle excel files properly
!pip install openpyxl

In [None]:
import warnings
warnings.simplefilter("ignore")

In [None]:
# Clears all variable values previously set
from IPython import get_ipython
get_ipython().magic('reset -sf')

In [None]:
!pip install numpy

In [None]:
!pip install pandas

In [1]:
# Provides ways to work with large multidimensional arrays
import numpy as np 
# Allows for further data manipulation and analysis
import pandas as pd

# In Anaconda -> Environments -> Not Installed -> pandas-datareader -> Apply
from pandas_datareader import data as web # Reads stock data 
import matplotlib.pyplot as plt # Plotting
import matplotlib.dates as mdates # Styling dates
%matplotlib inline

# Pandas

Pandas provides numerous tools to work with tabular data like you'd find in spreadsheets or databases. It is widely used for data preparation, cleaning, and analysis. It can work with a wide variety of data and provides many visualization options. It is built on top of NumPy.

## Read Data from a CSV

In [None]:
def get_df_from_csv(filename):
    try:
        df = pd.read_csv(filename, index_col='Date', parse_dates=True)
    except FileNotFoundError:
        pass
        print("File Does not Exist")
    else:
        print(f"The {filename} File was read successfully")
        return df

In [None]:
#ticker = "MSFT2"
#df = pd.read_csv(ticker + '.csv', index_col='Date', 
#                         parse_dates=True)

In [None]:
csv_file = 'MSFT.csv'
msft_df = get_df_from_csv(csv_file)
msft_df

In [None]:
## Pandas Dataframe attributes
msft_df_attributes = [x  for x in (dir(msft_df)) if x[0] != '_' ]
print(msft_df_attributes)

In [None]:
# Getting an idea of the structure of the dataframe
msft_df.info()

In [None]:
# int8 / uint8 : consumes 1 byte of memory, range between -128/127 or 0/255

# bool : consumes 1 byte, true or false

# float16 / int16 / uint16: consumes 2 bytes of memory, range between -32,768 and 32,767 or 0/65,535

# float32 / int32 / uint32 : consumes 4 bytes of memory, range between -2,147,483,648 and 2,147,483,647

# float64 / int64 / uint64: consumes 8 bytes of memory

In [None]:
# Getting all the available columns
msft_df.columns

In [None]:
# Summary of your dataframe
msft_df.describe()

In [None]:
# The float64 is too big, reduce it to float16, there are float16, float32, float64
# Changing all the columns with the object data type to string
number_columns = ['Adj Close', 'daily_return']
for column in number_columns:
    msft_df[column] = msft_df[column].astype ("float32")
    
msft_df.info()

In [None]:
msft_df.head(5) # Returns the first five rows

In [None]:
# Evaluating number of NaN values in our dataframe
msft_df.isna().sum()

In [None]:
# Removing the row containing the NaN values
msft_df = msft_df.dropna(axis=0) # Drop the rows containing NaN values, x=1 is for column
msft_df

In [None]:
# Getting a summary of the data
msft_df.describe()

## Read Data from Excel

In [None]:
def get_df_from_excel(file):
    try:
        df = pd.read_excel(file)
    except FileNotFoundError:
        print("File Doesn't Exist")
    else:
        print(f"The {file} Excel File was read successfully")
        return df

In [None]:
# You may have to run this in the Qt Console : pip install openpyxl
file = "Wilshire-5000-Stocks.xlsx"
w_stocks = get_df_from_excel(file)
w_stocks

In [None]:
# Getting an idea of the structure of the dataframe
w_stocks.info()

In [None]:
# Getting all the available columns
w_stocks.columns

In [None]:
# Changing all the columns with the object data type to string
number_columns = ['Ticker', 'Company']
for column in number_columns:
    w_stocks[column] = w_stocks[column].astype ("string")
    
w_stocks.info()

In [None]:
# Getting a summary of the data
w_stocks.describe()

## Read Data from HTML

In [None]:
gov_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_current_United_States_governors")
gov_data

In [None]:
# We can define that we want the 2nd table on the page
gov_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_current_United_States_governors")[1]
gov_data.head(5)

In [None]:
# Delete a level of a multilevel column name
gov_data.columns = gov_data.columns.droplevel()
gov_data.head(2)

In [None]:
gov_data.head(2)

In [None]:
#List all the available columns
gov_data.columns

In [None]:
# Delete all the columns containining only "NaN" and "List" Value
gov_data = gov_data.drop(['Image', 'Party', 'Unnamed: 9_level_1','Unnamed: 10_level_1'], axis=1)
gov_data.head(5)

## Rename the "Party.1" column

In [None]:
# You could add additional with commas between {}
gov_data = gov_data.rename(columns={'Party.1': 'Party'})

gov_data.head(5)

In [None]:
gov_data[:5]

## Remove Characters in Columns

In [None]:
# Remove parentheses and whats inside them
gov_data['End of term'] = gov_data['End of term'].str.replace(r"\(.*\)","", regex=True) 
# replaces any opening and closing bracket and everything inside with nothing("")
gov_data[:5]

In [None]:
# Extracting All the entire columns headers
print(list(gov_data.columns))

In [None]:
# Removing parentheses and whats inside them for all the Columns
df_headers = ['State', 'Governor', 'Party', 'Born', 'Prior public experience', 'Inauguration', 'End of term']

for header in df_headers:
    gov_data[header] = gov_data[header].str.replace(r"\(.*\)","", regex=True) 
# replaces any opening and closing bracket and everything inside with nothing("")
gov_data[:5]

In [None]:
# Remove parentheses and the age text inside the 'Born' column


In [None]:
gov_data[31:48:16]

In [None]:
# Remove square parentheses and whats inside them
gov_data['End of term'] = gov_data['End of term'].str.replace(r"\[.*\]","", regex=True) 
# replaces any opening and closing bracket and everything inside with nothing("")
gov_data[31:48:16]

In [None]:
# Removing square parentheses and whats inside them for all the Columns
df_headers = ['State', 'Governor', 'Party', 'Born', 'Prior public experience', 'Inauguration', 'End of term']

for header in df_headers:
    gov_data[header] = gov_data[header].str.replace(r"\[.*\]","", regex=True) 
# replaces any opening and closing bracket and everything inside with nothing("")

gov_data[31:48:16]

In [None]:
# "^–*.\w$

In [None]:
# Investigating the Party Column in line 22
gov_data[22:23]

In [None]:
# Remove the dash(-) symbol parentheses and what follows in line 22 and replace with nothing
gov_data ['Party'] = gov_data ['Party'].astype (str)
gov_data['Party'] = gov_data['Party'].str.replace(r"–Farmer–Labor","") 
# replaces any opening and closing bracket and everything inside with nothing("")
gov_data[22:23]

In [None]:
# Investigating the Governor Column in line 3 and 30
gov_data[3:31:27]

In [None]:
# Replacing the middle name  of the two Governors with initials
gov_data['Governor'] = gov_data['Governor'].str.replace(r"Sarah Huckabee Sanders","Sarah H. Sanders") 
gov_data['Governor'] = gov_data['Governor'].str.replace(r"Michelle Lujan Grisham","Michelle L. Grisham")
gov_data[3:31:27]

In [None]:
# Confirming the structure of our dataframe
gov_data.info()

In [None]:
# Confirming the number of columns
print(f"The number of columns are: {len(gov_data.columns)}")
gov_data.columns

In [None]:
# Changing all the columns with the object data type to string
number_columns = ['State', 'Governor', 'Party', 'Born', 'Prior public experience']
for column in number_columns:
    gov_data[column] = gov_data[column].astype ("string")
    
gov_data.info()

In [None]:
# Changing the columns with the object data type to date
date_columns = ['Born','Inauguration']
for column in date_columns:
    gov_data[column] = pd.to_datetime(gov_data[column],format="mixed")
    
gov_data.info()

In [None]:
number_columns = ['End of term']
for column in number_columns:
    gov_data[column] = gov_data[column].astype ('int16') # int16 ranges from -32,686 to 32,687
    
gov_data.info()

In [None]:
gov_data[0:5]

In [None]:
# Formatting  the datetime in the Born and Inauguration columns into Nigeria format
# Changing the columns with the object data type to date
date_columns = ['Born','Inauguration']
for column in date_columns:
    gov_data[column] = gov_data[column].dt.strftime('%d-%b-%Y') # e.g 10-Jan-2023

gov_data[0:5]

In [None]:
# Remove square parentheses and whats inside them
gov_data['State'] = gov_data['State'].str.replace(r"\(.*\)","", regex=True) 
# replaces any opening and closing bracket and everything inside with nothing("")
gov_data

In [None]:
# Remove square parentheses and whats inside them
gov_data['Born'] = gov_data['Born'].str.replace(r"\(.*\)","", regex=True) 
# replaces any opening and closing bracket and everything inside with nothing("")
gov_data

In [None]:
# Delete all the columns containining only "NaN" and "List" Value
gov_data = gov_data.drop(['Prior public experience'], axis=1)
gov_data.head(5)

In [None]:
# Confirming that the right data are in few columns

In [None]:
# Checking the State Column 
print(list(gov_data['State'].unique()))

In [None]:
# Checking the State Column 
print(list(gov_data['Party'].unique()))

In [None]:
gov_data['Party'] = gov_data['Party'].str.replace(r" ","") 
print(list(gov_data['Party'].unique()))

In [None]:
# Checking the State Column 
print(list(gov_data['End of term'].unique()))

In [None]:
# Checking Final Summary before Exporting
gov_data.head(5)

In [None]:
# Exporting our gov_data dataframe as CSV File
try:
    gov_data.to_excel("US_Governors_List_2023_v3.xlsx",index=False)
except:
    print("Error: Unable to convert dataframe to a excel file")
else:
    print("The Excel file successfully created.")


In [None]:
pwd

## US Demographics

In [None]:
# You can also search for phrases in the table
d_data = pd.read_html("https://en.wikipedia.org/wiki/Demographics_of_the_United_States", 
                      match="Average population")
d_data

In [2]:
# You can also search for phrases in the table
d_data = pd.read_html("https://en.wikipedia.org/wiki/Demographics_of_the_United_States", 
                      match="Average population")[0]
d_data

Unnamed: 0.1,Unnamed: 0,Average population,Live births[92],Deaths,Natural change,"Crude birth rate (per 1,000)","Crude death rate (per 1,000)[93]","Natural change (per 1,000)","Crude migration change (per 1,000)",Total fertility rate[fn 1][94]
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,,2.19
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,-0.5,2.15
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,-1.5,2.17
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,-0.9,2.22
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,-0.1,2.17
...,...,...,...,...,...,...,...,...,...,...
83,2018[97][98],326838199,3791712,2839205,952507,11.6,8.7,2.9,2.4,1.729
84,2019,328329953,3747540,2854858,892682,11.4,8.7,2.7,1.8,1.706
85,2020[99][100],331501080,3613647,3383729,229918,10.9,10.3,0.6,8.9,1.641
86,2021[101][102],331893745,3664292,3464231,200061,11.0,10.4,0.5,0.7,1.664


## Replace Spaces in Column Names

In [3]:
d_data.columns = [x.replace(' ', '_') for x in d_data.columns]
d_data

Unnamed: 0,Unnamed:_0,Average_population,Live_births[92],Deaths,Natural_change,"Crude_birth_rate_(per_1,000)","Crude_death_rate_(per_1,000)[93]","Natural_change_(per_1,000)","Crude_migration_change_(per_1,000)",Total_fertility_rate[fn_1][94]
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,,2.19
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,-0.5,2.15
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,-1.5,2.17
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,-0.9,2.22
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,-0.1,2.17
...,...,...,...,...,...,...,...,...,...,...
83,2018[97][98],326838199,3791712,2839205,952507,11.6,8.7,2.9,2.4,1.729
84,2019,328329953,3747540,2854858,892682,11.4,8.7,2.7,1.8,1.706
85,2020[99][100],331501080,3613647,3383729,229918,10.9,10.3,0.6,8.9,1.641
86,2021[101][102],331893745,3664292,3464231,200061,11.0,10.4,0.5,0.7,1.664


## Remove Characters in Columns

In [4]:
# Remove parentheses and whats inside them
d_data.columns = d_data.columns.str.replace(r"\(.*\)","", regex=True) 
# replaces any opening and closing bracket and everything inside with nothing("")
d_data.head(5)

Unnamed: 0,Unnamed:_0,Average_population,Live_births[92],Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_[93],Natural_change_,Crude_migration_change_,Total_fertility_rate[fn_1][94]
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,,2.19
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,-0.5,2.15
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,-1.5,2.17
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,-0.9,2.22
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,-0.1,2.17


In [5]:
# Remove the square brackets and whatever is inside them
d_data.columns = d_data.columns.str.replace(r"\[.*\]","", regex=True)
# replaces any opening and closing square bracket and everything inside with nothing("")
d_data.head(5)

Unnamed: 0,Unnamed:_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_,Natural_change_,Crude_migration_change_,Total_fertility_rate
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,,2.19
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,-0.5,2.15
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,-1.5,2.17
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,-0.9,2.22
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,-0.1,2.17


## Rename Columns

In [6]:
# You could add additional with commas between {}
d_data = d_data.rename(columns={'Unnamed:_0': 'Year'})

d_data

Unnamed: 0,Year,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate_,Crude_death_rate_,Natural_change_,Crude_migration_change_,Total_fertility_rate
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,,2.19
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,-0.5,2.15
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,-1.5,2.17
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,-0.9,2.22
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,-0.1,2.17
...,...,...,...,...,...,...,...,...,...,...
83,2018[97][98],326838199,3791712,2839205,952507,11.6,8.7,2.9,2.4,1.729
84,2019,328329953,3747540,2854858,892682,11.4,8.7,2.7,1.8,1.706
85,2020[99][100],331501080,3613647,3383729,229918,10.9,10.3,0.6,8.9,1.641
86,2021[101][102],331893745,3664292,3464231,200061,11.0,10.4,0.5,0.7,1.664


## Remove any trailing underscores(_) from column names

In [7]:
d_data.columns = d_data.columns.str.replace(r"\_$","", regex=True) # ^_ for begining of column names with an _
# replaces any trailing underscores in the column names and replace with nothing("")
d_data

Unnamed: 0,Year,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Natural_change.1,Crude_migration_change,Total_fertility_rate
0,1935,127250000,2377000,1392752,984248,18.7,10.9,7.7,,2.19
1,1936,128053000,2355000,1479228,875772,18.4,11.5,6.8,-0.5,2.15
2,1937,128825000,2413000,1450427,962573,18.7,11.2,7.5,-1.5,2.17
3,1938,129825000,2496000,1381391,1114609,19.2,10.6,8.6,-0.9,2.22
4,1939,130880000,2466000,1387897,1078103,18.8,10.6,8.2,-0.1,2.17
...,...,...,...,...,...,...,...,...,...,...
83,2018[97][98],326838199,3791712,2839205,952507,11.6,8.7,2.9,2.4,1.729
84,2019,328329953,3747540,2854858,892682,11.4,8.7,2.7,1.8,1.706
85,2020[99][100],331501080,3613647,3383729,229918,10.9,10.3,0.6,8.9,1.641
86,2021[101][102],331893745,3664292,3464231,200061,11.0,10.4,0.5,0.7,1.664


In [8]:
d_data.columns

Index(['Year', 'Average_population', 'Live_births', 'Deaths', 'Natural_change',
       'Crude_birth_rate', 'Crude_death_rate', 'Natural_change',
       'Crude_migration_change', 'Total_fertility_rate'],
      dtype='object')

In [16]:
tuple(d_data)

('Year',
 'Average_population',
 'Live_births',
 'Deaths',
 'Natural_change',
 'Crude_birth_rate',
 'Crude_death_rate',
 'Natural_change',
 'Crude_migration_change',
 'Total_fertility_rate')

In [17]:
# Checking the summary of the table
d_data.describe()

Unnamed: 0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Natural_change.1,Crude_migration_change
count,88.0,88.0,88.0,88.0,88.0,88.0,88.0,87.0
mean,225685000.0,3696468.0,2029804.0,1667921.0,17.411364,9.167045,8.293182,2.67931
std,64857220.0,491684.6,488245.2,522764.9,4.357935,0.827504,4.016855,5.927981
min,127250000.0,2355000.0,1381391.0,200061.0,10.9,7.9,0.5,-22.6
25%,169536500.0,3547217.0,1615965.0,1394533.0,14.275,8.575,5.775,1.8
50%,223331000.0,3820000.0,1968474.0,1644381.0,15.8,8.85,7.1,3.0
75%,282864000.0,4030857.0,2399049.0,1902447.0,20.575,9.6,11.175,3.85
max,333287600.0,4316234.0,3464231.0,2666872.0,26.6,11.5,16.5,41.0


In [18]:
# Extracting the data information for the dataframe
d_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    88 non-null     object 
 1   Average_population      88 non-null     int64  
 2   Live_births             88 non-null     int64  
 3   Deaths                  88 non-null     int64  
 4   Natural_change          88 non-null     int64  
 5   Crude_birth_rate        88 non-null     float64
 6   Crude_death_rate        88 non-null     float64
 7   Natural_change          88 non-null     float64
 8   Crude_migration_change  87 non-null     float64
 9   Total_fertility_rate    88 non-null     object 
dtypes: float64(4), int64(4), object(2)
memory usage: 7.0+ KB


In [19]:
# Selecting column headers for integers
int_data_columns = list(d_data.select_dtypes(include='int').columns) # number, float, bool, object, datetime
print(int_data_columns)

['Average_population', 'Live_births', 'Deaths', 'Natural_change']


In [20]:
for column in int_data_columns:
    d_data[column] = d_data[column].astype ('int32') # int32 ranges from -2,147,483,648 to 2,147,483,647

d_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    88 non-null     object 
 1   Average_population      88 non-null     int32  
 2   Live_births             88 non-null     int32  
 3   Deaths                  88 non-null     int32  
 4   Natural_change          88 non-null     int32  
 5   Crude_birth_rate        88 non-null     float64
 6   Crude_death_rate        88 non-null     float64
 7   Natural_change          88 non-null     int32  
 8   Crude_migration_change  87 non-null     float64
 9   Total_fertility_rate    88 non-null     object 
dtypes: float64(3), int32(5), object(2)
memory usage: 5.3+ KB


In [21]:
# Selecting column headers for float
float_data_columns = list(d_data.select_dtypes(include='float').columns)
print(float_data_columns)

['Crude_birth_rate', 'Crude_death_rate', 'Crude_migration_change']


In [22]:
for column in float_data_columns:
    d_data[column] = d_data[column].astype ('float16') #

d_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    88 non-null     object 
 1   Average_population      88 non-null     int32  
 2   Live_births             88 non-null     int32  
 3   Deaths                  88 non-null     int32  
 4   Natural_change          88 non-null     int32  
 5   Crude_birth_rate        88 non-null     float16
 6   Crude_death_rate        88 non-null     float16
 7   Natural_change          88 non-null     int32  
 8   Crude_migration_change  87 non-null     float16
 9   Total_fertility_rate    88 non-null     object 
dtypes: float16(3), int32(5), object(2)
memory usage: 3.7+ KB


In [23]:
# Selecting column headers for objects
object_data_columns = list(d_data.select_dtypes(include='object').columns)
print(object_data_columns)

['Year', 'Total_fertility_rate']


In [24]:
#Investigating the unique values for year column
unique_years = d_data.Year.unique()
print(unique_years)

['1935' '1936' '1937' '1938' '1939' '1940' '1941' '1942' '1943' '1944'
 '1945' '1946' '1947' '1948' '1949' '1950' '1951' '1952' '1953' '1954'
 '1955' '1956' '1957' '1958' '1959' '1960' '1961' '1962' '1963' '1964'
 '1965' '1966' '1967' '1968' '1969' '1970' '1971' '1972' '1973' '1974'
 '1975' '1976' '1977' '1978' '1979' '1980' '1981' '1982' '1983' '1984'
 '1985' '1986' '1987' '1988' '1989' '1990' '1991' '1992' '1993' '1994'
 '1995' '1996' '1997' '1998' '1999' '2000' '2001' '2002' '2003' '2004'
 '2005' '2006' '2007' '2008' '2009' '2010' '2011' '2012' '2013' '2014'
 '2015' '2016' '2017[95][96]' '2018[97][98]' '2019' '2020[99][100]'
 '2021[101][102]' '2022[103][104]']


In [25]:
# Cleaning up the square brackets and changing to the appropriate datatype
# Removing square parentheses and whats inside them for all the Columns
df_headers = ['Year']
for header in df_headers:
    d_data[header] = d_data[header].str.replace(r"\[.*\]","", regex=True)  # remove the square paretheses
    d_data[header] = d_data[header].astype("int16") # convert it data between -32,000 to +32,000

unique_years = d_data.Year.unique()
print(unique_years)

[1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948
 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962
 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976
 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990
 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004
 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018
 2019 2020 2021 2022]


In [26]:
#Investigating the unique values for year column
unique_fertility_rate = d_data['Total_fertility_rate'].unique()
print(unique_fertility_rate)

['2.19' '2.15' '2.17' '2.22' '2.301' '2.399' '2.628' '2.718' '2.568'
 '2.491' '2.943' '3.274' '3.109' '3.110' '3.091' '3.269' '3.358' '3.424'
 '3.543' '3.580' '3.689' '3.767' '3.701' '3.670' '3.654' '3.629' '3.474'
 '3.333' '3.208' '2.928' '2.736' '2.578' '2.477' '2.465' '2.480' '2.266'
 '2.010' '1.879' '1.835' '1.774' '1.738' '1.789' '1.760' '1.808' '1.839'
 '1.812' '1.827' '1.799' '1.806' '1.844' '1.837' '1.872' '1.934' '2.014'
 '2.081' '2.062' '2.046' '2.019' '2.001' '1.978' '1.976' '1.971' '1.999'
 '2.007' '2.056' '2.030' '2.020' '2.047' '2.051' '2.057' '2.108' '2.120'
 '2.072' '2.002' '1.931' '1.894' '1.880' '1.857' '1.862' '1.843' '1.820'
 '1.765' '1.729' '1.706' '1.641' '1.664' '1.665[105]']


In [27]:
# Cleaning up the square brackets and changing to the appropriate datatype
# Removing square parentheses and whats inside them for all the Columns
df_headers = ['Total_fertility_rate']
for header in df_headers:
    d_data[header] = d_data[header].str.replace(r"\[.*\]","", regex=True)  # remove the square paretheses
    d_data[header] = d_data[header].astype("float16") # convert it data between -32,000 to +32,000

unique_fertility_rate = d_data['Total_fertility_rate'].unique()
print(unique_fertility_rate)

[2.19  2.15  2.17  2.22  2.3   2.398 2.629 2.719 2.568 2.49  2.943 3.273
 3.11  3.092 3.27  3.357 3.424 3.543 3.58  3.69  3.768 3.701 3.67  3.654
 3.629 3.475 3.332 3.207 2.928 2.736 2.578 2.477 2.465 2.48  2.266 2.01
 1.879 1.835 1.774 1.738 1.789 1.76  1.808 1.839 1.812 1.827 1.799 1.806
 1.844 1.837 1.872 1.934 2.014 2.08  2.062 2.047 2.02  2.002 1.978 1.976
 1.971 1.999 2.008 2.057 2.03  2.05  2.107 2.12  2.072 1.931 1.894 1.88
 1.857 1.862 1.843 1.82  1.765 1.729 1.706 1.641 1.664 1.665]


In [28]:
# Checking the summary again to establish the data are valid
d_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88 entries, 0 to 87
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Year                    88 non-null     int16  
 1   Average_population      88 non-null     int32  
 2   Live_births             88 non-null     int32  
 3   Deaths                  88 non-null     int32  
 4   Natural_change          88 non-null     int32  
 5   Crude_birth_rate        88 non-null     float16
 6   Crude_death_rate        88 non-null     float16
 7   Natural_change          88 non-null     int32  
 8   Crude_migration_change  87 non-null     float16
 9   Total_fertility_rate    88 non-null     float16
dtypes: float16(4), int16(1), int32(5)
memory usage: 2.7 KB


In [29]:
# Evaluating the summary of the dataframe
d_data.describe()

Unnamed: 0,Year,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Natural_change.1,Crude_migration_change,Total_fertility_rate
count,88.0,88.0,88.0,88.0,88.0,88.0,88.0,88.0,87.0,88.0
mean,1978.5,225685000.0,3696468.0,2029804.0,1667921.0,17.40625,9.164062,7.818182,2.679688,2.339844
std,25.547342,64857220.0,491684.6,488245.2,522764.9,4.355469,0.827148,4.064269,5.925781,0.629883
min,1935.0,127250000.0,2355000.0,1381391.0,200061.0,10.898438,7.898438,0.0,-22.59375,1.640625
25%,1956.75,169536500.0,3547217.0,1615965.0,1394533.0,14.273438,8.576172,5.0,1.799805,1.861084
50%,1978.5,223331000.0,3820000.0,1968474.0,1644381.0,15.796875,8.847656,7.0,3.0,2.053711
75%,2000.25,282864000.0,4030857.0,2399049.0,1902447.0,20.578125,9.601562,11.0,3.851562,2.651367
max,2022.0,333287600.0,4316234.0,3464231.0,2666872.0,26.59375,11.5,16.0,41.0,3.767578


In [30]:
# Version 1 for single colums with no space in the column name
d_data.Live_births

0     2377000
1     2355000
2     2413000
3     2496000
4     2466000
       ...   
83    3791712
84    3747540
85    3613647
86    3664292
87    3667758
Name: Live_births, Length: 88, dtype: int32

In [31]:
# Version 2 for single colums with space or no space in the column name
d_data['Deaths']

0     1392752
1     1479228
2     1450427
3     1381391
4     1387897
       ...   
83    2839205
84    2854858
85    3383729
86    3464231
87    3273705
Name: Deaths, Length: 88, dtype: int32

## Make a Column an Index

In [32]:
d_data.set_index('Year', inplace=True)
d_data

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Natural_change,Crude_migration_change,Total_fertility_rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1935,127250000,2377000,1392752,984248,18.703125,10.898438,7,,2.189453
1936,128053000,2355000,1479228,875772,18.406250,11.500000,6,-0.500000,2.150391
1937,128825000,2413000,1450427,962573,18.703125,11.203125,7,-1.500000,2.169922
1938,129825000,2496000,1381391,1114609,19.203125,10.601562,8,-0.899902,2.220703
1939,130880000,2466000,1387897,1078103,18.796875,10.601562,8,-0.099976,2.169922
...,...,...,...,...,...,...,...,...,...
2018,326838199,3791712,2839205,952507,11.601562,8.703125,2,2.400391,1.728516
2019,328329953,3747540,2854858,892682,11.398438,8.703125,2,1.799805,1.706055
2020,331501080,3613647,3383729,229918,10.898438,10.296875,0,8.898438,1.640625
2021,331893745,3664292,3464231,200061,11.000000,10.398438,0,0.700195,1.664062


## Grab Data from Multiple Columns

In [33]:
d_data[["Live_births", "Deaths"]] # Multiple columns must be in double square bracket

Unnamed: 0_level_0,Live_births,Deaths
Year,Unnamed: 1_level_1,Unnamed: 2_level_1
1935,2377000,1392752
1936,2355000,1479228
1937,2413000,1450427
1938,2496000,1381391
1939,2466000,1387897
...,...,...
2018,3791712,2839205
2019,3747540,2854858
2020,3613647,3383729
2021,3664292,3464231


## Grab a Row

In [34]:
# Version 1
d_data.loc[2020]

Average_population        3.315011e+08
Live_births               3.613647e+06
Deaths                    3.383729e+06
Natural_change            2.299180e+05
Crude_birth_rate          1.089844e+01
Crude_death_rate          1.029688e+01
Natural_change            0.000000e+00
Crude_migration_change    8.898438e+00
Total_fertility_rate      1.640625e+00
Name: 2020, dtype: float64

In [35]:
d_data.iloc[85] # row 86 is 2021 (the last), row 85 is 2020

Average_population        3.315011e+08
Live_births               3.613647e+06
Deaths                    3.383729e+06
Natural_change            2.299180e+05
Crude_birth_rate          1.089844e+01
Crude_death_rate          1.029688e+01
Natural_change            0.000000e+00
Crude_migration_change    8.898438e+00
Total_fertility_rate      1.640625e+00
Name: 2020, dtype: float64

In [91]:
d_data[86:86] # Extracting the row horizontally

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Crude_migration_change,Total_fertility_rate,Pop_Growth
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2020,331501080,3613647,3383729,229918,10.898438,10.296875,8.898438,1.640625,229918


In [37]:
d_data.tail(5)

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Natural_change,Crude_migration_change,Total_fertility_rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018,326838199,3791712,2839205,952507,11.601562,8.703125,2,2.400391,1.728516
2019,328329953,3747540,2854858,892682,11.398438,8.703125,2,1.799805,1.706055
2020,331501080,3613647,3383729,229918,10.898438,10.296875,0,8.898438,1.640625
2021,331893745,3664292,3464231,200061,11.0,10.398438,0,0.700195,1.664062
2022,333287557,3667758,3273705,394053,11.0,9.796875,1,3.0,1.665039


In [38]:
d_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 88 entries, 1935 to 2022
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Average_population      88 non-null     int32  
 1   Live_births             88 non-null     int32  
 2   Deaths                  88 non-null     int32  
 3   Natural_change          88 non-null     int32  
 4   Crude_birth_rate        88 non-null     float16
 5   Crude_death_rate        88 non-null     float16
 6   Natural_change          88 non-null     int32  
 7   Crude_migration_change  87 non-null     float16
 8   Total_fertility_rate    88 non-null     float16
dtypes: float16(4), int32(5)
memory usage: 3.9 KB


In [39]:
d_data.columns

Index(['Average_population', 'Live_births', 'Deaths', 'Natural_change',
       'Crude_birth_rate', 'Crude_death_rate', 'Natural_change',
       'Crude_migration_change', 'Total_fertility_rate'],
      dtype='object')

In [40]:
d_data.head(5)

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Natural_change,Crude_migration_change,Total_fertility_rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1935,127250000,2377000,1392752,984248,18.703125,10.898438,7,,2.189453
1936,128053000,2355000,1479228,875772,18.40625,11.5,6,-0.5,2.150391
1937,128825000,2413000,1450427,962573,18.703125,11.203125,7,-1.5,2.169922
1938,129825000,2496000,1381391,1114609,19.203125,10.601562,8,-0.899902,2.220703
1939,130880000,2466000,1387897,1078103,18.796875,10.601562,8,-0.099976,2.169922


In [42]:
# Keep only columns if they haven't used the same name prior.
# The code removes second columns with duplicated headings
d_data = d_data.loc[:,~d_data.columns.duplicated()] # ~ tilde
d_data.head(5)

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Crude_migration_change,Total_fertility_rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1935,127250000,2377000,1392752,984248,18.703125,10.898438,,2.189453
1936,128053000,2355000,1479228,875772,18.40625,11.5,-0.5,2.150391
1937,128825000,2413000,1450427,962573,18.703125,11.203125,-1.5,2.169922
1938,129825000,2496000,1381391,1114609,19.203125,10.601562,-0.899902,2.220703
1939,130880000,2466000,1387897,1078103,18.796875,10.601562,-0.099976,2.169922


In [103]:
# Specific columns can selected based on the column index from index 0 to the last index
d_data.iloc[0:,[0,2,7]]

Unnamed: 0_level_0,Average_population,Deaths,Total_fertility_rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1936,128053000,1479228,2.150391
1937,128825000,1450427,2.169922
1938,129825000,1381391,2.220703
1939,130880000,1387897,2.169922
1940,131930000,1417269,2.300781
...,...,...,...
2018,326838199,2839205,1.728516
2019,328329953,2854858,1.706055
2020,331501080,3383729,1.640625
2021,331893745,3464231,1.664062


In [46]:
d_data.head(5)

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Crude_migration_change,Total_fertility_rate
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1935,127250000,2377000,1392752,984248,18.703125,10.898438,,2.189453
1936,128053000,2355000,1479228,875772,18.40625,11.5,-0.5,2.150391
1937,128825000,2413000,1450427,962573,18.703125,11.203125,-1.5,2.169922
1938,129825000,2496000,1381391,1114609,19.203125,10.601562,-0.899902,2.220703
1939,130880000,2466000,1387897,1078103,18.796875,10.601562,-0.099976,2.169922


In [47]:
d_data.to_excel("For inspection.xlsx")
print("Dataframe converted to Excel file successfully..")

Dataframe converted to Excel file successfully..


In [48]:
# Create a column showing population growth for each year
d_data['Pop_Growth'] = d_data["Live_births"] - d_data["Deaths"]
d_data

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Crude_migration_change,Total_fertility_rate,Pop_Growth
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1935,127250000,2377000,1392752,984248,18.703125,10.898438,,2.189453,984248
1936,128053000,2355000,1479228,875772,18.406250,11.500000,-0.500000,2.150391,875772
1937,128825000,2413000,1450427,962573,18.703125,11.203125,-1.500000,2.169922,962573
1938,129825000,2496000,1381391,1114609,19.203125,10.601562,-0.899902,2.220703,1114609
1939,130880000,2466000,1387897,1078103,18.796875,10.601562,-0.099976,2.169922,1078103
...,...,...,...,...,...,...,...,...,...
2018,326838199,3791712,2839205,952507,11.601562,8.703125,2.400391,1.728516,952507
2019,328329953,3747540,2854858,892682,11.398438,8.703125,1.799805,1.706055,892682
2020,331501080,3613647,3383729,229918,10.898438,10.296875,8.898438,1.640625,229918
2021,331893745,3664292,3464231,200061,11.000000,10.398438,0.700195,1.664062,200061


## Adding another Dummy Column

In [49]:
# Create a column showing population growth for each year
d_data['Dummy_column'] = d_data["Crude_death_rate"] - d_data["Deaths"]
d_data[:5]

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Crude_migration_change,Total_fertility_rate,Pop_Growth,Dummy_column
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1935,127250000,2377000,1392752,984248,18.703125,10.898438,,2.189453,984248,-1392741.0
1936,128053000,2355000,1479228,875772,18.40625,11.5,-0.5,2.150391,875772,-1479216.0
1937,128825000,2413000,1450427,962573,18.703125,11.203125,-1.5,2.169922,962573,-1450416.0
1938,129825000,2496000,1381391,1114609,19.203125,10.601562,-0.899902,2.220703,1114609,-1381380.0
1939,130880000,2466000,1387897,1078103,18.796875,10.601562,-0.099976,2.169922,1078103,-1387886.0


## Delete Column

In [50]:
d_data.drop('Dummy_column', axis=1, inplace=True) # axis = 1 is for column, axis = 0 is for row
d_data

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Crude_migration_change,Total_fertility_rate,Pop_Growth
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1935,127250000,2377000,1392752,984248,18.703125,10.898438,,2.189453,984248
1936,128053000,2355000,1479228,875772,18.406250,11.500000,-0.500000,2.150391,875772
1937,128825000,2413000,1450427,962573,18.703125,11.203125,-1.500000,2.169922,962573
1938,129825000,2496000,1381391,1114609,19.203125,10.601562,-0.899902,2.220703,1114609
1939,130880000,2466000,1387897,1078103,18.796875,10.601562,-0.099976,2.169922,1078103
...,...,...,...,...,...,...,...,...,...
2018,326838199,3791712,2839205,952507,11.601562,8.703125,2.400391,1.728516,952507
2019,328329953,3747540,2854858,892682,11.398438,8.703125,1.799805,1.706055,892682
2020,331501080,3613647,3383729,229918,10.898438,10.296875,8.898438,1.640625,229918
2021,331893745,3664292,3464231,200061,11.000000,10.398438,0.700195,1.664062,200061


## Delete a Row

In [51]:
d_data.drop(1935, axis=0, inplace=True) # x=0 is for row, x=1 is for column
d_data[:10]

Unnamed: 0_level_0,Average_population,Live_births,Deaths,Natural_change,Crude_birth_rate,Crude_death_rate,Crude_migration_change,Total_fertility_rate,Pop_Growth
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1936,128053000,2355000,1479228,875772,18.40625,11.5,-0.5,2.150391,875772
1937,128825000,2413000,1450427,962573,18.703125,11.203125,-1.5,2.169922,962573
1938,129825000,2496000,1381391,1114609,19.203125,10.601562,-0.899902,2.220703,1114609
1939,130880000,2466000,1387897,1078103,18.796875,10.601562,-0.099976,2.169922,1078103
1940,131930000,2559000,1417269,1142000,19.40625,10.796875,-0.700195,2.300781,1141731
1941,133058000,2703000,1397642,1305358,20.296875,10.5,-1.299805,2.398438,1305358
1942,133752000,2989000,1385187,1603813,22.203125,10.296875,-6.800781,2.628906,1603813
1943,133971000,3104000,1459544,1644306,22.796875,10.703125,-10.703125,2.71875,1644456
1944,132622000,2939000,1411338,1644456,21.203125,10.203125,-22.59375,2.568359,1527662
1945,132137000,2858000,1401719,1456281,20.40625,11.0,-14.703125,2.490234,1456281


## List of Datatypes and value ranges

In [None]:
# int8 / uint8 : consumes 1 byte of memory, range between -128/127 or 0/255

# bool : consumes 1 byte, true or false

# float16 / int16 / uint16: consumes 2 bytes of memory, range between -32,768 and 32,767 or 0/65,535

# float32 / int32 / uint32 : consumes 4 bytes of memory, range between -2,147,483,648 and 2,147,483,647

# float64 / int64 / uint64: consumes 8 bytes of memory

In [52]:
# Confirming the number of columns
print(f"The number of columns are: {len(d_data.columns)}")
d_data.columns

The number of columns are: 9


Index(['Average_population', 'Live_births', 'Deaths', 'Natural_change',
       'Crude_birth_rate', 'Crude_death_rate', 'Crude_migration_change',
       'Total_fertility_rate', 'Pop_Growth'],
      dtype='object')

In [53]:
# Confirming that all data are the right format
    
d_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87 entries, 1936 to 2022
Data columns (total 9 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Average_population      87 non-null     int32  
 1   Live_births             87 non-null     int32  
 2   Deaths                  87 non-null     int32  
 3   Natural_change          87 non-null     int32  
 4   Crude_birth_rate        87 non-null     float16
 5   Crude_death_rate        87 non-null     float16
 6   Crude_migration_change  87 non-null     float16
 7   Total_fertility_rate    87 non-null     float16
 8   Pop_Growth              87 non-null     int32  
dtypes: float16(4), int32(5)
memory usage: 2.5 KB


In [54]:
# Exporting our gov_data dataframe as MS Excel File
try:
    d_data.to_excel("US_Population_data.xlsx")
except:
    print("Error: Unable to convert dataframe to a csv file")
else:
    print("The Excel file successfully created.")


The Excel file successfully created.


## Manipulating Data

In [55]:
c_data = pd.read_html("https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)")[2]
c_data

Unnamed: 0_level_0,Country/Territory,UN region,IMF[1][13],IMF[1][13],World Bank[14],World Bank[14],United Nations[15],United Nations[15]
Unnamed: 0_level_1,Country/Territory,UN region,Forecast,Year,Estimate,Year,Estimate,Year
0,World,—,104476432,2023,100562011,2022,96698005,2021
1,United States,Americas,26949643,2023,25462700,2022,23315081,2021
2,China,Asia,17700899,[n 1]2023,17963171,[n 3]2022,17734131,[n 1]2021
3,Germany,Europe,4429838,2023,4072192,2022,4259935,2021
4,Japan,Asia,4230862,2023,4231141,2022,4940878,2021
...,...,...,...,...,...,...,...,...
209,Palau,Oceania,267,2023,—,—,218,2021
210,Kiribati,Oceania,246,2023,223,2022,227,2021
211,Nauru,Oceania,150,2023,151,2022,155,2021
212,Montserrat,Americas,—,—,—,—,72,2021


In [56]:
for col in c_data.columns:
    print(col)

('Country/Territory', 'Country/Territory')
('UN region', 'UN region')
('IMF[1][13]', 'Forecast')
('IMF[1][13]', 'Year')
('World Bank[14]', 'Estimate')
('World Bank[14]', 'Year')
('United Nations[15]', 'Estimate')
('United Nations[15]', 'Year')


In [57]:
# Delete a level of a multilevel column name
c_data.columns = c_data.columns.droplevel()
c_data

Unnamed: 0,Country/Territory,UN region,Forecast,Year,Estimate,Year.1,Estimate.1,Year.2
0,World,—,104476432,2023,100562011,2022,96698005,2021
1,United States,Americas,26949643,2023,25462700,2022,23315081,2021
2,China,Asia,17700899,[n 1]2023,17963171,[n 3]2022,17734131,[n 1]2021
3,Germany,Europe,4429838,2023,4072192,2022,4259935,2021
4,Japan,Asia,4230862,2023,4231141,2022,4940878,2021
...,...,...,...,...,...,...,...,...
209,Palau,Oceania,267,2023,—,—,218,2021
210,Kiribati,Oceania,246,2023,223,2022,227,2021
211,Nauru,Oceania,150,2023,151,2022,155,2021
212,Montserrat,Americas,—,—,—,—,72,2021


In [58]:
# Keep only columns if they haven't used the same name prior
# This codes removes "Estimate" column in column 7 and "Year" columns in column 6 and 8
c_data = c_data.loc[:,~c_data.columns.duplicated()]
c_data

Unnamed: 0,Country/Territory,UN region,Forecast,Year,Estimate
0,World,—,104476432,2023,100562011
1,United States,Americas,26949643,2023,25462700
2,China,Asia,17700899,[n 1]2023,17963171
3,Germany,Europe,4429838,2023,4072192
4,Japan,Asia,4230862,2023,4231141
...,...,...,...,...,...
209,Palau,Oceania,267,2023,—
210,Kiribati,Oceania,246,2023,223
211,Nauru,Oceania,150,2023,151
212,Montserrat,Americas,—,—,—


In [59]:
# Counting how many rows with NaN values by taking only rows that don't contain NaNs
c_data.isna().sum()

Country/Territory    0
UN region            0
Forecast             0
Year                 0
Estimate             0
dtype: int64

In [60]:
c_data[:5]

Unnamed: 0,Country/Territory,UN region,Forecast,Year,Estimate
0,World,—,104476432,2023,100562011
1,United States,Americas,26949643,2023,25462700
2,China,Asia,17700899,[n 1]2023,17963171
3,Germany,Europe,4429838,2023,4072192
4,Japan,Asia,4230862,2023,4231141


In [61]:
# Checking that all the columns have valid data
c_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Country/Territory  214 non-null    object
 1   UN region          214 non-null    object
 2   Forecast           214 non-null    object
 3   Year               214 non-null    object
 4   Estimate           214 non-null    object
dtypes: object(5)
memory usage: 8.5+ KB


In [62]:
#Investigating the unique values for year column
unique_years = c_data.Year.unique()
print(unique_years)

['2023' '[n 1]2023' '[n 4]2023' '[n 5]2023' '[n 6]2023' '—' '[n 9]2023'
 '[n 10]2023' '[n 11]2023' '[n 12]2021' '[n 14]2023']


In [63]:
# You can also investigating the unique values for year column using set data structure
unique_years_v2 = set(c_data.Year)
print(unique_years_v2)

{'[n 6]2023', '[n 10]2023', '[n 14]2023', '—', '[n 11]2023', '2023', '[n 1]2023', '[n 9]2023', '[n 4]2023', '[n 12]2021', '[n 5]2023'}


In [64]:
# Remove []s and what is in them in Year column
# Removes brackets and what is inside for whole column
c_data.Year = c_data.Year.str.replace(r"\[.*\]","", regex=True)
unique_years_v2 = set(c_data.Year)
print(unique_years_v2)

{'2021', '2023', '—'}


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_data.Year = c_data.Year.str.replace(r"\[.*\]","", regex=True)


In [65]:
#Deleting rows in GDP column containing the special character (—)
c_data = c_data[~c_data.Year.str.contains("—", regex=False)]
unique_years_v2 = set(c_data.Year)
print(unique_years_v2)

{'2021', '2023'}


In [66]:
# Rename country column
c_data.rename(columns={"Country/Territory": "Country", "Estimate": "GDP"}, 
              inplace=True)
c_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_data.rename(columns={"Country/Territory": "Country", "Estimate": "GDP"},


Unnamed: 0,Country,UN region,Forecast,Year,GDP
0,World,—,104476432,2023,100562011
1,United States,Americas,26949643,2023,25462700
2,China,Asia,17700899,2023,17963171
3,Germany,Europe,4429838,2023,4072192
4,Japan,Asia,4230862,2023,4231141
...,...,...,...,...,...
208,Marshall Islands,Oceania,277,2023,280
209,Palau,Oceania,267,2023,—
210,Kiribati,Oceania,246,2023,223
211,Nauru,Oceania,150,2023,151


In [67]:
# You can also investigating the unique values for Country column using set data structure
unique_country_v2 = set(c_data.Country)
print(unique_country_v2)

{'Egypt', 'Indonesia', 'Turkey', 'Mozambique', 'Dominica', 'Cyprus', 'Greece', 'Switzerland', 'Kazakhstan', 'Moldova', 'Malta', 'Eswatini', 'Pakistan', 'Brunei', 'Gambia', 'Laos', 'Trinidad and Tobago', 'Aruba', 'Papua New Guinea', 'Australia', 'Dominican Republic', 'Belarus', 'Djibouti', 'Tanzania', 'South Sudan', 'Denmark', 'Uzbekistan', 'Latvia', 'Malawi', 'Saudi Arabia', 'Iceland', 'Hong Kong', 'Jordan', 'Honduras', 'Mongolia', 'Tunisia', 'Spain', 'Senegal', 'Comoros', 'Ecuador', 'South Korea', 'Kuwait', 'Mali', 'World', 'Panama', 'China', 'Puerto Rico', 'Ethiopia', 'Turkmenistan', 'Ireland', 'Kiribati', 'United Arab Emirates', 'Bahamas', 'Gabon', 'Paraguay', 'Slovakia', 'El Salvador', 'Bahrain', 'Micronesia', 'Iran', 'Saint Kitts and Nevis', 'Madagascar', 'Myanmar', 'San Marino', 'Zimbabwe', 'Nepal', 'Canada', 'Albania', 'Cape Verde', 'United States', 'Colombia', 'Slovenia', 'Bhutan', 'Iraq', 'Burkina Faso', 'Azerbaijan', 'Nauru', 'Suriname', 'Central African Republic', 'Kyrgyzsta

In [68]:
# Remove * in Country column
c_data.Country = c_data.Country.str.replace("*","",)
c_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_data.Country = c_data.Country.str.replace("*","")


Unnamed: 0,Country,UN region,Forecast,Year,GDP
0,World,—,104476432,2023,100562011
1,United States,Americas,26949643,2023,25462700
2,China,Asia,17700899,2023,17963171
3,Germany,Europe,4429838,2023,4072192
4,Japan,Asia,4230862,2023,4231141
...,...,...,...,...,...
208,Marshall Islands,Oceania,277,2023,280
209,Palau,Oceania,267,2023,—
210,Kiribati,Oceania,246,2023,223
211,Nauru,Oceania,150,2023,151


In [None]:
# Install the openpyxl package to handle excel files properly
!pip install openpyxl

In [69]:
# You can also investigating the unique values for Country column using set data structure
unique_gdp_v2 = set(c_data.GDP)
print(unique_gdp_v2)

{'17098', '688177', '2824', '58066', '2383', '15672', '242632', '949', '113420', '578604', '40828', '12607', '75709', '10492', '1108149', '579267', '76523', '70334', '8126', '3073', '3163', '1675419', '632770', '495341', '13563', '43069', '17963171', '237296', '14616', '807706', '547', '343939', '14421', '905988', '20678', '13313', '6096', '3515', '24528', '301262', '6190', '151', '405870', '113435', '95003', '100562011', '106714', '4854', '78721', '984', '13970', '251945', '18885', '522033', '280826', '114667', '3385090', '41722', '191913', '1397509', '134182', '290924', '247234', '991115', '11814', '2315', '21072', '178789', '29957', '2065', '2553', '12897', '832', '46665', '47452', '44391', '477386', '20254', '21228', '219066', '4943', '460201', '1920096', '20352', '223', '1634', '10375', '1256', '2240422', '359839', '59364', '30633', '2139840', '466789', '17851', '12704', '13165', '51662', '160503', '24605', '82275', '220623', '585939', '60', '280', '301025', '476748', '2010432', '

In [70]:
#Deleting rows in GDP column containing the special character (—)
c_data = c_data[~c_data.GDP.str.contains("—", regex=False)]
c_data

Unnamed: 0,Country,UN region,Forecast,Year,GDP
0,World,—,104476432,2023,100562011
1,United States,Americas,26949643,2023,25462700
2,China,Asia,17700899,2023,17963171
3,Germany,Europe,4429838,2023,4072192
4,Japan,Asia,4230862,2023,4231141
...,...,...,...,...,...
205,Micronesia,Oceania,458,2023,427
208,Marshall Islands,Oceania,277,2023,280
210,Kiribati,Oceania,246,2023,223
211,Nauru,Oceania,150,2023,151


In [71]:
# You can also investigating the unique values for Country column using set data structure
unique_un_region_v2 = set(c_data['UN region'])
print(unique_un_region_v2)

{'Europe', 'Africa', '—', 'Americas', 'Oceania', 'Asia'}


In [72]:
#Deleting rows in UN Region column containing the special character (—)
c_data = c_data[~c_data['UN region'].str.contains("—", regex=False)]
c_data

Unnamed: 0,Country,UN region,Forecast,Year,GDP
1,United States,Americas,26949643,2023,25462700
2,China,Asia,17700899,2023,17963171
3,Germany,Europe,4429838,2023,4072192
4,Japan,Asia,4230862,2023,4231141
5,India,Asia,3732224,2023,3385090
...,...,...,...,...,...
205,Micronesia,Oceania,458,2023,427
208,Marshall Islands,Oceania,277,2023,280
210,Kiribati,Oceania,246,2023,223
211,Nauru,Oceania,150,2023,151


In [73]:
unique_un_region_v2 = set(c_data['UN region'])
print(unique_un_region_v2)

{'Africa', 'Europe', 'Americas', 'Oceania', 'Asia'}


In [74]:
# Confirming the structure of our dataframe
c_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 181 entries, 1 to 213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Country    181 non-null    object
 1   UN region  181 non-null    object
 2   Forecast   181 non-null    object
 3   Year       181 non-null    object
 4   GDP        181 non-null    object
dtypes: object(5)
memory usage: 8.5+ KB


In [None]:
# Confirming the number of columns
print(f"The number of columns are: {len(c_data.columns)}")
c_data.columns

In [75]:
# Changing all the columns with the object data type to string
number_columns = ['Country', 'UN region']
for column in number_columns:
    c_data[column] = c_data[column].astype ("string")
    
number_columns = ['GDP', 'Year','Forecast']
for column in number_columns:
    c_data[column] = c_data[column].astype ("int32")
    
c_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 181 entries, 1 to 213
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Country    181 non-null    string
 1   UN region  181 non-null    string
 2   Forecast   181 non-null    int32 
 3   Year       181 non-null    int32 
 4   GDP        181 non-null    int32 
dtypes: int32(3), string(2)
memory usage: 6.4 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_data[column] = c_data[column].astype ("string")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_data[column] = c_data[column].astype ("int32")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  c_data[column] = c_data[column].astype ("int32")
A value is trying to be set on a copy of a slice from a D

In [76]:
# Groupby allows you to group rows based on a column and perform a function
# Mean GDP by region
c_data[["UN region","GDP"]].groupby("UN region").mean()

Unnamed: 0_level_0,GDP
UN region,Unnamed: 1_level_1
Africa,56500.192308
Americas,982007.647059
Asia,880241.761905
Europe,578681.195122
Oceania,163565.166667


In [77]:
# Groupby allows you to group rows based on a column and perform a function
# Mean GDP by region
c_data[["UN region","GDP"]].groupby("UN region").sum()

Unnamed: 0_level_0,GDP
UN region,Unnamed: 1_level_1
Africa,2938010
Americas,33388260
Asia,36970154
Europe,23725929
Oceania,1962782


In [78]:
# Median GDP by region
c_data[["UN region","GDP"]].groupby("UN region").median()

Unnamed: 0_level_0,GDP
UN region,Unnamed: 1_level_1
Africa,17626.5
Americas,37105.5
Asia,202590.5
Europe,178789.0
Oceania,908.0


In [79]:
# Taking a final look at our c_data dataframe
c_data[:5]

Unnamed: 0,Country,UN region,Forecast,Year,GDP
1,United States,Americas,26949643,2023,25462700
2,China,Asia,17700899,2023,17963171
3,Germany,Europe,4429838,2023,4072192
4,Japan,Asia,4230862,2023,4231141
5,India,Asia,3732224,2023,3385090


In [80]:
# Exporting our c_data dataframe as MS Excel File
filename = "Country_GDP.xlsx"
try:
    c_data.to_excel(filename)
except:
    print("Error: Unable to convert dataframe to a csv file")
else:
    print(f"The {filename} file successfully created.")


The Country_GDP.xlsx file successfully created.


## Dealing with "HTTP Error 403: Forbidden" with Pandas

In [81]:
investment_data = pd.read_html("https://www.investing.com/earnings-calendar/")[1]
investment_data

HTTPError: HTTP Error 403: Forbidden

In [None]:
# !pip install html5lib

In [82]:
## Let's try to read a secure site
import requests
import pandas as pd
url_add = "https://www.investing.com/earnings-calendar/"
response = requests.get(url_add)

if response.status_code == 200:
    df = pd.read_html(response.content)[0]
    print(df.head())
else:
    print(f'Request failed: HTTP Error 403: Forbidden')

          Unnamed: 0_level_0                             Company  \
          Unnamed: 0_level_1                             Company   
0  Monday, November 27, 2023           Monday, November 27, 2023   
1                        NaN  British American Tobacco ADR (BTI)   
2                        NaN              Johnson Controls (JCI)   
3                        NaN         Prudential Public ADR (PUK)   
4                        NaN              Lonza Group AG (LZAGY)   

                         EPS                 / Forecast  \
                         EPS                 / Forecast   
0  Monday, November 27, 2023  Monday, November 27, 2023   
1                         --                       / --   
2                        1.1                     / 1.09   
3                         --                       / --   
4                         --                   / 0.9164   

                     Revenue                 / Forecast  \
                     Revenue               / Forec

In [83]:
df

Unnamed: 0_level_0,Unnamed: 0_level_0,Company,EPS,/ Forecast,Revenue,/ Forecast,Market Cap,Time,Unnamed: 8_level_0
Unnamed: 0_level_1,Unnamed: 0_level_1.1,Company,EPS,/ Forecast,Revenue,/ Forecast.1,Market Cap,Time,Unnamed: 8_level_1
0,"Monday, November 27, 2023","Monday, November 27, 2023","Monday, November 27, 2023","Monday, November 27, 2023","Monday, November 27, 2023","Monday, November 27, 2023","Monday, November 27, 2023","Monday, November 27, 2023","Monday, November 27, 2023"
1,,British American Tobacco ADR (BTI),--,/ --,--,/ --,71.85B,,
2,,Johnson Controls (JCI),1.1,/ 1.09,7.11B,/ 7.1B,35.65B,,
3,,Prudential Public ADR (PUK),--,/ --,--,/ --,31.85B,,
4,,Lonza Group AG (LZAGY),--,/ 0.9164,--,/ 3.24B,29.49B,,
...,...,...,...,...,...,...,...,...,...
287,,EnSync (ESNC),--,/ --,--,/ --,68,,
288,,Acura Pharma (ACUR),--,/ --,--,/ --,66,,
289,,Great Panther Mining (GPLDF),--,/ --,--,/ --,47,,
290,,Anthera Pharma (ANTH),--,/ --,--,/ --,26,,


### Solving "HTTP Error 403: Forbidden" with request module

In [84]:
import pandas as pd
import requests

url = 'https://tradingeconomics.com/currencies'
header = {
  "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36",
  "X-Requested-With": "XMLHttpRequest"
}

content = requests.get(url, headers=header)
secured_data = pd.read_html(content.text)
secured_data[0]

  secured_data = pd.read_html(content.text)


Unnamed: 0.1,Unnamed: 0,Major,Price,Day,%,Weekly,Monthly,YoY,Date
0,,EURUSD,1.09333,0.0029,0.27%,0.24%,3.51%,5.18%,Nov/24
1,,GBPUSD,1.25939,0.0061,0.49%,1.07%,4.01%,4.14%,Nov/24
2,,AUDUSD,0.65838,0.00268,0.41%,1.16%,4.37%,-2.45%,Nov/24
3,,NZDUSD,0.60798,0.0033,0.54%,1.53%,4.79%,-2.63%,Nov/24
4,,USDJPY,149.432,0.128,-0.09%,-0.13%,-0.52%,7.43%,Nov/24
5,,USDCNY,7.14303,0.0058,-0.08%,-1.03%,-2.53%,-0.73%,Nov/24
6,,USDCHF,0.88099,0.0032,-0.36%,-0.51%,-1.75%,-6.80%,Nov/24
7,,USDCAD,1.36238,0.0068,-0.50%,-0.69%,-1.24%,1.85%,Nov/24
8,,USDMXN,17.1008,0.0782,-0.46%,-0.61%,-6.60%,-11.53%,Nov/24
9,,USDINR,83.34,0.035,0.04%,0.12%,0.24%,2.03%,Nov/24


### Pandas authorization by user and password

In [None]:
import requests
import pandas as pd

url = 'https://en-gb.facebook.com/login.php'
username = 'dr_sam@yahoo.com'
password = 'XXueyjdhhernf'

response = requests.get(url, auth=(username, password))

if response.status_code == 200:
    df = pd.read_html(response.content)[0]
    print(df.head())
else:
    print(f'Request failed')