In [None]:
!pip install pandas

In [None]:
import requests, json
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Specify the format of the data requested from the API.
# "Accept" is the header that tells the server which response format we want.
# HTTP header names are case-insensitive, so the previous "Content-type" /
# "Content-Type" pair named the same header twice with conflicting values;
# only one entry is kept here.
headers = {
    "Accept": "application/json",
    # NOTE(review): a GET request carries no body, so this header is probably
    # unnecessary — confirm against the API and drop it if so.
    "Content-Type": "application/json;charset=UTF-8",
}

In [None]:
# Make the request.
# A timeout is passed explicitly: without one, requests waits indefinitely and a
# stalled connection would hang the notebook.
request = requests.get(
    "https://global-warming.org/api/co2-api",
    headers=headers,
    timeout=30,
)
# Fail here with a clear HTTP error instead of later, while parsing an error page as JSON.
request.raise_for_status()

In [None]:
# Parse the body of the API response as JSON and assign the result to a variable called 'data'.
# (`request` is the object returned by the requests.get call in the previous cell.)
data = request.json()

In [None]:
# Confirm the data has the expected type: a dict whose "co2" key holds the records
# that the cells below turn into a dataframe. Asserting (instead of only printing)
# stops the run early if the API returned something unexpected.
assert isinstance(data, dict) and "co2" in data, f"unexpected API payload: {type(data)}"
type(data)

In [None]:
# Build a dataframe straight from the parsed JSON: each top-level key of `data`
# becomes a column.
unformatted_df = pd.DataFrame(data)

In [None]:
# Verify the output: preview the first rows rather than dumping the whole frame.
unformatted_df.head()

# Begin data cleaning
"""
Currently the dataframe exists in the following format:

                                                    co2
0     {'year': '2011', 'month': '1', 'day': '1', 'cy...
1     {'year': '2011', 'month': '1', 'day': '2', 'cy...
2     {'year': '2011', 'month': '1', 'day': '3', 'cy...
3     {'year': '2011', 'month': '1', 'day': '4', 'cy...
4     {'year': '2011', 'month': '1', 'day': '5', 'cy...

This is not optimal: I want to rearrange the data so that the keys of each dictionary/JSON
object become dataframe columns holding their associated values.
"""

In [None]:
# Rearrange the data: expand the dictionaries stored in the 'co2' column so that
# each dictionary key becomes its own column.
# The source frame is `unformatted_df`; `df` does not exist before this cell, so
# reading from `df` here raised a NameError on a fresh kernel.
df = pd.DataFrame(unformatted_df['co2'].tolist())

In [None]:
# Preview the first rows of the rearranged frame rather than printing all of it.
df.head()

In [None]:
# Check the dataframe for any null or duplicate values

# Rows that contain at least one missing value.
# (Indexing with the full boolean frame, df[df.isnull()], does not filter rows:
# it returns a frame of the same shape with the non-null cells replaced by NaN.)
rows_null_data = df[df.isnull().any(axis=1)]

# Number of null values in the 'trend' column.
sum_rows_null_data = df['trend'].isnull().sum()

# Number of rows that are exact duplicates of an earlier row.
duplicate_row_count = df.duplicated().sum()

# True if the 'trend' column contains at least one NaN.
nan_column = df['trend'].isnull().values.any()
print(nan_column)

# Data cleaning example
"""
The global warming dataset I have chosen has already been cleaned and appears to be in good condition.
To fulfil the milestone requirements of the project, I will demonstrate some data cleansing
best practices on a dataset that is better suited to that purpose:

Dataset: Iris Species

Classify iris plants into three species in this classic dataset

https://www.kaggle.com/uciml/iris
"""

In [None]:
# This data is from a CSV import; the file can be found in the data folder of the project repo.
# pd.read_csv already returns a DataFrame, so no extra pd.DataFrame(...) wrapping is needed.
iris_species = pd.read_csv("data/iris.csv")
# Second name kept because the following cells refer to the frame as `iris_species_dataframe`.
iris_species_dataframe = iris_species

In [None]:
# Preview the first rows of the iris data rather than printing the whole frame.
iris_species_dataframe.head()

In [None]:
# Count how many rows fall under each distinct value of the "Species" column
# (one count per species) and print the resulting tally.
distinct_species = iris_species_dataframe.loc[:, "Species"].value_counts()
print(distinct_species)

In [None]:
# Visually represent this breakdown on a bar chart.
# The species labels and bar heights come from `distinct_species` (the value_counts
# result computed in the previous cell); the earlier bare names setosa / versicolor /
# virginica were undefined and the heights [1, 2, 3] were not the real counts.
fig, ax = plt.subplots()
bar_chart = ax.bar(distinct_species.index, distinct_species.values)
ax.set_title("Iris samples per species")
ax.set_xlabel("Species")
ax.set_ylabel("Number of samples")
plt.show()