
# Setup


## Download the required libraries

In [None]:
# sys.executable is the path of the Python interpreter running this kernel.
import sys

print( sys.executable )

# Invoke pip through that same interpreter (rather than a bare `pip`) so the
# packages are installed for the interpreter this notebook is running on.
!"{sys.executable}" -u -m pip install numpy
!"{sys.executable}" -u -m pip install pandas
!"{sys.executable}" -u -m pip install matplotlib
!"{sys.executable}" -u -m pip install tabulate



## Import the required libraries


In [None]:
import numpy as numpy
import pandas as pandas
import matplotlib.pyplot as plt

from IPython.display import HTML, display
import tabulate



## Define Useful Function

In [None]:

def displayAsTable( lst: list[list] ) -> None:
    """Render a list of rows as an HTML table in the notebook output.

    Args:
        lst: The rows of the table; each inner list holds the cells of one row.
    """
    # Format the argument itself. Reading a global named `table` here would
    # raise NameError (or show stale data) whenever the caller's variable has
    # a different name.
    display(
        HTML(
            tabulate.tabulate(lst, tablefmt='html')
        )
    )



# Exploratory Data Analysis


## Load the datasets and combine them into one

These data sets are split across four separate files because the USDA website only lets us download a maximum of 50,000 rows per file.

In [None]:

# Load the four CSV exports, each covering a consecutive range of years.
df_1 = pandas.read_csv(r'data/CropYields-1909-1939.csv')
df_2 = pandas.read_csv(r'data/CropYields-1940-1969.csv')
df_3 = pandas.read_csv(r'data/CropYields-1970-1989.csv')
df_4 = pandas.read_csv(r'data/CropYields-1990-2007.csv')

# Stack the four frames into one. ignore_index=True gives the result a fresh
# 0..n-1 index; without it every file keeps its own 0-based labels, the labels
# repeat across files, and any label-based operation (e.g. df.drop(index))
# would affect rows from several files at once.
df = pandas.concat([df_1, df_2, df_3, df_4], ignore_index=True)




## We take a look at the column names and the first couple of rows


In [None]:
# Show the first rows of the combined frame (column names plus sample values).
df.head()


We can see that there seems to be meta-data about this data set, as well as some missing data.

We therefore choose to look at the unique values in each column.



## We take a look at the unique values in each column


### Global Analysis of Columns

In [None]:

# Print the distinct values of every column, with a ruler line after each one
# so the columns are easy to tell apart in the output.
separator = "-------------------------------------------------------------------------------------------"
for col in df.columns:
    distinct_values = df[col].unique()
    print(col + ": ", distinct_values)
    print(separator)



As can be seen, some columns only hold meta-data about this dataset. These columns are: Program, Period, Geo Level, Commodity, Data Item, Domain, and Domain Category.

We propose to drop them.


In [None]:
# Columns that describe the dataset as a whole rather than individual records.
metadata_columns = [
    'Program',
    'Period',
    'Geo Level',
    'Commodity',
    'Data Item',
    'Domain',
    'Domain Category',
]
df = df.drop(columns=metadata_columns)


Also, it is not clear what the CV (%) column represents, and the data within it is all NaN.

Therefore, we also choose to drop this column.


In [None]:
# Remove the 'CV (%)' column from the frame.
df = df.drop(['CV (%)'], axis='columns')

We can now start to take a look at each column in more detail.

In [None]:
# Show the first rows again, now that the unneeded columns have been dropped.
df.head()


### The Year Column


In [None]:
# Keep a handle on the "Year" column for the analysis in the following cells.
years = df["Year"]

#### Missing Years

In [None]:

# Distinct years present in the data, in ascending order.
unique = numpy.sort(years.unique())

min_year = unique[0]
max_year = unique[-1]

# Both endpoints belong to the range, so it spans (max - min + 1) calendar
# years. Using the plain difference would under-count by one when compared
# with len(unique), which counts both endpoints.
span_years = max_year - min_year + 1

print( f"The number of years between {min_year} and {max_year} (inclusive) is: {span_years}" )
print( f"The number of years recorded in the dataset is: {len(unique)}" )
print( f"The number of missing years is: {span_years - len(unique)}" )


As we can see, some years seem to be missing from the dataset: fewer years are recorded than the range spans.

We would like to see which stretches of time correspond to this missing data.

In [None]:

# Build one row per gap in the sorted list of recorded years:
# ["<first missing>-<last missing>", <number of missing years in the gap>].
table = []
for previous_year, next_year in zip(unique[:-1], unique[1:]):
    if next_year - previous_year > 1:
        first_missing = previous_year + 1
        last_missing = next_year - 1
        # The label is an inclusive range, so its length is last - first + 1
        # (a single missing year must count as 1, not 0).
        table.append( [f"{first_missing}-{last_missing}", last_missing - first_missing + 1] )

print("The missing years are:")
displayAsTable(table)


We can therefore confirm that all of the missing years occur between 1909 and 1918.

We can also notice that all of the missing years occur at the beginning of the records, with only one year (1909) preceding them.

Furthermore, these missing dates are quite old.

Therefore, we can assume that removing the 1909 date from the record is safe, and that it will not affect the trends observed.

In this way we will have a continuous stretch of time.

However, for now we will keep it as we could maybe make use of it later.

#### Distribution of Year Records

We would now like to see how many records exist for each year.

In [None]:

# Tally the records per year: numpy.unique returns the distinct years and,
# with return_counts=True, how often each one occurs.
counts_numpy = numpy.unique(years, return_counts=True)
counts = [part.tolist() for part in counts_numpy]
year_labels, records_per_year = counts

# 10 x 7 inch canvas for the bar chart.
fig = plt.figure(figsize=(10, 7))

plt.bar(year_labels, records_per_year)
plt.title("Number of Records per Year")
plt.xlabel("years")
plt.ylabel("number of records")

# Render the figure.
plt.show()


As we can see, the number of records per year is not evenly distributed.

The further back you go in time, the fewer records are available.

However, that is not to say that older records are sparse.

Even in the 1920s we can see an average of 500 records per year which is not necessarily that far off from the maximum which is around 2500.

And not that far off from the mean number of records per year:

In [None]:
# Summary statistics of the per-year record counts. numpy.std returns the
# standard deviation (not the variance), so the value is labelled accordingly.
print("mean = ", numpy.mean(counts_numpy[1]), "standard deviation = ", numpy.std(counts_numpy[1]))

However, we now have the full justification to get rid of the data points that have a year of 1909, as very few records are associated with this year compared to other years.

This is in addition to the problems with this year presented previously.

In [None]:

# Keep every record whose year is not 1909.
# A boolean mask selects rows by position. Dropping by index label instead is
# unsafe here: the frame was concatenated from several files, so index labels
# can repeat, and df.drop(labels) would then also remove rows from other years
# that happen to share a label with a 1909 row.
df = df[df["Year"] != 1909].copy()


## State and State ANSI

We now look at the State and State ANSI columns.