Este notebook está destinado al estudio de los cultivos en EEUU. 
Para la descarga de los datos se ha utilizado una dirección API proporcionada por la organización USDA (United States Department of Agriculture).
Los siguientes enlaces disponibles permiten entrar en el repositorio de la API y también a una interfaz en la web de USDA la cual permite visualizar de manera más sencilla los datos disponibles:
- Fuente API: https://www.robertdinterman.com/usdarnass/articles/usdarnass
- Interfaz: https://quickstats.nass.usda.gov/#60CB39DB-E74C-3619-8851-E0EAD520AACA

In [1]:
#--BASE--#
import pandas as pd
import seaborn as sns
import warnings
import numpy as np
import datetime
import os

#--VISUALIZACIÓN--#
import matplotlib as mpl
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff

#--CONFIGURACIÓN--#
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:.2f}'.format
warnings.filterwarnings('ignore')

Para la descarga de los datos a través de la API, lo habitual ha sido hacer una petición por cada año de estudio. La API permite descargar un máximo de 50.000 valores por petición; en algunos años este límite se supera, por lo que se ha hecho una descarga para cada año y estado, que a continuación concatenamos en un solo DataFrame.

#### 2002

In [2]:
# Merge every per-state CSV downloaded for 2002 into a single yearly file.
path = '../DATOS/API_superficie_cultivada/2002/'
# Match on the extension (a substring test would also accept e.g. 'x.csv.bak');
# sorting makes the row order independent of the directory listing order.
files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith('.csv')]
df = pd.concat(map(pd.read_csv, files))
df.to_csv('../DATOS/API_superficie_cultivada/TOTAL/2002.csv', index=False)

#### 2007

In [3]:
# Merge every per-state CSV downloaded for 2007 into a single yearly file.
path = '../DATOS/API_superficie_cultivada/2007/'
# Match on the extension (a substring test would also accept e.g. 'x.csv.bak');
# sorting makes the row order independent of the directory listing order.
files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith('.csv')]
df = pd.concat(map(pd.read_csv, files))
df.to_csv('../DATOS/API_superficie_cultivada/TOTAL/2007.csv', index=False)


#### 2009

In [4]:
# Merge every per-state CSV downloaded for 2009 into a single yearly file.
path = '../DATOS/API_superficie_cultivada/2009/'
# Match on the extension (a substring test would also accept e.g. 'x.csv.bak');
# sorting makes the row order independent of the directory listing order.
files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith('.csv')]
df = pd.concat(map(pd.read_csv, files))
df.to_csv('../DATOS/API_superficie_cultivada/TOTAL/2009.csv', index=False)

#### 2012

In [5]:
# Merge every per-state CSV downloaded for 2012 into a single yearly file.
path = '../DATOS/API_superficie_cultivada/2012/'
# Match on the extension (a substring test would also accept e.g. 'x.csv.bak');
# sorting makes the row order independent of the directory listing order.
files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith('.csv')]
df = pd.concat(map(pd.read_csv, files))
df.to_csv('../DATOS/API_superficie_cultivada/TOTAL/2012.csv', index=False)


#### 2014

In [6]:
# Merge every per-state CSV downloaded for 2014 into a single yearly file.
path = '../DATOS/API_superficie_cultivada/2014/'
# Match on the extension (a substring test would also accept e.g. 'x.csv.bak');
# sorting makes the row order independent of the directory listing order.
files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith('.csv')]
df = pd.concat(map(pd.read_csv, files))
df.to_csv('../DATOS/API_superficie_cultivada/TOTAL/2014.csv', index=False)

#### 2019

In [7]:
# Merge every per-state CSV downloaded for 2019 into a single yearly file.
path = '../DATOS/API_superficie_cultivada/2019/'
# Match on the extension (a substring test would also accept e.g. 'x.csv.bak');
# sorting makes the row order independent of the directory listing order.
files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith('.csv')]
df = pd.concat(map(pd.read_csv, files))
df.to_csv('../DATOS/API_superficie_cultivada/TOTAL/2019.csv', index=False)

### Total

In [8]:
# Load every yearly CSV from the TOTAL folder into one DataFrame.
path = '../DATOS/API_superficie_cultivada/TOTAL/'
# Match on the extension (a substring test would also accept e.g. 'x.csv.bak');
# sorting makes the row order independent of the directory listing order.
files = [os.path.join(path, file) for file in sorted(os.listdir(path)) if file.endswith('.csv')]
df = pd.concat(map(pd.read_csv, files))
# NOTE(review): each file keeps its own row numbering, so index labels repeat
# across files after the concat.
df.drop(columns='Unnamed: 0', inplace=True)

In [9]:
df.isnull().sum()

source_desc                    0
sector_desc                    0
group_desc                     0
commodity_desc                 0
class_desc                     0
prodn_practice_desc            0
util_practice_desc             0
statisticcat_desc              0
unit_desc                      0
short_desc                     0
domain_desc                    0
domaincat_desc                 0
agg_level_desc                 0
state_ansi                  6368
state_fips_code                0
state_alpha                    0
state_name                     0
asd_code                 2256586
asd_desc                 2256586
county_ansi              2256586
county_code              2256586
county_name              2256586
region_desc              2256586
zip_5                    2256586
watershed_code                 0
watershed_desc           2256586
congr_district_code      2256586
country_code                   0
country_name                   0
location_desc                  0
year      

In [10]:
df.head()

Unnamed: 0,source_desc,sector_desc,group_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,agg_level_desc,state_ansi,state_fips_code,state_alpha,state_name,asd_code,asd_desc,county_ansi,county_code,county_name,region_desc,zip_5,watershed_code,watershed_desc,congr_district_code,country_code,country_name,location_desc,year,freq_desc,begin_code,end_code,reference_period_desc,week_ending,load_time,Value,CV (%)
0,SURVEY,CROPS,FIELD CROPS,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,BARLEY - ACRES HARVESTED,TOTAL,NOT SPECIFIED,STATE,4.0,4,AZ,ARIZONA,,,,,,,,0,,,9000,UNITED STATES,ARIZONA,1950,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00.000,157000,
1,SURVEY,CROPS,FIELD CROPS,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,BARLEY - ACRES HARVESTED,TOTAL,NOT SPECIFIED,STATE,5.0,5,AR,ARKANSAS,,,,,,,,0,,,9000,UNITED STATES,ARKANSAS,1950,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00.000,4000,
2,SURVEY,CROPS,FIELD CROPS,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,BARLEY - ACRES HARVESTED,TOTAL,NOT SPECIFIED,STATE,6.0,6,CA,CALIFORNIA,,,,,,,,0,,,9000,UNITED STATES,CALIFORNIA,1950,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00.000,1765000,
3,SURVEY,CROPS,FIELD CROPS,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,BARLEY - ACRES HARVESTED,TOTAL,NOT SPECIFIED,STATE,8.0,8,CO,COLORADO,,,,,,,,0,,,9000,UNITED STATES,COLORADO,1950,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00.000,489000,
4,SURVEY,CROPS,FIELD CROPS,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,BARLEY - ACRES HARVESTED,TOTAL,NOT SPECIFIED,STATE,10.0,10,DE,DELAWARE,,,,,,,,0,,,9000,UNITED STATES,DELAWARE,1950,ANNUAL,0,0,YEAR,,2012-01-01 00:00:00.000,12000,


Analizamos las columnas que nos proporciona la API y finalmente se seleccionan las que son de nuestro interés. El resto de variables se eliminan, ya que son columnas que únicamente contienen valores nulos o información adicional sobre el resto de columnas seleccionadas. A continuación se muestra la selección de variables. 

In [11]:
# Keep only the descriptive columns used in the rest of the analysis.
columns_of_interest = [
    'source_desc',
    'commodity_desc',
    'class_desc',
    'prodn_practice_desc',
    'util_practice_desc',
    'statisticcat_desc',
    'unit_desc',
    'short_desc',
    'domain_desc',
    'domaincat_desc',
    'state_name',
    'year',
    'Value',
]
df = df[columns_of_interest]

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2256586 entries, 0 to 49527
Data columns (total 13 columns):
 #   Column               Dtype 
---  ------               ----- 
 0   source_desc          object
 1   commodity_desc       object
 2   class_desc           object
 3   prodn_practice_desc  object
 4   util_practice_desc   object
 5   statisticcat_desc    object
 6   unit_desc            object
 7   short_desc           object
 8   domain_desc          object
 9   domaincat_desc       object
 10  state_name           object
 11  year                 int64 
 12  Value                object
dtypes: int64(1), object(12)
memory usage: 241.0+ MB


En la columna 'unit_desc' aparecen un gran número de variables las cuales definen diferentes unidades de medida, para este caso filtramos por "ACRES"

In [13]:
# Keep only the rows whose unit of measure is acres.
is_acres = df['unit_desc'].eq('ACRES')
df = df[is_acres]

In [14]:
# Años disponbies para el estudio con superficies de cultivo

df['year'].drop_duplicates().sort_values(ascending=True)

0      1950
0      1951
0      1952
0      1953
0      1954
       ... 
100    2016
0      2018
7      2019
50     2020
4      2021
Name: year, Length: 71, dtype: int64

In [15]:
# Exploring the dataset shows non-numeric placeholder codes in the 'Value'
# column; every row holding one of them is removed.
placeholder_codes = ['(D)', '(Z)', '(X)', '(NA)', '(O)', '(1)']
df = df[~df['Value'].isin(placeholder_codes)]

In [16]:
# Strip the thousands separators from 'Value' so the text can be parsed as float.
value_as_text = df["Value"].astype(str).str.replace(",", "", regex=False)
df["Value"] = value_as_text.astype(float)

In [17]:
# Make sure 'Value' is stored with float dtype.
# NOTE(review): looks redundant when the previous cell has already produced floats.
df = df.astype({'Value': float})

In [18]:
# Yearly total of 'Value' over all rows, before any de-duplication of categories.
fig = px.line(
    pd.DataFrame(df.groupby('year')['Value'].sum()),
    title='Evolución histórica de la superfice de cultivo'
    )
# Values are still expressed in acres at this point (the conversion to hectares
# happens in a later cell), so the axis must not be labelled as hectares.
fig.update_yaxes(title_text='Acres')
fig.update_xaxes(title_text='')
fig.show()

In [19]:
# Number of SURVEY rows available for each year.
x = df.loc[df['source_desc'].eq('SURVEY')]
x.groupby('year')['year'].count()

year
1950    1728
1951    1723
1952    1710
1953    1718
1954    1767
        ... 
2016    2376
2018    2359
2019    1948
2020    1893
2021    2052
Name: year, Length: 71, dtype: int64

In [20]:
# Number of CENSUS rows available for each year.
y = df.loc[df['source_desc'].eq('CENSUS')]
y.groupby('year')['year'].count()

year
1997    11199
2002    27008
2007    30786
2008     3219
2009      750
2011     1188
2012    90432
2013     6148
2014     4364
2015     1679
2016     1648
2018     5221
2019     2677
Name: year, dtype: int64

Los picos que aparecen en la gráfica anterior corresponden a los datos de los censos, que aparecen cada cinco años; estos se eliminan de la serie.

In [21]:
# Keep survey rows only; census rows are discarded.
is_survey = df['source_desc'].eq('SURVEY')
df = df[is_survey]

En la columna "statisticcat_desc" no se sabe si las variables que aparecen se complementan entre sí o si hacen referencia a las mismas unidades de superficie. En las siguientes celdas se estudian estas variables.
Finalmente se encuentra que:

- La variable "TOTAL AREA" se superpone con la variable "AREA BEARING". Se llega a esta conclusión ya que, como se ve en el ejemplo siguiente, los valores de superficie de melocotón son muy similares entre ambas variables. Finalmente se decide eliminar las filas con la variable "TOTAL AREA", ya que solo aparece en dos ocasiones (las representadas en el ejemplo).

In [22]:
df[df['statisticcat_desc'] == 'TOTAL AREA'].drop_duplicates()

Unnamed: 0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
29564,SURVEY,PEACHES,FREESTONE,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,TOTAL AREA,ACRES,"PEACHES, FREESTONE - ACRES TOTAL",TOTAL,NOT SPECIFIED,CALIFORNIA,2018,21600.0
5852,SURVEY,PEACHES,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,TOTAL AREA,ACRES,PEACHES - ACRES TOTAL,TOTAL,NOT SPECIFIED,NEW JERSEY,2020,4000.0


In [23]:
df[
    (df['commodity_desc'] == 'PEACHES') & 
    (df['state_name'] == 'CALIFORNIA') & 
    (df['class_desc'] == 'FREESTONE') & 
    (df['year'] == 2018)
    ]

Unnamed: 0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
29562,SURVEY,PEACHES,FREESTONE,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA BEARING,ACRES,"PEACHES, FREESTONE - ACRES BEARING",TOTAL,NOT SPECIFIED,CALIFORNIA,2018,20000.0
29564,SURVEY,PEACHES,FREESTONE,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,TOTAL AREA,ACRES,"PEACHES, FREESTONE - ACRES TOTAL",TOTAL,NOT SPECIFIED,CALIFORNIA,2018,21600.0


In [24]:
df[
    (df['commodity_desc'] == 'PEACHES') & 
    (df['state_name'] == 'NEW JERSEY') & 
    (df['year'] == 2020)
    ]

Unnamed: 0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
5839,SURVEY,PEACHES,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA BEARING,ACRES,PEACHES - ACRES BEARING,TOTAL,NOT SPECIFIED,NEW JERSEY,2020,3800.0
5852,SURVEY,PEACHES,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,TOTAL AREA,ACRES,PEACHES - ACRES TOTAL,TOTAL,NOT SPECIFIED,NEW JERSEY,2020,4000.0


In [25]:
# Remove the 'TOTAL AREA' rows.
is_total_area = df['statisticcat_desc'].eq('TOTAL AREA')
df = df[~is_total_area]

Otras variables dentro de la columna "statisticcat_desc" ("Category") que no son complementarias, sino que suman las mismas unidades de superficie, son:
- "AREA PLANTED" y "AREA HARVESTED" --> En este caso se decide quedarse con la variable "AREA PLANTED", ya que tiene una mayor representación sobre la superficie cultivada.

In [26]:
# Ejemplo:
df[
    (df['commodity_desc'] == 'COTTON') & 
    (df['state_name'] == 'ALABAMA') & 
    (df['year'] == 1951)
    ]

Unnamed: 0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
749,SURVEY,COTTON,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,COTTON - ACRES HARVESTED,TOTAL,NOT SPECIFIED,ALABAMA,1951,1490000.0
769,SURVEY,COTTON,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,COTTON - ACRES PLANTED,TOTAL,NOT SPECIFIED,ALABAMA,1951,1530000.0
885,SURVEY,COTTON,UPLAND,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA HARVESTED,ACRES,"COTTON, UPLAND - ACRES HARVESTED",TOTAL,NOT SPECIFIED,ALABAMA,1951,1490000.0
896,SURVEY,COTTON,UPLAND,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"COTTON, UPLAND - ACRES PLANTED",TOTAL,NOT SPECIFIED,ALABAMA,1951,1499000.0


In [27]:
# Remove the 'AREA HARVESTED' rows.
is_harvested = df['statisticcat_desc'].eq('AREA HARVESTED')
df = df[~is_harvested]

En el siguiente caso se estudia la variable "AREA IN PRODUCTION" y se observa que solo aparece para los productos de floricultura. A partir de esta variable se observa que los productos provenientes de floricultura aparecen repetidos, debido a que en la columna "domain_desc" ("Operation") aparecen las variables "SALES OF FLORICULTURE" y "TOTAL" para un mismo producto. Se decide eliminar la variable "SALES OF FLORICULTURE".

In [28]:
#Ejemplo:
df[
    (df['commodity_desc'] == 'FLORICULTURE TOTALS') & 
    (df['state_name'] == 'CONNECTICUT') & 
    (df['year'] == 2021)
    ]

Unnamed: 0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
35573,SURVEY,FLORICULTURE TOTALS,ALL CLASSES,IN THE OPEN,ALL UTILIZATION PRACTICES,AREA IN PRODUCTION,ACRES,"FLORICULTURE TOTALS, IN THE OPEN - ACRES IN PR...",SALES OF FLORICULTURE,"SALES OF FLORICULTURE: (100,000 OR MORE $)",CONNECTICUT,2021,499.0
35602,SURVEY,FLORICULTURE TOTALS,ALL CLASSES,IN THE OPEN,ALL UTILIZATION PRACTICES,AREA IN PRODUCTION,ACRES,"FLORICULTURE TOTALS, IN THE OPEN - ACRES IN PR...",TOTAL,NOT SPECIFIED,CONNECTICUT,2021,724.0


In [29]:
# Remove the rows whose domain is 'SALES OF FLORICULTURE'.
is_floriculture_sales = df['domain_desc'].eq('SALES OF FLORICULTURE')
df = df[~is_floriculture_sales]

También se elimina la variable "AREA PLANTED, NET", ya que describe lo mismo que la variable "AREA PLANTED" y únicamente aparece durante un rango de años concreto.

In [30]:
#Ejemplo:
df[
    (df['commodity_desc'] == 'SORGHUM') & 
    (df['state_name'] == 'ARIZONA') & 
    (df['year'] == 1972)
    ]

Unnamed: 0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
2434,SURVEY,SORGHUM,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,SORGHUM - ACRES PLANTED,TOTAL,NOT SPECIFIED,ARIZONA,1972,120000.0
2480,SURVEY,SORGHUM,ALL CLASSES,ALL PRODUCTION PRACTICES,GRAIN,"AREA PLANTED, NET",ACRES,"SORGHUM, GRAIN - ACRES PLANTED, NET",TOTAL,NOT SPECIFIED,ARIZONA,1972,109000.0
2636,SURVEY,SORGHUM,ALL CLASSES,IRRIGATED,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,"SORGHUM, IRRIGATED - ACRES PLANTED",TOTAL,NOT SPECIFIED,ARIZONA,1972,120000.0
2646,SURVEY,SORGHUM,ALL CLASSES,IRRIGATED,GRAIN,"AREA PLANTED, NET",ACRES,"SORGHUM, GRAIN, IRRIGATED - ACRES PLANTED, NET",TOTAL,NOT SPECIFIED,ARIZONA,1972,109000.0


In [31]:
# Remove the 'AREA PLANTED, NET' rows.
is_planted_net = df['statisticcat_desc'].eq('AREA PLANTED, NET')
df = df[~is_planted_net]

In [32]:
# Convert acres to hectares.
# NOTE(review): 'unit_desc' is not touched here and keeps reading 'ACRES'.
df['Value'] = df['Value'].mul(0.404686)

In [33]:
# Yearly total of 'Value' over all remaining rows, drawn as a single line.
yearly_area = df.groupby('year')['Value'].sum().to_frame()
fig = px.line(yearly_area, title='Evolución histórica de la superfice de cultivo')
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='Hectáreas')
fig.show()

El cambio brusco en los valores que aparece en la gráfica en el año 1993 es debido a la variable "FIELD CROP TOTALS" de la columna "commodity_desc". Esta se elimina, ya que es información que se superpone con el resto de variables.

In [34]:
# One column per commodity, one row per year, each cell the summed 'Value'.
commodity = df.pivot_table(
    values='Value',
    index='year',
    columns='commodity_desc',
    aggfunc=np.sum,
)
fig = px.line(
    commodity,
    title='Evolución histórica de la superfice de cultivo para cada cultivo',
)
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='Hectáreas')
fig.show()

In [35]:
# Remove the 'FIELD CROP TOTALS' commodity rows.
is_field_crop_total = df['commodity_desc'].eq('FIELD CROP TOTALS')
df = df[~is_field_crop_total]

In [36]:
# Yearly total of 'Value' over all remaining rows, drawn as a single line.
yearly_area = df.groupby('year')['Value'].sum().to_frame()
fig = px.line(yearly_area, title='Evolución histórica de la superfice de cultivo')
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='Hectáreas')
fig.show()

In [37]:
# Save the cleaned table as an Excel workbook, without the index column.
df.to_excel(
    '../DATOS/API_superficie_cultivada/TOTAL/TOTAL.xlsx',
    index=False,
)

In [38]:
df.head(2)

Unnamed: 0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
40,SURVEY,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,BARLEY - ACRES PLANTED,TOTAL,NOT SPECIFIED,ARIZONA,1950,77295.03
41,SURVEY,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,BARLEY - ACRES PLANTED,TOTAL,NOT SPECIFIED,ARKANSAS,1950,2832.8


In [39]:
# Mean of 'Value' over the rows of each state, largest first.
mean_by_state = df.groupby('state_name')['Value'].mean().to_frame()
mean_by_state = mean_by_state.sort_values(by='Value', ascending=False)
fig = px.bar(mean_by_state, title='Superficie media en cada estado')
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='Hectáreas')
fig.show()


In [40]:


# Yearly totals as points, with a LOWESS trend line on top.
yearly_area = df.groupby('year')['Value'].sum().to_frame()
fig = px.scatter(
    yearly_area,
    trendline='lowess',
    title='Evolución histórica de la superfice de cultivo',
)
fig.update_xaxes(title_text='')
fig.update_yaxes(title_text='Hectáreas')
fig.show()

##### Evolución de la superficie agrícola por año y estado representada en el mapa de EEUU
Para esta representación se ha utilizado un mapa coroplético de Estados Unidos a partir de la función de plotly "choropleth"; para ello es necesario incorporar en el DataFrame los códigos de cada estado.

In [41]:
# Dictionary mapping each two-letter state code to the state name
# (multi-word names are written with underscores).
columns = {
    'AL': 'Alabama', 'AK': 'Alaska', 'AZ': 'Arizona', 'AR': 'Arkansas', 'CA': 'California',
    'CO': 'Colorado', 'CT': 'Connecticut', 'DE': 'Delaware', 'FL': 'Florida', 'GA': 'Georgia',
    'ID': 'Idaho', 'IL': 'Illinois', 'IN': 'Indiana', 'IA': 'Iowa', 'KS': 'Kansas',
    'KY': 'Kentucky', 'LA': 'Louisiana', 'ME': 'Maine', 'MD': 'Maryland', 'MA': 'Massachusetts',
    'MI': 'Michigan', 'MN': 'Minnesota', 'MS': 'Mississippi', 'MO': 'Missouri', 'MT': 'Montana',
    'NE': 'Nebraska', 'NV': 'Nevada', 'NH': 'New_Hampshire', 'NJ': 'New_Jersey', 'NM': 'New_Mexico',
    'NY': 'New_York', 'NC': 'North_Carolina', 'ND': 'North_Dakota', 'OH': 'Ohio', 'OK': 'Oklahoma',
    'OR': 'Oregon', 'PA': 'Pennsylvania', 'RI': 'Rhode_Island', 'SC': 'South_Carolina', 'SD': 'South_Dakota',
    'TN': 'Tennessee', 'TX': 'Texas', 'UT': 'Utah', 'VT': 'Vermont', 'VA': 'Virginia',
    'WA': 'Washington', 'WV': 'West_Virginia', 'WI': 'Wisconsin', 'WY': 'Wyoming', 'HI': 'Hawaii',
}

In [42]:
# Two-column lookup table (code, upper-cased state name) built from the dictionary.
df_code_states = pd.DataFrame(list(columns.items()), columns=['code', 'state'])
df_code_states['state'] = df_code_states['state'].str.upper()
df_code_states.head(50)

Unnamed: 0,code,state
0,AL,ALABAMA
1,AK,ALASKA
2,AZ,ARIZONA
3,AR,ARKANSAS
4,CA,CALIFORNIA
5,CO,COLORADO
6,CT,CONNECTICUT
7,DE,DELAWARE
8,FL,FLORIDA
9,GA,GEORGIA


In [43]:
# Replace spaces with underscores so state names match the lookup table's spelling.
df["state_name"] = df["state_name"].astype(str).str.replace(" ", "_", regex=False)

In [44]:

df['state_name'].drop_duplicates()


40             ARIZONA
41            ARKANSAS
42          CALIFORNIA
43            COLORADO
44            DELAWARE
45             GEORGIA
46               IDAHO
47            ILLINOIS
48             INDIANA
49                IOWA
50              KANSAS
51            KENTUCKY
52               MAINE
53            MARYLAND
54            MICHIGAN
55           MINNESOTA
56            MISSOURI
57             MONTANA
58            NEBRASKA
59              NEVADA
60          NEW_JERSEY
61          NEW_MEXICO
62            NEW_YORK
63      NORTH_CAROLINA
64        NORTH_DAKOTA
65                OHIO
66            OKLAHOMA
67              OREGON
68        PENNSYLVANIA
69      SOUTH_CAROLINA
70        SOUTH_DAKOTA
71           TENNESSEE
72               TEXAS
73                UTAH
74             VERMONT
75            VIRGINIA
76          WASHINGTON
77       WEST_VIRGINIA
78           WISCONSIN
79             WYOMING
359            ALABAMA
364        CONNECTICUT
366            FLORIDA
374        

In [45]:
# Attach the two-letter state code to every row of the data.
# A left join keeps exactly the rows of df: an outer join would also append an
# all-NaN row for any state of the lookup table that has no data. Rows whose
# state_name has no match in the lookup table keep a missing 'code'.
df_map = df.merge(df_code_states, how='left', left_on='state_name', right_on='state')
# 'state' duplicates 'state_name' once the join is done.
df_map = df_map.drop(columns='state')

In [46]:
df_map.isnull().sum()

source_desc              0
commodity_desc           0
class_desc               0
prodn_practice_desc      0
util_practice_desc       0
statisticcat_desc        0
unit_desc                0
short_desc               0
domain_desc              0
domaincat_desc           0
state_name               0
year                     0
Value                    0
code                   497
dtype: int64

In [47]:
df_map[['state_name', 'code']].drop_duplicates()

Unnamed: 0,state_name,code
0,ARIZONA,AZ
1427,ARKANSAS,AR
2609,CALIFORNIA,CA
6435,COLORADO,CO
8833,DELAWARE,DE
9531,GEORGIA,GA
10646,IDAHO,ID
12847,ILLINOIS,IL
13708,INDIANA,IN
14526,IOWA,IA


In [48]:
# Map of the USA showing how the agricultural area of each state evolves over the years.

# The map can show only one value per state and animation frame, so the many
# per-crop rows are first collapsed into one total per state and year; rows
# without a state code cannot be located on the map and are left out.
state_year_area = (
    df_map.dropna(subset=['code'])
    .groupby(['year', 'code', 'state_name'], as_index=False)['Value']
    .sum()
    .sort_values('year')  # animation frames follow the row order
)

fig = px.choropleth(state_year_area, locations="code",
                    locationmode="USA-states",
                    color="Value",
                    color_continuous_scale='algae',
                    hover_name="state_name",
                    scope="usa",
                    hover_data=['Value'],
                    animation_frame="year",
                    labels={'Value': 'Hectáreas'},
                    title='Evolución de la superficie agrícola')
fig.update_layout(height=600)
fig.show()


#reference: https://plotly.com/python/choropleth-maps/

In [49]:
# NOTE(review): `ñ` is not defined in the visible notebook, so this cell raises
# NameError (as the recorded output shows); presumably a deliberate stop so the
# cells below are not executed on "Run All" — confirm or remove.
ñ

NameError: name 'ñ' is not defined

Otra columna que llama la atención es "prodn_practice_desc" ("Variety of Product"), ya que divide los cultivos en función de las prácticas de cultivo. En este caso se decide seleccionar únicamente la variable "ALL PRODUCTION PRACTICES", ya que se entiende que integra al resto de prácticas. Se llega a esta conclusión porque esta variable aparece para todos los cultivos, mientras que el resto de variables pueden aparecer o no en cada cultivo.

In [None]:
# Summed 'Value' per data item (rows) and production practice (columns).
all_production_practicies = df.pivot_table(
    values='Value',
    index='short_desc',
    columns='prodn_practice_desc',
    aggfunc=np.sum,
)
all_production_practicies.head(20)

prodn_practice_desc,ALL PRODUCTION PRACTICES,FOLLOWING ANOTHER CROP (DOUBLE CROPPED),IN THE OPEN,IRRIGATED,NON-IRRIGATED,"NON-IRRIGATED, CONTINUOUS CROP","NON-IRRIGATED, FOLLOWING SUMMER FALLOW",NOT FOLLOWING ANOTHER CROP
short_desc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ALMONDS - ACRES BEARING,5458404.77,,,,,,,
APPLES - ACRES BEARING,1901461.69,,,,,,,
APRICOTS - ACRES BEARING,59739.75,,,,,,,
ARTICHOKES - ACRES PLANTED,71305.67,,,,,,,
ASPARAGUS - ACRES PLANTED,427214.87,,,,,,,
AVOCADOS - ACRES BEARING,316262.11,,,,,,,
BARLEY - ACRES PLANTED,249081602.54,,,,,,,
"BARLEY, IRRIGATED - ACRES PLANTED",,,,26434465.88,,,,
"BARLEY, NON-IRRIGATED - ACRES PLANTED",,,,,114802809.68,,,
"BARLEY, NON-IRRIGATED, CONTINUOUS CROP - ACRES PLANTED",,,,,,58619119.18,,


In [None]:
# Keep only the rows reported for 'ALL PRODUCTION PRACTICES'.
is_all_practices = df['prodn_practice_desc'].eq('ALL PRODUCTION PRACTICES')
df = df[is_all_practices]

In [None]:
px.line(df.groupby('year')['Value'].sum())

In [None]:
############################ everything is clear up to here; check that everything is still correct after the changes that were made and verify whether what follows below is true


In [None]:
df.columns

Index(['source_desc', 'commodity_desc', 'class_desc', 'prodn_practice_desc',
       'util_practice_desc', 'statisticcat_desc', 'unit_desc', 'short_desc',
       'domain_desc', 'domaincat_desc', 'state_name', 'year', 'Value'],
      dtype='object')

- El grupo de cultivo "CHICKPEAS" aparece dividido en: "ALL CLASSES", "LARGE" y "SMALL". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba a los otros dos.
- El grupo de cultivo "COTTON" aparece dividido en: "ALL CLASSES", "PIMA" y "UPLAND". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba a los otros dos.
- El grupo de cultivo "GRAPEFRUIT" aparece dividido en: "ALL CLASSES", "RED SEEDLESS" y "WHITE SEEDLESS". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.
- El grupo de cultivo "GRAPES" aparece dividido en: "ALL CLASSES", "JUICE TYPE", "RAISIN TYPE", "TABLE TYPE" y "WINE TYPE". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.
- El grupo de cultivo "ORANGES" aparece dividido en: "ALL CLASSES", "MID & NAVEL" y "VALENCIA". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.
- El grupo de cultivo "PEACHES" aparece dividido en: "ALL CLASSES", "CLINGSTONE" y "FREESTONE". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.
- El grupo de cultivo "PEARS" aparece dividido en: "ALL CLASSES" y "BARTLETT". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.
- El grupo de cultivo "POTATOES" aparece dividido en: "ALL CLASSES", "FALL", "SPRING", "SUMMER" y "WINTER". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.
- El grupo de cultivo "RICE" aparece dividido en: "ALL CLASSES", "LONG GRAIN", "MEDIUM GRAIN" y "SHORT GRAIN". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.
- El grupo de cultivo "SUNFLOWER" aparece dividido en: "ALL CLASSES", "NON-OIL TYPE" y "OIL TYPE". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.
- El grupo de cultivo "WHEAT" aparece dividido en: "ALL CLASSES", "SPRING" y "WINTER". Filtramos únicamente por el grupo "ALL CLASSES", ya que este engloba al resto.

In [None]:
# Renumber the rows 0..n-1 and expose that position as a leading 'level_0'
# column; the previous index labels are discarded.
df = df.reset_index(drop=True)
df.insert(0, 'level_0', df.index)
df.head(2)

Unnamed: 0,level_0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
0,0,SURVEY,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,BARLEY - ACRES PLANTED,TOTAL,NOT SPECIFIED,ARIZONA,1950,77295.03
1,1,SURVEY,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,BARLEY - ACRES PLANTED,TOTAL,NOT SPECIFIED,ARKANSAS,1950,2832.8


In [None]:
# Commodities that are reported both as 'ALL CLASSES' and broken down by class.
commodities_with_classes = [
    'CHICKPEAS', 'COTTON', 'GRAPEFRUIT', 'GRAPES', 'ORANGES', 'PEACHES',
    'PEARS', 'POTATOES', 'RICE', 'SUNFLOWER', 'WHEAT',
]
is_class_breakdown = df['class_desc'] != 'ALL CLASSES'
# For each commodity, the 'level_0' ids of its per-class rows.
per_commodity_ids = [
    df.loc[(df['commodity_desc'] == name) & is_class_breakdown, 'level_0']
    for name in commodities_with_classes
]
x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11 = per_commodity_ids
# All ids together, in the same commodity order as the list above.
x_tot = pd.concat(per_commodity_ids)

In [None]:
# Drop every row whose 'level_0' id was collected in x_tot.
# One vectorised mask replaces the row-by-row loop, which rebuilt the whole
# DataFrame on each drop. It also compares against the VALUES of x_tot:
# `value in series` tests the Series' index labels, which only gave the right
# answer because the labels happened to equal the ids.
df = df[~df['level_0'].isin(x_tot)]

In [None]:

px.line(df.groupby('year')['Value'].sum())

In [None]:
df.head(2)

Unnamed: 0,level_0,source_desc,commodity_desc,class_desc,prodn_practice_desc,util_practice_desc,statisticcat_desc,unit_desc,short_desc,domain_desc,domaincat_desc,state_name,year,Value
0,0,SURVEY,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,BARLEY - ACRES PLANTED,TOTAL,NOT SPECIFIED,ARIZONA,1950,77295.03
1,1,SURVEY,BARLEY,ALL CLASSES,ALL PRODUCTION PRACTICES,ALL UTILIZATION PRACTICES,AREA PLANTED,ACRES,BARLEY - ACRES PLANTED,TOTAL,NOT SPECIFIED,ARKANSAS,1950,2832.8


In [None]:
# One column per commodity, one row per year, each cell the summed 'Value'.
commodity = df.pivot_table(
    values='Value',
    index='year',
    columns='commodity_desc',
    aggfunc=np.sum,
)

In [None]:
px.line(commodity)

In [None]:
px.line(df.groupby('year')['Value'].sum())

In [None]:
# Remove the helper and constant-valued columns before the final export.
# The labels must be given as a list: pandas treats a tuple as ONE (multi-level)
# label, so the original tuple form raises KeyError instead of dropping six columns.
df.drop(
    columns=[
        'level_0',
        'source_desc',
        'domaincat_desc',
        'domain_desc',
        'unit_desc',
        'prodn_practice_desc',
    ],
    inplace=True,
)

In [None]:
# Save the final table as an Excel workbook, without the index column.
df.to_excel(
    '../Rstudio/TOTAL/TOTAL.xlsx',
    index=False,
)