In [1]:
import pandas as pd
from sqlalchemy import create_engine

### Extract CSVs into DataFrames

In [72]:
#application_file upload will be based on file location on local machine
application_file = r"C:\Users\bradl\Desktop\Git\nwBootCamp\ETL_Asylum\Resources\asylum-applications.csv"
application_df = pd.read_csv(application_file)
application_df.head()

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),applied
0,2006,Afghanistan,AFG,Australia,AUS,14
1,2006,Albania,ALB,Australia,AUS,21
2,2006,Algeria,DZA,Australia,AUS,5
3,2006,Egypt,EGY,Australia,AUS,38
4,2006,Bahrain,BHR,Australia,AUS,11


In [21]:
decision_file = r"C:\Users\bradl\Desktop\Git\nwBootCamp\ETL_Asylum\Resources\asylum-decisions.csv"
decision_df = pd.read_csv(decision_file)
decision_df.head()

Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Recognized decisions,Complementary protection,Rejected decisions,Otherwise closed,Total decisions
0,2000,Afghanistan,AFG,Australia,AUS,38,0,25,0,63
1,2000,Albania,ALB,Australia,AUS,5,0,23,0,28
2,2000,Algeria,DZA,Australia,AUS,5,0,15,0,20
3,2000,Egypt,EGY,Australia,AUS,20,0,49,5,74
4,2000,Armenia,ARM,Australia,AUS,0,0,5,0,5


### Transform application DataFrame

In [22]:
application_df.shape

(90803, 6)

In [23]:
application_df["Country of origin"].value_counts()

Iraq                      1945
Dem. Rep. of the Congo    1932
Somalia                   1863
Afghanistan               1846
Iran (Islamic Rep. of)    1789
                          ... 
Gibraltar                    1
Cook Islands                 1
Anguilla                     1
Marshall Islands             1
Puerto Rico                  1
Name: Country of origin, Length: 212, dtype: int64

In [24]:
new_application_df = application_df[['Year', 'Country of origin', 'Country of asylum', 'applied']].copy()

new_application_transformed_df = new_application_df.rename(columns={"Year": "year",
                                                          "Country of origin": "country_of_origin",
                                                          "Country of asylum": "country_of_asylum",
                                                          "applied": "applied"
                                                         })
new_application_transformed_df.head()

Unnamed: 0,year,country_of_origin,country_of_asylum,applied
0,2006,Afghanistan,Australia,14
1,2006,Albania,Australia,21
2,2006,Algeria,Australia,5
3,2006,Egypt,Australia,38
4,2006,Bahrain,Australia,11


In [25]:
us_application_df = new_application_transformed_df.loc[new_application_transformed_df["country_of_asylum"] == "United States of America"]

us_application_df.reset_index(drop=True)

Unnamed: 0,year,country_of_origin,country_of_asylum,applied
0,2017,Albania,United States of America,5
1,2017,Egypt,United States of America,29
2,2017,Argentina,United States of America,5
3,2017,Bangladesh,United States of America,5
4,2017,Brazil,United States of America,52
...,...,...,...,...
5985,2005,Yemen,United States of America,13
5986,2005,Zambia,United States of America,16
5987,2005,Zimbabwe,United States of America,222
5988,2005,Stateless,United States of America,83


In [26]:
us_application_df["country_of_origin"].value_counts()

Albania             46
India               46
Indonesia           46
Jordan              46
Syrian Arab Rep.    46
                    ..
Iceland              1
Samoa                1
Curacao              1
Cayman Islands       1
Marshall Islands     1
Name: country_of_origin, Length: 191, dtype: int64

In [54]:
us_application_df_merge = us_application_df
us_application_df_merge['merge_key'] = us_application_df['year'].astype(str) + us_application_df['country_of_origin']+us_application_df['country_of_asylum']
us_application_df_merge

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_application_df_merge['merge_key'] = us_application_df['year'].astype(str) + us_application_df['country_of_origin']+us_application_df['country_of_asylum']


Unnamed: 0,year,country_of_origin,country_of_asylum,applied,merge_key
6112,2017,Albania,United States of America,5,2017AlbaniaUnited States of America
6113,2017,Egypt,United States of America,29,2017EgyptUnited States of America
6114,2017,Argentina,United States of America,5,2017ArgentinaUnited States of America
6115,2017,Bangladesh,United States of America,5,2017BangladeshUnited States of America
6116,2017,Brazil,United States of America,52,2017BrazilUnited States of America
...,...,...,...,...,...
77155,2005,Yemen,United States of America,13,2005YemenUnited States of America
77156,2005,Zambia,United States of America,16,2005ZambiaUnited States of America
77157,2005,Zimbabwe,United States of America,222,2005ZimbabweUnited States of America
77158,2005,Stateless,United States of America,83,2005StatelessUnited States of America


### Transform decision DataFrame

In [27]:
decision_df.shape

(82518, 10)

In [28]:
decision_df["Country of asylum"].value_counts()

United States of America                                5767
Sweden                                                  4207
Germany                                                 4065
France                                                  3459
United Kingdom of Great Britain and Northern Ireland    3451
                                                        ... 
Vanuatu                                                    1
Samoa                                                      1
Grenada                                                    1
Viet Nam                                                   1
Lao People's Dem. Rep.                                     1
Name: Country of asylum, Length: 183, dtype: int64

In [29]:
new_decision_df = decision_df[['Year', 'Country of origin', 'Country of asylum', 'Recognized decisions', 'Complementary protection','Rejected decisions','Otherwise closed','Total decisions']].copy()
new_decision_transformed_df = new_decision_df.rename(columns={"Year": "year",
                                                          "Country of origin": "country_of_origin",
                                                          "Country of asylum": "country_of_asylum",
                                                          "Recognized decisions": "recognized_decisions",
                                                          "Complementary protection": "complementary_protection",
                                                          "Rejected decisions": "rejected_decisions",
                                                          "Otherwise closed": "otherwise_closed",
                                                          "Total decisions": "total_decisions"
                                                         })

new_decision_transformed_df.head()

Unnamed: 0,year,country_of_origin,country_of_asylum,recognized_decisions,complementary_protection,rejected_decisions,otherwise_closed,total_decisions
0,2000,Afghanistan,Australia,38,0,25,0,63
1,2000,Albania,Australia,5,0,23,0,28
2,2000,Algeria,Australia,5,0,15,0,20
3,2000,Egypt,Australia,20,0,49,5,74
4,2000,Armenia,Australia,0,0,5,0,5


In [30]:
us_decision_df = new_decision_transformed_df.loc[new_decision_transformed_df["country_of_asylum"] == "United States of America"]

us_decision_df.reset_index(drop=True)

Unnamed: 0,year,country_of_origin,country_of_asylum,recognized_decisions,complementary_protection,rejected_decisions,otherwise_closed,total_decisions
0,2017,Afghanistan,United States of America,5,0,0,5,10
1,2017,Albania,United States of America,38,0,0,19,57
2,2017,Angola,United States of America,5,0,0,0,5
3,2017,Egypt,United States of America,104,0,5,58,167
4,2017,Armenia,United States of America,15,0,5,18,38
...,...,...,...,...,...,...,...,...
5762,2020,Venezuela (Bolivarian Republic of),United States of America,1356,0,3118,1157,5631
5763,2020,Yemen,United States of America,23,0,42,42,107
5764,2020,Zambia,United States of America,5,0,5,0,10
5765,2020,Zimbabwe,United States of America,27,0,35,23,85


In [31]:
us_decision_df["country_of_origin"].value_counts()

Guatemala                 46
Dem. Rep. of the Congo    46
Cameroon                  46
Nicaragua                 46
Uzbekistan                46
                          ..
Cayman Islands             1
Marshall Islands           1
Norway                     1
Cyprus                     1
Botswana                   1
Name: country_of_origin, Length: 184, dtype: int64

In [70]:
us_decision_df_merge = us_decision_df
us_decision_df_merge ['merge_key'] = us_decision_df['year'].astype(str) + us_decision_df['country_of_origin']+us_decision_df['country_of_asylum']
us_decision_df_merge.sort(['year'], descending = True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  us_decision_df_merge ['merge_key'] = us_decision_df['year'].astype(str) + us_decision_df['country_of_origin']+us_decision_df['country_of_asylum']


AttributeError: 'DataFrame' object has no attribute 'sort'

### Merge Application and Decision DataFrames

In [73]:
# Still needs some edits
merged_df = pd.merge(us_application_df_merge,us_decision_df_merge, how='inner')
merged_df

Unnamed: 0,year,country_of_origin,country_of_asylum,applied,merge_key,recognized_decisions,complementary_protection,rejected_decisions,otherwise_closed,total_decisions
0,2017,Albania,United States of America,5,2017AlbaniaUnited States of America,38,0,0,19,57
1,2017,Albania,United States of America,5,2017AlbaniaUnited States of America,88,0,37,35,160
2,2017,Albania,United States of America,5,2017AlbaniaUnited States of America,13,0,56,64,133
3,2017,Albania,United States of America,181,2017AlbaniaUnited States of America,38,0,0,19,57
4,2017,Albania,United States of America,181,2017AlbaniaUnited States of America,88,0,37,35,160
...,...,...,...,...,...,...,...,...,...,...
11673,2005,Turkmenistan,United States of America,59,2005TurkmenistanUnited States of America,20,0,28,5,53
11674,2005,Zambia,United States of America,16,2005ZambiaUnited States of America,10,0,10,10,30
11675,2005,Zambia,United States of America,16,2005ZambiaUnited States of America,5,0,12,5,22
11676,2005,Unknown,United States of America,260,2005Unknown United States of America,5,0,17,0,22


### Create database connection

In [33]:
connection_string = "postgres:postgres@localhost:5432/asylum_db"
engine = create_engine(f'postgresql://{connection_string}')

In [34]:
# Confirm tables
engine.table_names()

  engine.table_names()


OperationalError: (psycopg2.OperationalError) FATAL:  password authentication failed for user "postgres"

(Background on this error at: http://sqlalche.me/e/14/e3q8)

### Load DataFrames into database

In [None]:
us_application_df.to_sql(name='application', con=engine, if_exists='append', index=True)

In [None]:
us_decision_df.to_sql(name='decision', con=engine, if_exists='append', index=True)