<a href="https://colab.research.google.com/github/annab0503/DS4002/blob/main/Project%201/Analysis%20Data/data_cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import pandas as pd
import os
import requests

## Visa Data Frame

### Step 1: Load the Excel file from the URL

In [2]:
# Raw visa workbook hosted in the project repository
url = 'https://github.com/annab0503/DS4002/blob/main/Project%201/Original%20Data/FYs97-14_NIVDetailTable.xls?raw=true'

# Bound the wait and stop on an HTTP error, so that an error page is never
# written to disk and then handed to the Excel parser
response = requests.get(url, timeout=60)
response.raise_for_status()
excel_file_path = '/content/FYs97-14_NIVDetailTable.xls'

# Save the file locally
with open(excel_file_path, 'wb') as f:
    f.write(response.content)

# Open the workbook; individual sheets are parsed in the next step
excel_data = pd.ExcelFile(excel_file_path)

# Directory that receives one CSV per sheet (created if it does not exist yet)
csv_directory = '/content/csv_files'
os.makedirs(csv_directory, exist_ok=True)

### Step 2: Iterate through each sheet in the Excel file and save as CSV

In [3]:
# Write every worksheet of the workbook to its own CSV, named after the sheet
for name in excel_data.sheet_names:
    target_path = os.path.join(csv_directory, f'{name}.csv')

    # Parse the sheet and dump it without the row index
    excel_data.parse(name).to_csv(target_path, index=False)
    print(f"Saved sheet '{name}' to {target_path}")

Saved sheet 'FY97' to /content/csv_files/FY97.csv
Saved sheet 'FY98' to /content/csv_files/FY98.csv
Saved sheet 'FY99' to /content/csv_files/FY99.csv
Saved sheet 'FY00' to /content/csv_files/FY00.csv
Saved sheet 'FY01' to /content/csv_files/FY01.csv
Saved sheet 'FY02' to /content/csv_files/FY02.csv
Saved sheet 'FY03' to /content/csv_files/FY03.csv
Saved sheet 'FY04' to /content/csv_files/FY04.csv
Saved sheet 'FY05' to /content/csv_files/FY05.csv
Saved sheet 'FY06' to /content/csv_files/FY06.csv
Saved sheet 'FY07' to /content/csv_files/FY07.csv
Saved sheet 'FY08' to /content/csv_files/FY08.csv
Saved sheet 'FY09' to /content/csv_files/FY09.csv
Saved sheet 'FY10' to /content/csv_files/FY10.csv
Saved sheet 'FY11' to /content/csv_files/FY11.csv
Saved sheet 'FY12' to /content/csv_files/FY12.csv
Saved sheet 'FY13' to /content/csv_files/FY13.csv
Saved sheet 'FY14' to /content/csv_files/FY14.csv


### Step 3: Process each CSV file

In [4]:

# Only the raw per-sheet exports (named like 'FY97.csv') are inputs here.
# Filtering on the 'FY' prefix keeps 'processed_*.csv' and 'merged_data.csv',
# which later steps write into this same directory, from being picked up when
# the cell is run again. Sorting makes the processing order deterministic.
csv_files = sorted(
    f for f in os.listdir(csv_directory)
    if f.startswith('FY') and f.endswith('.csv')
)

# Continent and aggregate labels that share the country column but are not countries
countries_to_drop = ['Africa', 'Asia', 'Europe', 'North America', 'Oceania', 'South America', 'Unknown']

# Loop over each CSV file for processing
for csv_file in csv_files:
    # Load the CSV file into a DataFrame
    year_data = pd.read_csv(os.path.join(csv_directory, csv_file))

    # Drop empty rows
    year_data = year_data.dropna(how='all')

    # The column whose header contains 'Fiscal Year' is the one treated as the
    # country column from here on, so rename it to 'Country'
    fiscal_year_column = next((col for col in year_data.columns if 'Fiscal Year' in col), None)
    if fiscal_year_column is not None:
        year_data = year_data.rename(columns={fiscal_year_column: 'Country'})
    elif 'Country' not in year_data.columns:
        # Fail with an actionable message instead of a bare KeyError further down
        raise KeyError(
            f"{csv_file}: no column containing 'Fiscal Year' (and no 'Country' column) was found"
        )

    # Remove rows for continents and totals
    year_data = year_data[~year_data['Country'].isin(countries_to_drop)]
    year_data = year_data[~year_data['Country'].str.contains('Totals', na=False)]

    # Reshape to long form: one row per (country, visa type)
    year_data = year_data.melt(id_vars=['Country'],
                               var_name='Type of U.S. Visa',
                               value_name='Quantity of U.S Visas Granted')

    # Extract the fiscal-year code from the filename (e.g., 'FY97.csv' -> 97,
    # 'FY00.csv' -> 0); it is stored as-is in the 'Fiscal Year' column
    year = int(csv_file.split('FY')[1].split('.csv')[0])
    year_data['Fiscal Year'] = year

    # Save the reshaped DataFrame as a new CSV file
    output_file = os.path.join(csv_directory, f"processed_{csv_file}")
    year_data.to_csv(output_file, index=False)

    print(f"Processed {csv_file} and saved to {output_file}")

Processed FY11.csv and saved to /content/csv_files/processed_FY11.csv
Processed FY09.csv and saved to /content/csv_files/processed_FY09.csv
Processed FY08.csv and saved to /content/csv_files/processed_FY08.csv
Processed FY07.csv and saved to /content/csv_files/processed_FY07.csv
Processed FY04.csv and saved to /content/csv_files/processed_FY04.csv
Processed FY13.csv and saved to /content/csv_files/processed_FY13.csv
Processed FY03.csv and saved to /content/csv_files/processed_FY03.csv
Processed FY99.csv and saved to /content/csv_files/processed_FY99.csv
Processed FY01.csv and saved to /content/csv_files/processed_FY01.csv
Processed FY02.csv and saved to /content/csv_files/processed_FY02.csv
Processed FY10.csv and saved to /content/csv_files/processed_FY10.csv
Processed FY00.csv and saved to /content/csv_files/processed_FY00.csv
Processed FY98.csv and saved to /content/csv_files/processed_FY98.csv
Processed FY06.csv and saved to /content/csv_files/processed_FY06.csv
Processed FY14.csv a

### Step 4: Merge all processed CSV files into one DataFrame

In [73]:
# Collect the per-year outputs written by the previous step
processed_csv_files = [
    name for name in os.listdir(csv_directory)
    if name.startswith('processed_') and name.endswith('.csv')
]

# Read every processed file, then stack them into one table with a fresh index
dfs = [pd.read_csv(os.path.join(csv_directory, name)) for name in processed_csv_files]
merged_df = pd.concat(dfs, ignore_index=True)

# Persist the combined table alongside the per-year files
merged_csv_file = os.path.join(csv_directory, 'merged_data.csv')
merged_df.to_csv(merged_csv_file, index=False)

print(f"All processed CSV files merged and saved to {merged_csv_file}")

All processed CSV files merged and saved to /content/csv_files/merged_data.csv


### Step 5: Load the merged data

In [74]:
# Re-read the combined visa table from disk
visa_data = pd.read_csv('/content/csv_files/merged_data.csv')

# Force 'Fiscal Year' to a numeric dtype; anything unparseable becomes NaN
fiscal_year_numeric = pd.to_numeric(visa_data['Fiscal Year'], errors='coerce')
visa_data['Fiscal Year'] = fiscal_year_numeric

# Report how many 'Fiscal Year' entries are NaN after the conversion
n_missing_years = fiscal_year_numeric.isna().sum()
print(n_missing_years, "non-numeric values converted to NaN")

0 non-numeric values converted to NaN


### Step 6: Map fiscal year codes to actual years

In [75]:
# Two-digit fiscal-year codes -> four-digit years:
# 97-99 map to 1997-1999 and 0-14 map to 2000-2014
code_to_year_map = {code: 1900 + code for code in range(97, 100)}
code_to_year_map.update({code: 2000 + code for code in range(0, 15)})

# Translate each code in 'Fiscal Year'; values absent from the map become NaN
visa_data['Fiscal Year'] = visa_data['Fiscal Year'].map(code_to_year_map)

### Step 7: Export the merged and cleaned data frame

In [76]:
# Write the cleaned visa table to the working directory, omitting the row index
visa_output_path = 'visa_data.csv'
visa_data.to_csv(visa_output_path, index=False)

## GDP Data Frame

### Step 1: Load the Excel file from the URL

In [77]:
# Raw GDP workbook hosted in the project repository
url = 'https://github.com/annab0503/DS4002/blob/main/Project%201/Original%20Data/WITS-Country-Timeseries.xlsx?raw=true'

# Bound the wait and stop on an HTTP error, so that an error page is never
# written to disk and then handed to the Excel parser
response = requests.get(url, timeout=60)
response.raise_for_status()

# Keep the download under /content like the visa workbook, rather than
# writing to the filesystem root
excel_file_path = '/content/WITS-Country-Timeseries.xlsx'

# Save the file locally
with open(excel_file_path, 'wb') as f:
    f.write(response.content)

# Load the Excel file
excel_data = pd.ExcelFile(excel_file_path)

# Load the 'Country-Timeseries' sheet into a DataFrame
df = excel_data.parse('Country-Timeseries')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,Country Name,Indicator Name,1988,1989,1990,1991,1992,1993,1994,1995,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Afghanistan,GDP (current US$),,,,,,,,,...,20564490000.0,20550580000.0,19998140000.0,18019550000.0,18896350000.0,18418860000.0,18904500000.0,20143450000.0,14583140000.0,
1,Albania,GDP (current US$),2051236000.0,2253090000.0,2028554000.0,1099559000.0,652175000.0,1185315000.0,1880951000.0,2392765000.0,...,12776220000.0,13228150000.0,11386850000.0,11861200000.0,13019730000.0,15156420000.0,15401830000.0,15162730000.0,17930570000.0,18882100000.0
2,Algeria,GDP (current US$),59089400000.0,55634720000.0,62048510000.0,45715680000.0,48003130000.0,49945590000.0,42543180000.0,41764290000.0,...,209755000000.0,213810000000.0,165979200000.0,160034200000.0,170097000000.0,174910900000.0,171760300000.0,145743700000.0,163472200000.0,191912900000.0
3,Andorra,GDP (current US$),721426000.0,795489500.0,1028989000.0,1106891000.0,1209993000.0,1007091000.0,1017545000.0,1178745000.0,...,3193513000.0,3271686000.0,2789881000.0,2896610000.0,3000162000.0,3218420000.0,3155149000.0,2891001000.0,3325145000.0,3352033000.0
4,Angola,GDP (current US$),8769837000.0,10201780000.0,11229520000.0,12704560000.0,15114350000.0,11051940000.0,3390500000.0,5561222000.0,...,133401600000.0,137244400000.0,87219300000.0,49840490000.0,68972770000.0,77792940000.0,69309110000.0,50241370000.0,65685440000.0,106713600000.0


### Step 2: Process the data frame

In [78]:
# Reshape from one column per year to one row per (country, year) and discard
# the indicator label column
gdp_long = df.melt(
    id_vars=["Country Name", "Indicator Name"],
    var_name="Fiscal Year",
    value_name="GDP",
).drop(columns=["Indicator Name"])

# Year labels become numbers; anything unparseable turns into NaN
gdp_long['Fiscal Year'] = pd.to_numeric(gdp_long['Fiscal Year'], errors='coerce')

# Keep fiscal years 1997-2014 (inclusive) that have a GDP value
in_study_window = gdp_long['Fiscal Year'].between(1997, 2014)
has_gdp = gdp_long['GDP'].notna()

# Use 'Country' as the column name
df = gdp_long[in_study_window & has_gdp].rename(columns={'Country Name': 'Country'})

# Display the cleaned DataFrame
df

Unnamed: 0,Country,Fiscal Year,GDP
1738,Albania,1997,2.258514e+09
1739,Algeria,1997,4.817761e+10
1740,Andorra,1997,1.180646e+09
1741,Angola,1997,7.648377e+09
1742,Antigua and Barbuda,1997,6.806185e+08
...,...,...,...
5206,Venezuela,2014,4.823593e+11
5207,Vietnam,2014,2.334515e+11
5208,Yemen,2014,4.322859e+10
5209,Zambia,2014,2.714102e+10


### Step 3: Export the cleaned data frame

In [79]:
# Write the cleaned GDP table to disk without the row index.
# to_csv returns None when given a path, so the result must not be assigned
# back to `df` -- doing so would discard the DataFrame.
df.to_csv('/content/gdp_data.csv', index=False)

## Population Data Frame

### Step 1: Load the Excel file from the URL

In [80]:
# Raw population workbook hosted in the project repository
url = 'https://github.com/annab0503/DS4002/blob/main/Project%201/Original%20Data/IDB_01-02-2025.xlsx?raw=true'

# Bound the wait and stop on an HTTP error, so that an error page is never
# written to disk and then handed to the Excel parser
response = requests.get(url, timeout=60)
response.raise_for_status()

# Keep the download under /content like the visa workbook, rather than
# writing to the filesystem root
excel_file_path = '/content/IDB_01-02-2025.xlsx'

# Save the file locally
with open(excel_file_path, 'wb') as f:
    f.write(response.content)

# Load the Excel file
df = pd.read_excel(excel_file_path)

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,Name,GENC,Year,Population,Annual Growth Rate,Rate of Natural Increase,Population Density,Crude Birth Rate,Net Migration Rate,"Net International Migrants, Both Sexes"
0,-> 1997,,,--,--,--,--,--,--,--
1,Afghanistan,AF,1997.0,21514488,3.76,3.77,33,50.6,-0.1,-2438
2,Albania,AL,1997.0,3225413,-0.01,1.35,117.7,19.5,-13.6,-43782
3,Algeria,DZ,1997.0,29318185,1.6,1.71,12.3,22.3,-1.2,-33826
4,American Samoa,AS,1997.0,55918,--,--,282.4,--,--,--


### Step 2: Process the Data Frame

In [81]:
# Drop unnecessary rows and columns
df.dropna(subset=['Year'], inplace=True)
df = df.drop(columns=['GENC'])

# Define the columns to process
numeric_columns = [
    'Population',
    'Net International Migrants, Both Sexes',
    'Annual Growth Rate',
    'Rate of Natural Increase',
    'Population Density',
    'Crude Birth Rate',
    'Net Migration Rate'
]

# Process the columns
for col in numeric_columns:
    if col == 'Population' or col == 'Net International Migrants, Both Sexes':
        # For columns with commas, remove them, handle non-numeric values, and convert to integer
        df[col] = pd.to_numeric(df[col].str.replace(',', '', regex=False), errors='coerce').fillna(0).astype(int)
    else:
        # For other numeric columns, convert to float
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0).astype(float)

# Convert 'Year' column to integer
df['Year'] = df['Year'].astype(int)

# Rename the columns
df.rename(columns={'Name': 'Country'}, inplace=True)
df.rename(columns={'Year': 'Fiscal Year'}, inplace=True)

# Drop rows where 'Fiscal Year' = 2025
df = df[df['Fiscal Year'] != 2025]

df

Unnamed: 0,Country,Fiscal Year,Population,Annual Growth Rate,Rate of Natural Increase,Population Density,Crude Birth Rate,Net Migration Rate,"Net International Migrants, Both Sexes"
1,Afghanistan,1997,21514488,3.76,3.77,33.0,50.6,-0.1,-2438
2,Albania,1997,3225413,-0.01,1.35,117.7,19.5,-13.6,-43782
3,Algeria,1997,29318185,1.60,1.71,12.3,22.3,-1.2,-33826
4,American Samoa,1997,55918,0.00,0.00,282.4,0.0,0.0,0
5,Andorra,1997,64053,1.23,0.67,136.9,11.4,5.7,0
...,...,...,...,...,...,...,...,...,...
4099,Wallis and Futuna,2014,15527,0.32,0.84,109.3,13.6,-5.3,0
4100,West Bank,2014,2613511,2.11,2.58,463.4,29.4,-4.7,-12271
4101,Yemen,2014,26734619,2.60,2.75,50.6,34.2,-1.6,-41570
4102,Zambia,2014,15433003,3.09,3.07,20.8,38.1,0.1,1858


In [82]:
# Create a new row for Palestine by grouping Gaza Strip and West Bank
palestine_row = df[df['Country'].isin(['Gaza Strip', 'West Bank'])]

# Define the columns to sum and average
sum_columns = ['Population']
avg_columns = [
    'Annual Growth Rate',
    'Rate of Natural Increase',
    'Population Density',
    'Crude Birth Rate',
    'Net Migration Rate',
    'Net International Migrants, Both Sexes'
]

# Sum the columns that need to be summed
palestine_row_sum = palestine_row.groupby('Fiscal Year')[sum_columns].sum()

# Average the columns that need to be averaged
palestine_row_avg = palestine_row.groupby('Fiscal Year')[avg_columns].mean()

# Combine the sum and average results into one DataFrame
palestine_row = pd.concat([palestine_row_sum, palestine_row_avg], axis=1)

# Add 'Country' as 'Palestine' to the new row
palestine_row['Country'] = 'Palestine'

# Reset the index so it's a DataFrame
palestine_row = palestine_row.reset_index()

# Concatenate the Palestine row to the DataFrame
df = pd.concat([df, palestine_row], ignore_index=True)

df

Unnamed: 0,Country,Fiscal Year,Population,Annual Growth Rate,Rate of Natural Increase,Population Density,Crude Birth Rate,Net Migration Rate,"Net International Migrants, Both Sexes"
0,Afghanistan,1997,21514488,3.760,3.770,33.00,50.60,-0.10,-2438.0
1,Albania,1997,3225413,-0.010,1.350,117.70,19.50,-13.60,-43782.0
2,Algeria,1997,29318185,1.600,1.710,12.30,22.30,-1.20,-33826.0
3,American Samoa,1997,55918,0.000,0.000,282.40,0.00,0.00,0.0
4,Andorra,1997,64053,1.230,0.670,136.90,11.40,5.70,0.0
...,...,...,...,...,...,...,...,...,...
4099,Palestine,2010,3918857,2.685,3.005,213.55,33.70,-3.20,-7135.5
4100,Palestine,2011,4018381,2.620,2.935,217.90,32.95,-3.15,-7135.5
4101,Palestine,2012,4118367,2.555,2.855,222.35,32.15,-3.05,-7135.5
4102,Palestine,2013,4219915,2.540,2.840,226.90,31.90,-3.00,-7135.5


### Step 3: Export the cleaned data frame

In [83]:
# Write the cleaned population table to disk without the row index.
# to_csv returns None when given a path, so the result must not be assigned
# back to `df` -- doing so would discard the DataFrame.
df.to_csv('/content/pop_data.csv', index=False)

## Merged Data Frame

### Step 1: Load the cleaned data frames

In [84]:
# Load the GDP data and visa data
gdp_data = pd.read_csv('gdp_data.csv')
visa_data = pd.read_csv('visa_data.csv')
pop_data = pd.read_csv('pop_data.csv')

# Ensure the columns 'Country' and 'Year' are present and consistent in both DataFrames
print("Columns in GDP Data:", gdp_data.columns)
print("Columns in Visa Data:", visa_data.columns)
print("Columns in Population Data:", pop_data.columns)

# Standardize column names if necessary (e.g., trimming whitespaces or renaming)
gdp_data.rename(columns=lambda x: x.strip(), inplace=True)
visa_data.rename(columns=lambda x: x.strip(), inplace=True)
pop_data.rename(columns=lambda x: x.strip(), inplace=True)

Columns in GDP Data: Index(['Country', 'Fiscal Year', 'GDP'], dtype='object')
Columns in Visa Data: Index(['Country', 'Type of U.S. Visa', 'Quantity of U.S Visas Granted',
       'Fiscal Year'],
      dtype='object')
Columns in Population Data: Index(['Country', 'Fiscal Year', 'Population', 'Annual Growth Rate',
       'Rate of Natural Increase', 'Population Density', 'Crude Birth Rate',
       'Net Migration Rate', 'Net International Migrants, Both Sexes'],
      dtype='object')


### Step 2: Troubleshoot unmatched data

In [91]:
# Identify unique values of 'Country' in the unmatched_data and sort alphabetically
unique_countries1 = sorted(visa_data['Country'].unique())
unique_countries2 = sorted(gdp_data['Country'].unique())
unique_countries3 = sorted(pop_data['Country'].unique())

In [92]:
# Convert the lists of unique countries into sets
set_countries1 = set(unique_countries1)
set_countries2 = set(unique_countries2)
set_countries3 = set(unique_countries3)

In [93]:
# Labels that occur in the visa data but in neither the GDP nor the population data
missing_in_visa = set_countries1.difference(set_countries2, set_countries3)
missing_in_visa

{'China - Taiwan',
 'China - mainland',
 'Congo, Dem. Rep. of the (Congo Kinshasa)',
 'Congo, Dem. Rep. of the (Kinshasa)',
 'Congo, Rep. of the (Brazzaville)',
 'Congo, Rep. of the (Congo Brazzaville)',
 "Cote d'Ivoire ",
 'Great Britain and Northern Ireland',
 'Hong Kong S.A.R.',
 'Macau S.A.R.',
 'Macedonia',
 'No Nationality',
 'Palestinian Authority Travel Document',
 'Serbia  ',
 'Serbia and Montenegro',
 'Swaziland',
 'United Nations Laissez-Passer',
 'Vatican City'}

In [97]:
# Rename Countries in Visa data frame for consistency
visa_data.loc[visa_data['Country'] == 'China - Taiwan', 'Country'] = 'Taiwan'
visa_data.loc[visa_data['Country'] == 'China - mainland', 'Country'] = 'China'
visa_data.loc[visa_data['Country'] == 'Congo, Dem. Rep. of the (Congo Kinshasa)', 'Country'] = 'Democratic Republic of the Congo'
visa_data.loc[visa_data['Country'] == 'Congo, Dem. Rep. of the (Kinshasa)', 'Country'] = 'Democratic Republic of the Congo'
visa_data.loc[visa_data['Country'] == 'Congo, Rep. of the (Congo Brazzaville)', 'Country'] = 'Republic of the Congo'
visa_data.loc[visa_data['Country'] == 'Congo, Rep. of the (Brazzaville)', 'Country'] = 'Republic of the Congo'
visa_data.loc[visa_data['Country'] == "Cote d'Ivoire ", 'Country'] = "Cote d'Ivoire"
visa_data.loc[visa_data['Country'] == 'Great Britain and Northern Ireland', 'Country'] = 'United Kingdom'
visa_data.loc[visa_data['Country'] == 'Hong Kong S.A.R.', 'Country'] = 'Hong Kong'
visa_data.loc[visa_data['Country'] == 'Macau S.A.R.', 'Country'] = 'Macau'
visa_data.loc[visa_data['Country'] == 'Kyrgyzstan', 'Country'] = 'Kyrgyz Republic'
visa_data.loc[visa_data['Country'] == 'Macedonia', 'Country'] = 'North Macedonia'
visa_data.loc[visa_data['Country'] == 'Palestinian Authority Travel Document', 'Country'] = 'Palestine'
visa_data.loc[visa_data['Country'] == 'Serbia  ', 'Country'] = 'Serbia'
visa_data.loc[visa_data['Country'] == 'Swaziland', 'Country'] = 'Eswatini (Swaziland)'

In [94]:
# Labels that occur in the GDP data but in neither the visa nor the population data
missing_in_gdp = set_countries2.difference(set_countries1, set_countries3)
missing_in_gdp

{'Congo, Dem. Rep.',
 'Congo, Rep.',
 'East Timor',
 'Egypt, Arab Rep.',
 'Ethiopia(excludes Eritrea)',
 'Faeroe Islands',
 'Fm Sudan',
 'Hong Kong, China',
 'Iran, Islamic Rep.',
 'Korea, Rep.',
 'Kyrgyz Republic',
 'Lao PDR',
 'Macao',
 'Micronesia, Fed. Sts.',
 'Myanmar',
 'Occ.Pal.Terr',
 'Russian Federation',
 'Serbia, FR(Serbia/Montenegro)',
 'Slovak Republic',
 'St. Kitts and Nevis',
 'St. Lucia',
 'St. Vincent and the Grenadines',
 'Syrian Arab Republic',
 'Turks and Caicos Isl.'}

In [101]:
# Rename Countries in GDP data frame for consistency
gdp_data.loc[gdp_data['Country'] == 'Congo, Dem. Rep.', 'Country'] = 'Democratic Republic of the Congo'
gdp_data.loc[gdp_data['Country'] == 'Congo, Rep.', 'Country'] = 'Republic of the Congo'
gdp_data.loc[gdp_data['Country'] == 'East Timor', 'Country'] = 'Timor-Leste'
gdp_data.loc[gdp_data['Country'] == 'Egypt, Arab Rep.', 'Country'] = 'Egypt'
gdp_data.loc[gdp_data['Country'] == 'Ethiopia(excludes Eritrea)', 'Country'] = 'Ethiopia'
gdp_data.loc[gdp_data['Country'] == 'Fm Sudan', 'Country'] = 'Sudan'
gdp_data.loc[gdp_data['Country'] == 'Hong Kong, China', 'Country'] = 'Hong Kong'
gdp_data.loc[gdp_data['Country'] == 'Iran, Islamic Rep.', 'Country'] = 'Iran'
gdp_data.loc[gdp_data['Country'] == 'Korea, Rep.', 'Country'] = 'Korea, South'
gdp_data.loc[gdp_data['Country'] == 'Lao PDR', 'Country'] = 'Laos'
gdp_data.loc[gdp_data['Country'] == 'Macao', 'Country'] = 'Macau'
gdp_data.loc[gdp_data['Country'] == 'Micronesia, Fed. Sts.', 'Country'] = 'Micronesia'
gdp_data.loc[gdp_data['Country'] == 'Myanmar', 'Country'] = 'Burma'
gdp_data.loc[gdp_data['Country'] == 'Myanmar', 'Country'] = 'Palestine'
gdp_data.loc[gdp_data['Country'] == 'Russian Federation', 'Country'] = 'Russia'
gdp_data.loc[gdp_data['Country']== 'Serbia, FR(Serbia/Montenegro)', 'Country'] = 'Serbia and Montenegro'
gdp_data.loc[gdp_data['Country'] == 'Slovak Republic', 'Country'] = 'Slovakia'
gdp_data.loc[gdp_data['Country'] == 'St. Kitts and Nevis', 'Country'] = 'Saint Kitts and Nevis'
gdp_data.loc[gdp_data['Country']== 'St. Lucia', 'Country'] = 'Saint Lucia'
gdp_data.loc[gdp_data['Country'] == 'St. Vincent and the Grenadines', 'Country'] = 'Saint Vincent and the Grenadines'
gdp_data.loc[gdp_data['Country'] == 'Syrian Arab Republic', 'Country'] = 'Syria'

In [95]:
# Labels that occur in the population data but in neither the visa nor the GDP data
missing_in_pop = set_countries3.difference(set_countries1, set_countries2)
missing_in_pop

{'American Samoa',
 'Anguilla',
 'Congo (Brazzaville)',
 'Congo (Kinshasa)',
 'Cook Islands',
 'Curaçao',
 'Czechia',
 'Côte d’Ivoire',
 'Faroe Islands',
 'Gaza Strip',
 'Gibraltar',
 'Guam',
 'Guernsey',
 'Hong Kong',
 'Isle of Man',
 'Jersey',
 'Macau',
 'Montserrat',
 'Northern Mariana Islands',
 'Palestine',
 'Puerto Rico',
 'Saint Barthelemy',
 'Saint Helena, Ascension, and Tristan da Cunha',
 'Saint Martin',
 'Saint Pierre and Miquelon',
 'Sint Maarten',
 'Taiwan',
 'Turks and Caicos Islands',
 'Virgin Islands, British',
 'Virgin Islands, U.S.',
 'Wallis and Futuna',
 'West Bank'}

In [102]:
# Rename Countries in Population data frame for consistency
pop_data.loc[pop_data['Country'] == 'Congo (Brazzaville)', 'Country'] = 'Republic of the Congo'
pop_data.loc[pop_data['Country'] == 'Congo (Kinshasa)', 'Country'] = 'Democratic Republic of the Congo'
pop_data.loc[pop_data['Country'] == 'Czechia', 'Country'] = 'Czech Republic'
pop_data.loc[pop_data['Country'] == "Côte d’Ivoire", 'Country'] = "Cote d'Ivoire"

### Step 3: Complete merge

In [104]:
# Merge the data frames on 'Country' and 'Fiscal Year' with indicator=True
merged_data = pd.merge(visa_data, gdp_data, on=['Country', 'Fiscal Year'], how='outer', indicator=True)

# Rename the '_merge' column to avoid conflict
merged_data.rename(columns={'_merge': '_merge_gdp'}, inplace=True)

# Merge with the pop_data DataFrame
merged_data = pd.merge(merged_data, pop_data, on=['Country', 'Fiscal Year'], how='outer', indicator=True)

#View merged data
merged_data

Unnamed: 0,Country,Type of U.S. Visa,Quantity of U.S Visas Granted,Fiscal Year,GDP,_merge_gdp,Population,Annual Growth Rate,Rate of Natural Increase,Population Density,Crude Birth Rate,Net Migration Rate,"Net International Migrants, Both Sexes",_merge
0,Afghanistan,A-1,0.0,1997,,left_only,21514488.0,3.76,3.77,33.0,50.6,-0.1,-2438.0,both
1,Afghanistan,A-2,1.0,1997,,left_only,21514488.0,3.76,3.77,33.0,50.6,-0.1,-2438.0,both
2,Afghanistan,A-3,1.0,1997,,left_only,21514488.0,3.76,3.77,33.0,50.6,-0.1,-2438.0,both
3,Afghanistan,B-1,46.0,1997,,left_only,21514488.0,3.76,3.77,33.0,50.6,-0.1,-2438.0,both
4,Afghanistan,"B-1,2",361.0,1997,,left_only,21514488.0,3.76,3.77,33.0,50.6,-0.1,-2438.0,both
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
318130,Zimbabwe,U-4,0.0,2014,1.949552e+10,both,13791770.0,2.08,2.63,35.7,35.4,-5.5,-75299.0,both
318131,Zimbabwe,U-5,0.0,2014,1.949552e+10,both,13791770.0,2.08,2.63,35.7,35.4,-5.5,-75299.0,both
318132,Zimbabwe,Total Visas,7358.0,2014,1.949552e+10,both,13791770.0,2.08,2.63,35.7,35.4,-5.5,-75299.0,both
318133,Zimbabwe,BCC,0.0,2014,1.949552e+10,both,13791770.0,2.08,2.63,35.7,35.4,-5.5,-75299.0,both




### Step 4: Export the merged data frame

In [108]:
# Write the final combined table to the working directory, omitting the row index
merged_output_path = 'merged_data.csv'
merged_data.to_csv(merged_output_path, index=False)