# Travel and Tourism Reform Project

### Documentation

**Dataframes:** 
- df_qcontcust_2009_2019 -> contains data on all years between 2009 - 2019
- df_qcontcust_2009, df_qcontcust_2010, ... to df_qcontcust_2019 -> filtered from df_qcontcust_2009_2019 for each year
- df_qcontcust_2022 -> contains data for 2022 
***


## Importing Packages

In [1]:

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json

import statsmodels.api as sm
from statsmodels.sandbox.stats.multicomp import multipletests

import scipy.stats as ss
from scipy.stats import kruskal
from scipy.stats import mannwhitneyu
from scipy.stats import chi2_contingency

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from scikit_posthocs import posthoc_dunn

from itertools import product

from imblearn.over_sampling import RandomOverSampler

from tabulate import tabulate

import warnings
warnings.filterwarnings("ignore")

## Loading Data

In [2]:
df_qcontcust_2009_2019 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2013-UKDA-7380-tab\\tab\\qcontcust_2009_2019.tab", delimiter='\t')
#filtering the dataset into different years
df_qcontcust_2009 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2009]
df_qcontcust_2010 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2010]
df_qcontcust_2011 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2011]
df_qcontcust_2012 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2012]
df_qcontcust_2013 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2013]
df_qcontcust_2014 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2014]
df_qcontcust_2015 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2015]
df_qcontcust_2016 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2016]
df_qcontcust_2017 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2017]
df_qcontcust_2018 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2018]
df_qcontcust_2019 = df_qcontcust_2009_2019[df_qcontcust_2009_2019['Year'] == 2019]
df_qcontcust_2022 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2022-UKDA-9122-tab\\tab\\qcontcust2022.tab", delimiter='\t')


df_qreg_2013 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2013-UKDA-7380-tab\\tab\\qreg_2013.tab", delimiter='\t')
df_qreg_2014 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2014-UKDA-7534-tab\\tab\\qreg_2014.tab", delimiter='\t')
df_qreg_2015 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2015-UKDA-7754-tab\\tab\\qreg_2015.tab", delimiter='\t')
df_qreg_2016 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2016-UKDA-8016-tab\\tab\\qreg_2016.tab", delimiter='\t')
df_qreg_2017 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2017-UKDA-8286-tab\\tab\\qreg_2017.tab", delimiter='\t')
df_qreg_2018 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2018-UKDA-8468-tab\\tab\\qreg_2018.tab", delimiter='\t')
df_qreg_2019 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2019-UKDA-8575-tab\\tab\\qreg_2019.tab", delimiter='\t')
df_qreg_2022 = pd.read_csv("C:\\Users\\medasud\\Downloads\\2022-UKDA-9122-tab\\tab\\qreg_2022.tab", delimiter='\t')
#qreg is not available for 2009-2012


## Creating New Variables from Mappings

In [22]:
#dictionary for flow
flow_dict = {
    1.0: "Air Departure Foreign",
    2.0: "Air Departure UK",
    3.0: "Air Arrival Foreign",
    4.0: "Air Arrival UK",
    5.0: "Sea Departure Foreign",
    6.0: "Sea Departure UK",
    7.0: "Sea Arrival Foreign",
    8.0: "Sea Arrival UK"
}

#function to create Flow_Label column for all years

def create_flow_label_column(df):
    """
    This function creates a new column FLow_Label which is derived from the column Flow
    and an external data dictionary mapping the integer/float values in Flow to their respective 
    values. This is for all years.

    Parameters:
    param1 : the dataframe being manipulated
 
    Returns:
    no return value. When the function is called, the new column is created.
    """
    #fill missing values in Purpose column with -1
    df['Flow'].replace(' ', np.nan, inplace=True)
    df['Flow'].fillna(-1, inplace=True)
    df['Flow'] = df['Flow'].astype(float)
    df['Flow'].replace('-1', np.nan, inplace=True)
    
    df['Flow_Label'] = df['Flow'].map(flow_dict)

#call this function for df_qcontcust of each year
dataframes = [df_qcontcust_2009, df_qcontcust_2010, df_qcontcust_2011, df_qcontcust_2012,
              df_qcontcust_2013, df_qcontcust_2014, df_qcontcust_2015, df_qcontcust_2016,
              df_qcontcust_2017, df_qcontcust_2018, df_qcontcust_2019, df_qcontcust_2022]

#iterate over the list of dataframes and apply the function
for df in dataframes:
    create_flow_label_column(df)

In [40]:
df_qcontcust_2015['Flow_Label'].value_counts()

Flow_Label
Air Departure UK         103740
Air Arrival UK            55851
Air Arrival Foreign       54274
Air Departure Foreign     41071
Sea Departure UK          13884
Sea Arrival Foreign       11296
Sea Arrival UK             9825
Sea Departure Foreign      9146
Name: count, dtype: int64

In [23]:
#create new column Purpose_Label for years 2009-19

#load the mapping from the JSON file
file_path = "C:\\Users\\medasud\\Documents\\Project1\\Purpose_value_map_0919.json"
with open(file_path, 'r') as json_file:
    purpose_mapping_0919 = json.load(json_file)

#function to create Purpose_Label column
def create_purpose_column_0919(df, mapping):
    """
    This function creates a new column Purpose_Label which is derived from the column Purpose
    and an external data dictionary mapping the integer/float values in Purpose to their respective 
    purposes. This is for the years 2009-2019 only.

    Parameters:
    param1 : the dataframe being manipulated
    param2 : the mapping from the json file

    Returns:
    no return value. When the function is called, the new column is created.
    """
    df['Purpose'].replace(' ', pd.NA, inplace=True)
    df['Purpose'].fillna(-1, inplace=True)
    df['Purpose'] = df['Purpose'].astype(float)
    df['Purpose'] = df['Purpose'].astype(str)
    
    # Create a new column "Purpose_Label" by mapping the values
    df['Purpose_Label'] = df['Purpose'].map(mapping)
    df['Purpose'] = df['Purpose'].astype(float)
    
#call this function for df_qcontcust of each year
dataframes = [df_qcontcust_2009, df_qcontcust_2010, df_qcontcust_2011, df_qcontcust_2012,
              df_qcontcust_2013, df_qcontcust_2014, df_qcontcust_2015, df_qcontcust_2016,
              df_qcontcust_2017, df_qcontcust_2018, df_qcontcust_2019]

#iterate over the list of dataframes and apply the function
for df in dataframes:
    create_purpose_column_0919(df, purpose_mapping_0919)


In [24]:
#create new column Purpose_Label for years 2022

#load the mapping from the JSON file
file_path = "C:\\Users\\medasud\\Documents\\Project1\\Purpose_value_map_22.json"
with open(file_path, 'r') as json_file:
    purpose_mapping_22 = json.load(json_file)
    
#function to create Purpose_Label column
def create_purpose_column_22(df, mapping):
    """
    This function creates a new column Purpose_Label which is derived from the column Purpose
    and an external data dictionary mapping the integer/float values in Purpose to their respective 
    purposes. This is for the years 2022 only as the Purpose codes are different for this year.
    Purpose codes for subsequent years are likely to remain the same, and in that case, this function
    can be reused.

    Parameters:
    param1 : the dataframe being manipulated
    param2 : the mapping from the json file

    Returns:
    no return value. When the function is called, the new column is created.
    """
    
    df['Purpose'].replace(' ', pd.NA, inplace=True)
    df['Purpose'].fillna(-1, inplace=True)
    df['Purpose'] = df['Purpose'].astype(float)
    df['Purpose'] = df['Purpose'].astype(str)
    
    # Create a new column "Purpose_Label" by mapping the values
    df['Purpose_Label'] = df['Purpose'].map(mapping)
    df['Purpose'] = df['Purpose'].astype(float)
    
#call the function
create_purpose_column_22(df_qcontcust_2022, purpose_mapping_22)


In [27]:
df_qcontcust_2022['Purpose_Label'].value_counts()

Purpose_Label
Holiday/pleasure                                               51586
Visit family (priority)                                        29972
Business; Work                                                 11315
Visit friends                                                   4723
Same day transit                                                2970
Overnight transit                                               1372
OTHER                                                           1098
Watch sport                                                     1043
Play amateur sport                                               645
Definite job to go to                                            528
Medical Treatment                                                476
Cruise 0-2 nights ashore - For                                   242
International commuter                                           216
Military or embassy (serving on duty)                            178
First or Foundation 

In [29]:
df_qcontcust_2013['Purpose_Label'].value_counts()

Purpose_Label
Holiday/pleasure                                      50472
Visit family (priority)                               22423
Business; Work                                        18709
Visit friends                                          4677
Same day transit                                       3180
Overnight transit                                      1331
Play amateur sport                                     1157
OTHER                                                  1151
Watch sport                                            1060
Personal shopping                                       743
Other formal study                                      654
Cruise 0-2 nights ashore - For                          408
Definite job to go to                                   345
Accompany / join                                        242
Medical treatment                                       211
Cruise 0-2 nights ashore - UK                           209
Military (serving on duty)

In [74]:
#create new column Nationality_Label for years 2009-19

#load the mapping from the JSON file
file_path = "C:\\Users\\medasud\\Documents\\Project1\\Nationality_value_map_0919.json"
with open(file_path, 'r') as json_file:
    nationality_mapping_0919 = json.load(json_file)

#function to create Nationality_Label column for 2009-2019
def create_nationality_label_column_0919(df):
    """
    This function creates a new column Nationality_Label which is derived from the column Nationality
    and an external data dictionary mapping the integer/float values in Nationality to their respective 
    purposes. This is for the years 2009-2019 only.

    Parameters:
    param1 : the dataframe being manipulated.

    Returns:
    return value. When the function is called, the new column is created.
    """
    df['Nationality'].replace(' ', np.nan, inplace=True)
    df['Nationality'].fillna(-1, inplace=True)
    #changing to the datatype to str to facilitate mapping as the JSON file has the key as type string by default
    df['Nationality'] = df['Nationality'].astype(str)
    
    #create a new column Nationality_Label by mapping the values
    df['Nationality_Label'] = df['Nationality'].map(nationality_mapping_0919)
    df['Nationality_Label'].fillna(("Unknown"), inplace=True)
    df['Nationality'] = df['Nationality'].astype(float) 

#call this function for df_qcontcust of each year
dataframes = [df_qcontcust_2009, df_qcontcust_2010, df_qcontcust_2011, df_qcontcust_2012,
              df_qcontcust_2013, df_qcontcust_2014, df_qcontcust_2015, df_qcontcust_2016,
              df_qcontcust_2017, df_qcontcust_2018, df_qcontcust_2019]

#iterate over the list of dataframes and apply the function for 2009-2019
for df in dataframes:
    create_nationality_label_column_0919(df)


In [75]:
#create new column Nationality_Label for years 2022 onwards

#load the mapping from the JSON file
file_path = "C:\\Users\\medasud\\Documents\\Project1\\Nationality_value_map_22.json"
with open(file_path, 'r') as json_file:
    nationality_mapping_22 = json.load(json_file)
    
#function to create Nationality_Label column for 2022 onwards
def create_nationality_label_column_22(df):
    """
    This function creates a new column Nationality_Label which is derived from the column Nationality
    and an external data dictionary mapping the integer/float values in Nationality to their respective 
    purposes for the year 2022 and onwards.
    
    Parameters:
    param1 : the dataframe being manipulated.

    Returns:
    no return value. When the function is called, the new column is created.
    """
    df['Nationality'].replace(' ', np.nan, inplace=True)
    df['Nationality'].fillna(-1, inplace=True)
    #changing to the datatype to str to facilitate mapping as the JSON file has the key as type string by default
    df['Nationality'] = df['Nationality'].astype(str)
    
    #create a new column Nationality_Label by mapping the values
    df['Nationality_Label'] = df['Nationality'].map(nationality_mapping_22)
    df['Nationality_Label'].fillna(("Unknown"), inplace=True)
    df['Nationality'] = df['Nationality'].astype(float) 

#call this function for df_qcontcust_2022
for df in dataframes:
    create_nationality_label_column_22(df_qcontcust_2022)


In [76]:
df_qcontcust_2022['Nationality_Label'].value_counts()

Nationality_Label
UK                      157672
USA                      17817
Irish Republic            9862
France/Corsica            8407
Poland                    7766
                         ...  
Papua New Guinea             1
Surinam/Dutch Guiana         1
Gabon                        1
Bhutan                       1
Burkina Faso                 1
Name: count, Length: 211, dtype: int64

In [77]:
df_qcontcust_2017['Nationality_Label'].value_counts()

Nationality_Label
UK/United Kingdom               137118
USA/United States of America     14062
France                            8991
Poland                            8421
Germany                           7605
                                 ...  
Korea, North                         1
San Marino                           1
Lesotho                              1
Guinea - Bissau                      1
Cape Verde Islands                   1
Name: count, Length: 198, dtype: int64

In [46]:
#function to create Stay_Category

def create_stay_category_column(df):
    """
    This function creates a new column Stay_Category which is derived from the column Stay, a numeric continuous variable.
    The Stay_Category column is used to convert Stay into factor levels that can later be used for model training purposes,
    as most of the variables in this dataset are also categorical.

    Parameters:
    param1 : the dataframe being manipulated.

    Returns:
    no return value. When the function is called, the new column is created.
    """
    
    #the variable names are different across the dataframes so we accomodate this
    stay_column_name = next((col for col in ['Stay', 'stay'] if col in df.columns), None)
    if stay_column_name is not None:
        df[stay_column_name].replace(' ', np.nan, inplace=True)
        df[stay_column_name] = pd.to_numeric(df[stay_column_name], errors='coerce')
    
    #we only want to consider stays for less than a year
    #removing outliers in Stay duration, only retaining stays that are less than a year
    df[stay_column_name] = df[stay_column_name].astype(float)
    df[stay_column_name] = df[stay_column_name][df[stay_column_name] <= 365]
    intervals = [1, 3, 13, 27, 90, 180, 365]
    labels = ['1-3 days', '4-13 days', '14-27 days', '1-3 months', '3-6 months', '6-12 months']
    df['Stay_Category'] = pd.cut(df[stay_column_name], bins=intervals, labels=labels)
    
#call this function for df_qcontcust of each year
dataframes = [df_qcontcust_2009, df_qcontcust_2010, df_qcontcust_2011, df_qcontcust_2012,
              df_qcontcust_2013, df_qcontcust_2014, df_qcontcust_2015, df_qcontcust_2016,
              df_qcontcust_2017, df_qcontcust_2018, df_qcontcust_2019, df_qcontcust_2022]

#iterate over the list of dataframes and apply the function for all years
for df in dataframes:
    create_stay_category_column(df)


In [47]:
#function to create Spend_Category

def create_spend_category_column(df):
    """
    This function creates a new column Spend_Category which is derived from the column Spend, a numeric continuous variable.
    The Spend_Category column is used to convert Spend into factor levels that can later be used for model training purposes,
    as most of the variables in this dataset are also categorical.

    Parameters:
    param1 : the dataframe being manipulated.

    Returns:
    no return value. When the function is called, the new column is created.
    """
    #the variable names are different across the dataframes so we accommodate this
    spend_column_name = next((col for col in ['Spend', 'spend'] if col in df.columns), None)
    if spend_column_name is not None:
        df[spend_column_name].replace(' ', np.nan, inplace=True)
        df[spend_column_name] = pd.to_numeric(df[spend_column_name], errors='coerce')

        #we only want to consider expenditure less than 10k as more than that would be outliers
        df[spend_column_name] = df[spend_column_name].astype(float)
        df[spend_column_name] = df[spend_column_name][df[spend_column_name] <= 10000]

        intervals = [0, 250, 500, 1000, 5000, float('inf')]
        labels = ['0-250 GBP', '250-500 GBP', '500-1000 GBP', '1000-5000 GBP', 'more than 5000 GBP']
        df['Spend_Category'] = pd.cut(df[spend_column_name], bins=intervals, labels=labels)

#call this function for df_qcontcust of each year
dataframes = [df_qcontcust_2009, df_qcontcust_2010, df_qcontcust_2011, df_qcontcust_2012,
              df_qcontcust_2013, df_qcontcust_2014, df_qcontcust_2015, df_qcontcust_2016,
              df_qcontcust_2017, df_qcontcust_2018, df_qcontcust_2019, df_qcontcust_2022]

#iterate over the list of dataframes and apply the function for all years
for df in dataframes:
    create_spend_category_column(df)


In [44]:
df_qcontcust_2016['Stay_Category'].value_counts()

Stay_Category
4-13 days      48027
1-3 days       22130
14-27 days     13932
1-3 months      4589
3-6 months       677
6-12 months      223
Name: count, dtype: int64

In [45]:
df_qcontcust_2018['Spend_Category'].value_counts()

Spend_Category
0-250 GBP             21868
250-500 GBP           17910
500-1000 GBP          16606
1000-5000 GBP         11576
more than 5000 GBP      480
Name: count, dtype: int64

In [30]:
#purposes of visit that we are not interested in for years
excluded_purposes = ["International commuter", "Immigrating/Emigrating", "Overnight transit", "Asylum Seeker",
                     "Same day transit", "Military (serving on duty)", "Returning Home To Live", 
                     "Merchant navy (joining or leaving ship)", "Military or embassy (serving on duty)",
                     "Airline crew (positioning)", "Coding query", "Looking for work"]


In [55]:
#create column Broad_Purpose to catgorise the different purposes of visit into 4 main categories

def create_broad_purpose_column(df):
    """
    This function creates a new column Broad_Purpose which is derived from the column Purpose_Label.
    Broad_Purpose catgorises the different purposes of visit into 4 main categories. These purposes will
    be further filtered later in this notebook.
    Broad_Purpose is the main variable of interest in this project.

    Parameters:
    param1 : the dataframe being manipulated.

    Returns:
    no return value. When the function is called, the new column is created.
    """
    #define the conditions for the different purposes
    conditions = [
        df['Purpose_Label'].isin(["Holiday/pleasure", "Getting married", "Play amateur sport",
                                  "Watch sport", "Personal shopping", "Religious Pilgrimage",
                                  "Cruise 0-2 nights ashore - UK", "Olympics/Paralympics Watch",
                                  "Cruise 0-2 nights ashore - For"]),
        df['Purpose_Label'].isin(["Business; Work", "Visit trade fair", "Conference 20+ people",
                                  "Definite job to go to", "Working Holiday", 
                                  "Olympics/Paralympics Participate", "Olympics/Paralympics Work"]),
        df['Purpose_Label'].isin(["Visit family (priority)", "Visit friends"]),
        df['Purpose_Label'].isin(["First/foundation degree", "Higher/PostGrad degree",
                                  "English language course", "Course between school and degree",
                                  "Secondary education", "Professional qualification",
                                  "Other formal study", "University Degree or Diploma",
                                  "Formal course (check residence and definition)", "Formal Course",
                                  "Other Course Below Degree Level & Above Secondary Education",
                                  "English language course (not degree level)", "Au Pair", 
                                  "Medical treatment", "Accompany / join", "OTHER",
                                  "Unacc schoolchild (16 or under, school to parents)", 
                                  "Joining another traveller","Accompany another traveller"]),
        df['Purpose_Label'].isin(excluded_purposes),
        df['Purpose_Label'].isna(),
    ]
    #migrants will be filtered out later, but we will keep them for now
    #map the different purposes to their new labels
    choices = ["Holiday", "Business or Work", "VFF", "Education and Other", "Migrants/In-eligibles", "N/A"]
    #create the column
    df['Broad_Purpose'] = np.select(conditions, choices, default='N/A')
    
    
#call this function for df_qcontcust of each year
dataframes = [df_qcontcust_2009, df_qcontcust_2010, df_qcontcust_2011, df_qcontcust_2012,
              df_qcontcust_2013, df_qcontcust_2014, df_qcontcust_2015, df_qcontcust_2016,
              df_qcontcust_2017, df_qcontcust_2018, df_qcontcust_2019, df_qcontcust_2022]

#iterate over the list of dataframes and apply the function for all years
for df in dataframes:
    create_broad_purpose_column(df)

In [56]:
df_qcontcust_2015['Broad_Purpose'].value_counts()

Broad_Purpose
N/A                      192230
Holiday                   54945
VFF                       27150
Business or Work          18057
Migrants/In-eligibles      4277
Education and Other        2428
Name: count, dtype: int64

In [78]:
#create Broad_Nationality column
#define the countries for each broad nationality

north_america = ["Canada", "Haiti", "Mexico", "USA", "US Virgin Isles", "USA/United States of America"]  

south_and_central_america = ["Argentina", "Bahamas", "Barbados", "Barbuda/Antigua", "Belize", "Bolivia", "Brazil", "Chile", 
                             "Colombia", "Costa Rica", "El Salvador", "Guatemala", "Cuba", "Dominican Republic", 
                             "Dominica", "Ecuador", "Guyana", "Honduras", "Jamaica", "Nicaragua", "Panama",
                             "Peru", "Paraguay", "Puerto Rico", "Surinam/Dutch Guiana", "Trinidad & Tobago", 
                             "Uruguay", "Venezuela"]


uk = ["UK", "UK/United Kingdom"]

eu = ["Austria", "Belgium", "Bosnia Herzegovina", "Bulgaria", "Croatia", "Czech Republic", "Denmark",
      "Estonia", "Finland", "France/Corsica", "France", "French Guiana", "Mayotte", "Germany", 
      "Greece/Crete/Rhodes", "Greece", "Southern (Greek) Cyprus", "Hungary", "Irish Republic", "Ireland", "Italy", "Italy/Sardinia", "Latvia",
      "Guadeloupe", "Lithuania", "Luxembourg", "Netherlands", "Martinique", "Malta", "Holland", "Portugal", 
      "Poland", "Madeira/Azores", "Reunion Island", "Romania", "Slovakia", "Slovenia", "Sweden", "Spain", 
      "Spain/Balearic", "Canary Islands"  , "Portugal/Portucalense/Portugal", "Turkish Republic of North Cyprus"
      ]

non_eu_europe = ["Albania", "Andorra", "Azerbaijan", "Armenia", "Belarus", "Aland Islands", "Georgia", 
                 "Gibraltar", "Vatican", "Iceland", "Liechtenstein", "Monaco", "Moldova", "Montenegro",
                 "Norway", "Serbia", "Switzerland", "Macedonia", "North Cyprus", "South Cyprus", "Kosova"]


africa = ["Algeria", "Angola", "Botswana", "Burundi", "Cameroon", "Cape Verde Islands", "Central African Rep",
          "Chad", "Comoros", "Congo (Brazzaville)", "Democratic Republic of Congo", "Benin", "South Sudan",
          "Equatorial Guinea", "Ethiopia", "South Sudan", "Eritrea", "Djibouti", "Gabon", "Gambia", "Ghana",
          "Guinea", "Ivory Coast", "Kenya", "Lesotho", "Liberia", "Libya", "Madagascar", "Malawi", "Mali",
          "Mauritania", "Mauritius", "Morocco", "Mozambique", "Namibia", "Niger", "Nigeria", "Guinea - Bissau",
          "Rwanda", "Sao Tome", "Senegal", "Seychelles", "Sierra Leone", "Somalia", "South Africa", "Zimbabwe",
          "North Sudan", "Swaziland", "Togo", "Tunisia", "Uganda", "Egypt", "Tanzania", "Tanzania/Zanzibar", 
          "Burkina Faso", "Zambia", "Benin (formerly Dahomey)"]


asia = ["Afghanistan", "Bahrain", "Bangladesh", "Bhutan", "Brunei", "Myanmar (Burma)", "Cambodia/Kampuchea", 
        "Sri Lanka", "China (excl Taiwan)/Tibet", "China/Tibet", "Cambodia", "Taiwan", "Palestine", "Hong Kong", "Hong Kong Special", 
        "India", "Bali/Borneo/Indonesia", "Indonesia", "Iran", "Iraq", "Israel", "Japan", "Kazakhstan", "Jordan", 
        "North Korea", "South Korea", "Korea, South Rep", "Korea, North", "Kuwait", "Kyrgyzstan", "Laos", "Lebanon", "Macao", "Malaysia", "Maldives",
        "Mongolia", "Oman", "Nepal", "Pakistan", "Philippines", "East Timor", "Qatar", "Russia", "Saudi Arabia",
        "Singapore", "Vietnam", "Syria", "Tajikistan", "Thailand", "Turkey", "United Arab Emirates", 
        "Turkmenistan", "Ukraine", "Uzbekistan", "Yemen (North & South)"]

oceania = ["Australia", "Christmas Is/Oceania", "Cocos I/Oceania", "Cook Is/Oceania", "Fiji", "Oceania Islands",
           "Guam", "Nauru/Oceania", "Vanuatu", "New Zealand", "Niue Island", "Norfolk Island", "Micronesia",
           "Marshall Island", "Palau", "Papua New Guinea", "Wallis & Futuna Islands", "Samoa", 
           "Ellice Island/Oceania", "Tonga/Oceania"]


other = ["American Samoa/Oceania", "Antartica", "Antarctica etc (Foreign)", "Antigua",  "Bermuda", "Bouvet Island",
         "British Indian Ocean Territory", "Solomon Island", "Virgin Islands (Br)", "Cayman Islands",
         "Faroe Islands", "Falkland Is/British Antarctic", "South Georgia/South Sanwich Islands", 
         "French Polynesia/Tahiti", "French Sthrn/Antarctic Territories", "Greenland", "Grenada",
         "Heard & McDonald Islands", "Curacao", "Bonaire", "St Maarten", "Montserrat", "Antilles", "Aruba",
         "Netherlands Antilles", "New Caledonia", "Mariana Island", "Pacific Islands", "Pitcairn Islands", 
         "St Barthelemy", "Ascension Islands/St Helena/Trist", "Nevis/St Kitts", "Anguilla", "St Lucia",
         "St Martin", "St Pierre et Miquelon", "Grenadines/St Vincent", "San Marino", "Turks & Caicos Islands",
         "British Overseas", "Guernsey", "Jersey", "Isle Of Man", "Channel Islands"] 
        #carribean countries, british overseas territory included

not_disclosed = ["Country not disclosed", "Stateless", "Country Not Stated", "Short Haul", 
                 "Cruise - Europe/Departures", "Cruise - Elsewhere/Departures", 
                 "Cruise - Europe/Arrivals - UK Ship", "Cruise - Europe/Arrivals - Foreign Ship", 
                 "Cruise - Europe/Arrivals - DK Ship", "Cruise - Elsewhere/Arrivals - UK Ship", 
                 "Cruise - Elsewhere/Arrivals - Foreign Ship", "Cruise - Elsewhere/Arrivals - DK Ship", 
                 "Cruise - DK where - Arr & Dep"]

#function to categorize countries
def create_broad_nationality_column(country):
    """
    This function creates a new column Broad_Purpose which is derived from the column Purpose_Label.
    Broad_Purpose catgorises the different purposes of visit into 4 main categories. These purposes will
    be further filtered later in this notebook.
    Broad_Purpose is the main variable of interest in this project.

    Parameters:
    param1 : the dataframe being manipulated.

    Returns:
    string: The broad nationality that the country belongs to.
    """
    
    if country in north_america:
        return "North America"
    elif country in south_and_central_america:
        return "South America"
    elif country in eu:
        return "EU"
    elif country in uk:
        return "UK"
    elif country in non_eu_europe:
        return "Non-EU Europe"
    elif country in africa:
        return "Africa"
    elif country in asia:
        return "Asia"
    elif country in oceania:
        return "Australia, NZ and Oceania"
    elif country in other:
        return "Other"   
    else:
        return "Not disclosed"

#call this function for df_qcontcust of each year
dataframes = [df_qcontcust_2009, df_qcontcust_2010, df_qcontcust_2011, df_qcontcust_2012,
              df_qcontcust_2013, df_qcontcust_2014, df_qcontcust_2015, df_qcontcust_2016,
              df_qcontcust_2017, df_qcontcust_2018, df_qcontcust_2019, df_qcontcust_2022]

#iterate over the list of dataframes and apply the function for all years
for df in dataframes:
    df['Broad_Nationality'] = df['Nationality_Label'].apply(create_broad_nationality_column)
    

In [84]:
df_qcontcust_2019['Broad_Nationality'].value_counts()

Broad_Nationality
UK                           116349
EU                            54189
Asia                          19726
North America                 15671
Australia, NZ and Oceania      3353
Africa                         2951
Non-EU Europe                  2803
South America                  1886
Not disclosed                   800
Other                           119
Name: count, dtype: int64