# Data analysis for migrants project

## Install libraries

In [1]:
import pandas as pd
import calendar
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

## Load data and clean it

### Get data from website

In [2]:
data = pd.read_csv("https://missingmigrants.iom.int/sites/g/files/tmzbdl601/files/2022-02/Missing_Migrants_Global_Figures.csv")

### Clean up columns and column names

In [3]:
cols = data.columns
new_column_names = []
for col in cols:
    new_col = col.lstrip().rstrip().lower().replace (" ", "_")
    new_column_names.append(new_col)
data.columns = new_column_names
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10274 entries, 0 to 10273
Data columns (total 24 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   main_id                              10274 non-null  object 
 1   incident_id                          10274 non-null  object 
 2   incident_type                        10274 non-null  object 
 3   region_of_incident                   10274 non-null  object 
 4   incident_date                        10259 non-null  object 
 5   incident_year                        10274 non-null  int64  
 6   reported_month                       10274 non-null  object 
 7   number_of_dead                       9844 non-null   float64
 8   minimum_estimated_number_of_missing  975 non-null    float64
 9   total_number_of_dead_and_missing     10274 non-null  int64  
 10  number_of_survivors                  1511 non-null   float64
 11  number_of_females           

### Clean up coordiantes

In [4]:
data["coordinates"] = data["coordinates"].str.replace("POINT","")
data["coordinates"] = data["coordinates"].str.replace("(","")
data["coordinates"] = data["coordinates"].str.replace(")","")
data[['lon', 'lat']] = data['coordinates'].str.split(expand=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10274 entries, 0 to 10273
Data columns (total 26 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   main_id                              10274 non-null  object 
 1   incident_id                          10274 non-null  object 
 2   incident_type                        10274 non-null  object 
 3   region_of_incident                   10274 non-null  object 
 4   incident_date                        10259 non-null  object 
 5   incident_year                        10274 non-null  int64  
 6   reported_month                       10274 non-null  object 
 7   number_of_dead                       9844 non-null   float64
 8   minimum_estimated_number_of_missing  975 non-null    float64
 9   total_number_of_dead_and_missing     10274 non-null  int64  
 10  number_of_survivors                  1511 non-null   float64
 11  number_of_females           

  data["coordinates"] = data["coordinates"].str.replace("(","")
  data["coordinates"] = data["coordinates"].str.replace(")","")


### Clean up cause of death

In [5]:
data["death_cause_clean"]=""
for index,row in data.iterrows():
    cause = (row["cause_of_death"])
    if "Violence" in cause:
        data["death_cause_clean"][index] = "Violence"
    elif "Harsh environmental conditions / lack of adequate shelter, food, water" in cause:
        data["death_cause_clean"][index] = "Harsh environmental conditions"
    elif "Drowning" in cause:
        data["death_cause_clean"][index] = "Drowning"
    elif "Vehicle accident / death linked to hazardous transport" in cause:
        data["death_cause_clean"][index] = "Vehicle accident"
    elif "Sickness / lack of access to adequate healthcare" in cause:
        data["death_cause_clean"][index] = "Sickness inadequate healthcare access"
    elif "Mixed or unknown" in cause:
        data["death_cause_clean"][index] = "Mixed or unknown"
    elif "Accidental death" in cause:
        data["death_cause_clean"][index] = "Accidental death"
    else:
        print(cause)
data.info()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["death_cause_clean"][index] = "Mixed or unknown"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["death_cause_clean"][index] = "Violence"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["death_cause_clean"][index] = "Harsh environmental conditions"
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10274 entries, 0 to 10273
Data columns (total 27 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   main_id                              10274 non-null  object 
 1   incident_id                          10274 non-null  object 
 2   incident_type                        10274 non-null  object 
 3   region_of_incident                   10274 non-null  object 
 4   incident_date                        10259 non-null  object 
 5   incident_year                        10274 non-null  int64  
 6   reported_month                       10274 non-null  object 
 7   number_of_dead                       9844 non-null   float64
 8   minimum_estimated_number_of_missing  975 non-null    float64
 9   total_number_of_dead_and_missing     10274 non-null  int64  
 10  number_of_survivors                  1511 non-null   float64
 11  number_of_females           

## Analysis

### Region and cause

In [9]:
region_and_cause = data.groupby(['region_of_incident','death_cause_clean'])['total_number_of_dead_and_missing'].sum().reset_index()
region_and_cause.rename(columns={'region_of_incident': 'source', 'death_cause_clean': 'target', 'total_number_of_dead_and_missing':'value'}, inplace=True)
region_and_cause

Unnamed: 0,source,target,value
0,Caribbean,Drowning,926
1,Caribbean,Harsh environmental conditions,13
2,Caribbean,Mixed or unknown,63
3,Caribbean,Vehicle accident,15
4,Caribbean,Violence,8
...,...,...,...
93,Western Asia,Harsh environmental conditions,121
94,Western Asia,Mixed or unknown,51
95,Western Asia,Sickness inadequate healthcare access,12
96,Western Asia,Vehicle accident,148


### Causes over time (months)

In [10]:
month_and_cause = data.groupby(['incident_date','death_cause_clean'])['total_number_of_dead_and_missing'].sum().reset_index()
month_and_cause['incident_date'] = pd.to_datetime(month_and_cause['incident_date'])
month_and_cause['month_year'] = month_and_cause['incident_date'].dt.to_period('M')
month_and_cause.loc[:, month_and_cause.columns!='incident_date']
month_and_cause = month_and_cause.groupby(['month_year','death_cause_clean'])['total_number_of_dead_and_missing'].sum().reset_index()
month_and_cause = month_and_cause.pivot(index='month_year', columns='death_cause_clean', values='total_number_of_dead_and_missing').reset_index()
month_and_cause = month_and_cause.fillna(0).astype(int)
month_and_cause

death_cause_clean,month_year,Accidental death,Drowning,Harsh environmental conditions,Mixed or unknown,Sickness inadequate healthcare access,Vehicle accident,Violence
0,528,0,13,1,11,0,1,2
1,529,0,25,1,9,5,0,24
2,530,1,304,2,9,0,11,3
3,531,0,66,11,11,0,7,11
4,532,1,398,0,58,0,3,1
...,...,...,...,...,...,...,...,...
93,621,5,182,31,64,14,58,23
94,622,17,165,35,35,13,67,15
95,623,8,610,12,16,7,99,27
96,624,0,289,21,12,3,17,5


### Immigration routes over years

In [56]:
year_and_route = data.groupby(['incident_year','migration_route'])['total_number_of_dead_and_missing'].sum().reset_index()

year_and_route = year_and_route[(year_and_route.migration_route=='US-Mexico border crossing')|
                                (year_and_route.migration_route=="Afghanistan to Iran")|
                                (year_and_route.migration_route=="Central Mediterranean")|
                               (year_and_route.migration_route=="Eastern Mediterranean")|
                                (year_and_route.migration_route=="Sahara Desert crossing")|
                                (year_and_route.migration_route=="Western Mediterranean")
                               ]

year_and_route = year_and_route[(year_and_route.incident_year!=2022)]
year_and_route = year_and_route.sort_values(["incident_year","total_number_of_dead_and_missing"],ascending=[True, False])
year_and_route.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46 entries, 0 to 134
Data columns (total 3 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   incident_year                     46 non-null     int64 
 1   migration_route                   46 non-null     object
 2   total_number_of_dead_and_missing  46 non-null     int64 
dtypes: int64(2), object(1)
memory usage: 1.4+ KB


### Data for maps

#### US-Mexico border

In [57]:
mexico_to_us = data[data.migration_route == "US-Mexico border crossing"]

#### Europe

In [58]:
regions = ['Europe','Mediterranean','Northern Africa']
europe = data[data['region_of_incident'].isin(regions)]

## Write out data

In [12]:
data.to_csv("../data/data.csv",index=False)
region_and_cause.to_csv("../data/region_and_cause.csv", index = False)
month_and_cause.to_csv("../data/month_and_cause.csv",index=False)
year_and_route.to_csv("../data/year_and_route.csv",index=False)
mexico_to_us.to_csv("../data/mexico_to_us.csv",index=False)
europe.to_csv("../data/europe.csv",index=False)

-30-