In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import shap
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import missingno as mno
import plotly.offline as pyo 
import plotly.io as pio
# Snapshot of seaborn's current colour palette (taken before set_theme below is applied).
color_pal = sns.color_palette()
import warnings
# NOTE(review): this silences every warning for the whole notebook, including
# pandas deprecation/FutureWarnings — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Apply a dark-grid seaborn theme with the 'colorblind' palette to subsequent figures.
sns.set_theme(style='darkgrid', palette='colorblind')
from sklearn.preprocessing import LabelEncoder 
# Shared encoder instance; used further down to integer-encode object-typed columns.
le = LabelEncoder()


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Exploration

In [None]:
# Load the Global Missing Migrants CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/global-missing-migrants-dataset/Global Missing Migrants Dataset.csv')

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# (rows, columns) of the raw data.
df.shape

In [None]:
# Data type pandas inferred for each column.
df.dtypes

In [None]:
# Number of distinct (non-null) values in each column.
unique_values = df.nunique()
unique_values

In [None]:
# Column labels of the raw data.
cols = df.columns
cols

In [None]:
# Per-column non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().T

In [None]:
# Summary statistics for the object-typed columns, one row per column.
df.describe(include = 'object').T

# Null values

In [None]:
# Count of missing values in each column.
df.isna().sum()

In [None]:
# Bar chart of the number of missing values per column.
df.isna().sum().plot.bar()

In [None]:
# Per-column completeness bar chart drawn with missingno.
# Open a figure with an explicit size and resolution
plt.figure(figsize=(14,6),dpi=200)

# Draw the missingno bar chart for every column of df
mno.bar(df)   # Using missingno 

# Title, layout and render
plt.title('Number of rows in each column',fontsize=35)
plt.tight_layout()
plt.show()

In [None]:
# Replacement value for missing entries, per column.
fill_values = {
    'Number of Dead': 0,             # assumes a missing count means no deaths were recorded
    'Region of Origin': 'Unknown',
    'Country of Origin': 'Unknown',
    'Migration route': 'Unknown',
}

# Assign the filled column back to the frame. `df[col].fillna(..., inplace=True)`
# modifies a temporary object: pandas deprecates that pattern, and under
# Copy-on-Write it leaves `df` unchanged.
for column, value in fill_values.items():
    df[column] = df[column].fillna(value)

In [None]:
# Drop every column that still contains at least one missing value, then report the new shape.
# NOTE(review): later cells reference columns by name — confirm none of them are removed here.
df.dropna(axis=1,inplace=True)
print(df.shape)

# Data visualisation


In [None]:
# The 10 records with the largest 'Total Number of Dead and Missing'.
top_10_Value_Total_Number_of_Dead = df.nlargest(10, 'Total Number of Dead and Missing')

# Vertical bars: country of origin on x, total on y, matching the axis titles below.
# (The previous orientation='h' expected the numeric column on x and conflicted with
# those titles; the `labels` mapping referenced a name that is not a column.)
fig1 = px.bar(
    top_10_Value_Total_Number_of_Dead,
    x='Country of Origin',
    y='Total Number of Dead and Missing',
    text='Cause of Death',
    color='Number of Dead',
    color_continuous_scale='Viridis',
)

# Outline the bars and put the cause-of-death text inside them.
fig1.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5,
                   opacity=0.8, textposition='inside')

# Human-readable title and axis labels.
fig1.update_layout(title_text='Top 10 Incidents by Total Number of Dead and Missing, by Country of Origin',
                   yaxis_title='Total Number of Dead and Missing',
                   xaxis_title='Country of Origin', height=1200)

fig1.show()

In [None]:
# Column totals for females, males and children across all records.
gender_counts = df[['Number of Females', 'Number of Males', 'Number of Children']].sum()

# The title previously read 'Monthly Trends of Total Deaths and Missing' (copied from the
# monthly chart); it now describes what is plotted. 'Group' is used instead of 'Gender'
# because children are one of the three categories.
fig_gender = px.bar(
    gender_counts,
    x=gender_counts.index,
    y=gender_counts.values,
    labels={'x': 'Group', 'y': 'Count'},
    title='Total Number of Females, Males and Children',
)
fig_gender.show()

In [None]:
# Sum of 'Total Number of Dead and Missing' for each reported month.
# NOTE(review): groupby sorts its keys, so string month names would appear
# alphabetically rather than in calendar order — confirm the column's values.
monthly_trends = df.groupby('Reported Month', as_index=False)['Total Number of Dead and Missing'].sum()

axis_labels = {'Reported Month': 'Month', 'Total Number of Dead and Missing': 'Total Count'}
fig_monthly_trends = px.line(
    monthly_trends,
    x='Reported Month',
    y='Total Number of Dead and Missing',
    labels=axis_labels,
    title='Monthly Trends of Total Deaths and Missing',
)
fig_monthly_trends.show()

In [None]:
# Mean 'Number of Dead' per record for each country of origin, highest first.
# (Names previously referred to prices/brands, left over from another notebook.)
avg_dead_by_country = (
    df.groupby('Country of Origin')['Number of Dead']
    .mean()
    .reset_index()
    .sort_values(by='Number of Dead', ascending=False)
)
top_5_countries = avg_dead_by_country.head(5)

fig = go.Figure()
fig.add_trace(go.Bar(x=top_5_countries['Country of Origin'], y=top_5_countries['Number of Dead']))

# The plotted value is a mean, so title and y-axis say "Average" rather than "Total".
fig.update_layout(title='Top 5 Countries of Origin by Average Number of Dead',
                  xaxis_title='Country of Origin',
                  yaxis_title='Average Number of Dead',
                  plot_bgcolor='black',
                  paper_bgcolor='#595964',
                  font=dict(color='white'))

pyo.init_notebook_mode(connected=True)
pyo.iplot(fig)

In [None]:
# Preview the frame again at this point in the notebook.
df.head()

In [None]:
# Frequency of each 'Total Number of Dead and Missing' value among records whose country of origin is Mexico.
df.loc[df['Country of Origin'] == 'Mexico', "Total Number of Dead and Missing"].value_counts().plot.bar()

In [None]:
# Frequency of each 'Total Number of Dead and Missing' value among records whose country of origin is Guatemala.
df.loc[df['Country of Origin'] == 'Guatemala', "Total Number of Dead and Missing"].value_counts().plot.bar()

In [None]:
# Frequency of each 'Total Number of Dead and Missing' value among records whose country of origin is Sudan.
df.loc[df['Country of Origin'] == 'Sudan', "Total Number of Dead and Missing"].value_counts().plot.bar()

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds object
# columns here, and pandas >= 2.0 raises on them unless numeric_only=True is passed.
df.corr(numeric_only=True)

In [None]:
# One single-level sunburst chart per column, showing how the records are
# distributed over that column's values. `fig2` ends up bound to the last chart.
for column in ['Incident Type', 'Number of Dead', 'Region of Incident', 'Reported Month']:
    fig2 = px.sunburst(df, path=[column], color_discrete_sequence=px.colors.qualitative.Set3)
    fig2.update_layout(title_text=f'Distribution of {column}', height=500)
    fig2.show()


In [None]:
# Correlation matrix of the numeric columns. numeric_only=True is required because
# the frame still contains object columns, which pandas >= 2.0 refuses to correlate.
Corr_Matrix = df.corr(numeric_only=True)

# Annotated heatmap, colour scale centred on zero correlation.
plt.figure(figsize=(10, 10))
sns.heatmap(Corr_Matrix, annot=True, cmap='coolwarm', center=0)
plt.show()

In [None]:
# Correlation of every numeric column with 'Total Number of Dead and Missing'.
plt.figure(figsize=(10, 7))
# numeric_only=True: the frame still contains object columns, which pandas >= 2.0
# refuses to correlate.
correlation_matrix = df.corr(numeric_only=True)
sns.heatmap(correlation_matrix[['Total Number of Dead and Missing']], annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation with Total Number of Dead and Missing')
plt.show()

In [None]:
# Distribution of 'Incident year' with a kernel density estimate overlaid.
sns.displot(data=df, x="Incident year", kde=True)

In [None]:
# Record counts per 'Incident Type' (KDE overlay enabled); labels rotated for readability.
incident_type_ax = sns.histplot(df['Incident Type'], kde=True)
incident_type_ax.set_title('Distribution of Incident Type')
incident_type_ax.set_xlabel('Incident Type')
incident_type_ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Record counts per 'Reported Month' (KDE overlay enabled); labels rotated for readability.
reported_month_ax = sns.histplot(df['Reported Month'], kde=True)
reported_month_ax.set_title('Distribution of Reported Month')
reported_month_ax.set_xlabel('Reported Month')
reported_month_ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Scatter plot with a fitted regression line: 'Number of Males' against 'Number of Dead'.
sns.lmplot(x="Number of Males", y="Number of Dead", data=df)

In [None]:
# Scatter plot with a fitted regression line: 'Number of Females' against 'Number of Dead'.
sns.lmplot(x="Number of Females", y="Number of Dead", data=df)

In [None]:
# Median 'Total Number of Dead and Missing' per region of incident, highest first.
# The column is selected before aggregating: taking the median of every column
# fails on the object columns in pandas >= 2.0.
filteed_df1 = (
    df.groupby('Region of Incident')[['Total Number of Dead and Missing']]
    .median()
    .sort_values('Total Number of Dead and Missing', ascending=False)
)

# Bar plot of the five highest regions.
fig = px.bar(filteed_df1.head())

# The data is grouped by region of incident, so the title says "Regions" (previously "country").
fig.update_layout(title={'text': 'Top 5 Regions of Incident by Median Total Number of Dead and Missing', 'x': 0.5})

fig.show()

In [None]:
# Bar chart of 'Total Number of Dead and Missing' against 'Reported Month', one bar segment per record.
fig = px.bar(df, x='Reported Month', y='Total Number of Dead and Missing')

# Centre the title above the plot.
fig.update_layout(title_text='Number of Dead/Missing in Reported Month', title_x=0.5)

fig.show()

In [None]:
# Sum 'Total Number of Dead and Missing' per incident year before plotting.
# Drawing the raw rows joined every record in file order, producing a zig-zag
# rather than a trend; groupby also returns the years in ascending order.
yearly_totals = df.groupby('Incident year')['Total Number of Dead and Missing'].sum()

plt.plot(yearly_totals.index, yearly_totals.values, 'o--b', lw=3, ms=10)

plt.xlabel('Year', fontsize=16)
# The y-axis label now names the plotted column (previously 'Number of Dead').
plt.ylabel('Number of Dead and Missing', fontsize=16)
plt.title('Total Number of Dead and Missing', fontsize=20)

plt.show();


In [None]:
# Keep only the numeric columns. 'number' selects every numeric dtype rather than
# just float64/int64. (A preceding expression listing the distinct dtypes was
# removed: it was not the cell's last expression, so its value was never shown.)
df_num = df.select_dtypes(include='number')
df_num.head()

In [None]:
# One 50-bin histogram per column of df_num; the trailing semicolon suppresses the returned axes repr.
df_num.hist(figsize=(16, 20), bins=50, xlabelsize=8, ylabelsize=8);

# Finding duplicates

In [None]:
# Flag rows that exactly repeat an earlier row (the first occurrence is not flagged).
is_repeat = df.duplicated(keep='first')
duplicate_rows = df.loc[is_repeat]

# How many repeats were found.
num_duplicates = len(duplicate_rows)

# Report the count and show the repeated rows.
print(f"Number of duplicate rows: {num_duplicates}")
duplicate_rows

In [None]:
# Remove duplicate rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep='first', inplace=True)

# Categorical

In [None]:
# Object-typed (categorical) columns to integer-encode. The list was previously
# named `num_cols` and commented as "numerical", which is the opposite of what it holds.
categorical_cols = df.select_dtypes(include='object').columns.to_list()

# One fitted encoder per column, so each integer code can be mapped back to its
# original label later via label_encoders[col].inverse_transform(...).
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    df[col] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder

df.head()

# Reduce Unnecessary Columns

In [None]:
# Correlation matrix of the frame as it stands in this cell (after the encoding step).
Corr_Matrix = df.corr()

# Annotated heatmap on an explicit 15x15 figure, colour scale centred on zero.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(15, 15))
sns.heatmap(Corr_Matrix, annot=True, cmap='coolwarm', center=0, ax=heatmap_ax)
plt.show()

In [None]:
print('Top 5 Most Positively Correlated to the Total Number of Dead and Missing')
# Drop the target's own entry first: its self-correlation of 1.0 would otherwise
# always take the first of the five slots.
(
    Corr_Matrix['Total Number of Dead and Missing']
    .drop('Total Number of Dead and Missing')
    .sort_values(ascending=False)
    .head(5)
)

In [None]:
# The five smallest correlation coefficients with the target column.
print('Top 5 Most Negatively Correlated to Total Number of Dead and Missing')
Corr_Matrix['Total Number of Dead and Missing'].sort_values(ascending=True).head(5)

In [None]:
# Columns whose absolute correlation with the target is below 0.3.
target_corr = Corr_Matrix.loc['Total Number of Dead and Missing']
columns_to_drop = target_corr[target_corr.abs() < 0.3].index.tolist()
columns_to_drop

In [None]:
# Remove the weakly correlated columns and show the resulting shape.
df = df.drop(columns=columns_to_drop)
df.shape

# model

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Splitting the dataset

In [None]:
# Separate the target column from the remaining feature columns.
target_column = 'Total Number of Dead and Missing'
y = df[target_column]
X = df.drop(columns=[target_column])

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Report the shape of each resulting piece.
for label, part in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(label, part.shape)

# Model Building and Analysis

In [None]:
# Candidate regressors, fitted and compared on the held-out test split.
models = {
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}
best_model = None
# Start below any attainable score: R2 can be negative, and starting at 0 would
# leave best_model as None if every model scored below zero.
best_r2 = float('-inf')

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Test-set metrics (each computed once).
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Actual vs. predicted values side by side; reset_index exposes the original row index as a column.
    submit = pd.DataFrame({
        'Actual Number of Dead': y_test,
        'Predict_Number of Dead': y_pred,
    }).reset_index()

    # Track the class name of the best-scoring model so far.
    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print(submit.head(5))

    print('----------------------------------------')
# The reported figure is R2, not accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")

# feature_importances

In [None]:
# Choose the fitted estimator with the highest test-set R2 explicitly. Using the
# bare `model` name would silently pick whichever estimator the training loop
# happened to fit last, regardless of which one performed best.
importance_model_name = max(models, key=lambda name: r2_score(y_test, models[name].predict(X_test)))
importance_model = models[importance_model_name]

importances = importance_model.feature_importances_

feature_names = X.columns

feature_importance_dict = dict(zip(feature_names, importances))

# (feature, importance) pairs, most important first.
sorted_feature_importance = sorted(feature_importance_dict.items(), key=lambda x: x[1], reverse=True)

for feature, importance in sorted_feature_importance:
    print(f"{feature}: {importance:.2f}")

plt.figure(figsize=(12, 7))
# zip(*pairs) splits the pairs into a tuple of names and a tuple of importances for barh.
plt.barh(*zip(*sorted_feature_importance), alpha=0.9, color='teal')
plt.title(f'Feature Importance ({importance_model_name})', fontsize=15)
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.show()


In [None]:
# Predict with the estimator that has the highest test-set R2, chosen explicitly
# rather than through the `model` name left over from the training loop (which is
# always the last estimator fitted).
residual_model_name = max(models, key=lambda name: r2_score(y_test, models[name].predict(X_test)))
y_pred = models[residual_model_name].predict(X_test)

# Residuals: actual minus predicted.
residuals = y_test - y_pred

# Residuals against predictions, with a reference line at zero error.
plt.figure(figsize=(10, 6))
sns.scatterplot(x=y_pred, y=residuals)
plt.axhline(y=0, color='r', linestyle='--')
plt.xlabel('Predicted Values')
plt.ylabel('Residuals')
plt.title('Residual Plot')
plt.show()