In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style()
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

import warnings
warnings.filterwarnings('ignore')
# Plot styling: seaborn dark grid with the 'colorblind' palette.
sns.set_theme(palette='colorblind', style='darkgrid')
# Lift pandas' display limits so no columns or rows are elided in cell output.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
from sklearn.preprocessing import LabelEncoder 
le = LabelEncoder()

In [None]:
# Load the Global Missing Migrants CSV from the Kaggle input directory.
DATA_PATH = "/kaggle/input/global-missing-migrants-dataset/Global Missing Migrants Dataset.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Column names, non-null counts and dtypes.
df.info()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Split the 'Coordinates' strings on ", " and store the two parts as float
# columns named 'latitude' and 'longitude' (in that order).
coordinate_parts = df['Coordinates'].str.split(', ', expand=True)
df[['latitude', 'longitude']] = coordinate_parts.astype(float)


In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Bar chart of the per-column missing-value counts.
df.isnull().sum().plot.bar()

In [None]:
#df=df.drop(columns='Migration route')

# **Fill and Drop NA**

In [None]:
# Replacement value for missing entries, per column:
#   - a missing death count is treated as 0
#   - missing origin / route labels become the explicit category 'Unknown'
FILL_VALUES = {
    'Number of Dead': 0,
    'Region of Origin': 'Unknown',
    'Country of Origin': 'Unknown',
    'Migration route': 'Unknown',
}

# Fill on the frame itself and rebind the result. Calling
# `df[col].fillna(..., inplace=True)` operates on a column selection and does
# not update `df` when pandas Copy-on-Write is active.
df = df.fillna(value=FILL_VALUES)


In [None]:
# Drop every column that still contains at least one missing value
# (axis='columns' removes whole columns, not rows).
df = df.dropna(axis='columns')

In [None]:
# Re-count missing values per column after filling and dropping.
df.isnull().sum()

In [None]:
# (rows, columns) after the NA handling above.
df.shape

# **VISUALIZATION**

# **Gender distribution**

In [None]:
# Column totals for females, males and children across all rows.
# NOTE(review): these three columns must still exist after the earlier
# `dropna(axis=1)` step, which removes any column containing NaN — confirm.
gender_counts = df[['Number of Females', 'Number of Males', 'Number of Children']].sum()

# The title previously read 'Monthly Trends of Total Deaths and Missing'
# (copied from the monthly chart); it now describes what this bar chart shows.
fig_gender = px.bar(
    gender_counts,
    x=gender_counts.index,
    y=gender_counts.values,
    labels={'x': 'Gender', 'y': 'Count'},
    title='Gender Distribution (Females, Males and Children)',
)
fig_gender.show()

# **Time-based analysis: Monthly trends**

In [None]:
# Calendar order for the x-axis. groupby sorts its keys alphabetically
# (April, August, December, ...), which would scramble a month-by-month line.
MONTH_ORDER = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]

# Total dead and missing per reported month.
monthly_trends = (
    df.groupby('Reported Month')['Total Number of Dead and Missing']
    .sum()
    .reset_index()
)

# Reorder only when every label is a full English month name, so data that
# uses another month format is plotted exactly as before.
if monthly_trends['Reported Month'].isin(MONTH_ORDER).all():
    monthly_trends = monthly_trends.sort_values(
        'Reported Month',
        key=lambda months: months.map(MONTH_ORDER.index),
    ).reset_index(drop=True)

fig_monthly_trends = px.line(
    monthly_trends,
    x='Reported Month',
    y='Total Number of Dead and Missing',
    labels={'Reported Month': 'Month', 'Total Number of Dead and Missing': 'Total Count'},
    title='Monthly Trends of Total Deaths and Missing',
)
fig_monthly_trends.show()

# **HEAT MAP**

In [None]:
# Pairwise correlation of the numeric columns only. The frame still holds
# string columns at this point, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0.
df.select_dtypes(include='number').corr()


In [None]:
# Correlation matrix over numeric columns only; string columns would make
# DataFrame.corr() raise in pandas >= 2.0.
Corr_Matrix = df.select_dtypes(include='number').corr()

# Set up the figure and plot the heatmap
plt.figure(figsize=(10, 10))
sns.heatmap(Corr_Matrix, annot=True, cmap='coolwarm', center=0)
plt.show()


In [None]:
# Correlation of every numeric feature with 'Total Number of Dead and Missing',
# shown as a single-column heatmap.
plt.figure(figsize=(10, 7))
# Numeric columns only: DataFrame.corr() raises on string columns in pandas >= 2.0.
correlation_matrix = df.select_dtypes(include='number').corr()
sns.heatmap(correlation_matrix[['Total Number of Dead and Missing']], annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation with Total Number of Dead and Missing')
plt.show()

In [None]:
# Scatter of incident type against incident year.
sns.scatterplot(
    x="Incident year",
    y="Incident Type",
    data=df,
)

In [None]:
# Number of rows per region of origin.
# 'Region of Origin' is a nominal label column, so no KDE curve is requested:
# a density estimate over category positions has no meaningful interpretation.
sns.histplot(df['Region of Origin'])
plt.title('Distribution of Region of Origin')
plt.xlabel('Region of Origin')
plt.ylabel('Count')
plt.xticks(rotation=90)  # region names are long; rotate so they do not overlap
plt.show()

In [None]:
# Box plot of the target column to show its spread and outliers.
sns.catplot(
    y="Total Number of Dead and Missing",
    kind="box",
    data=df,
)

In [None]:
# Share of rows per reported month.
month_counts = df['Reported Month'].value_counts()

plt.figure(figsize=(6, 6))
# Sample colours from a 20-colour qualitative map rather than cycling a fixed
# list of four, so distinct months do not end up sharing a wedge colour.
month_counts.plot(kind='pie', autopct='%1.1f%%', colormap='tab20')
plt.title('Reported Month percentage')
plt.show()

In [None]:
# Box plot of 'Total Number of Dead and Missing' for each 'Reported Month'.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Reported Month', y='Total Number of Dead and Missing', ax=ax)
ax.set_title('Average Total Number of Dead and Missing Distribution by Reported Month')
plt.show()

In [None]:
# Row with the largest 'Total Number of Dead and Missing' value.
worst_index = df['Total Number of Dead and Missing'].idxmax()
max_value_row = df.loc[worst_index]
max_value_row


In [None]:
# Median 'Total Number of Dead and Missing' per region of incident, top 5.
# The target column is selected *before* aggregating: calling .median() on the
# whole grouped frame also tries to aggregate the string columns, which raises
# in pandas >= 2.0.
top_regions = (
    df.groupby('Region of Incident')['Total Number of Dead and Missing']
    .median()
    .sort_values(ascending=False)
    .head()
)

ax = top_regions.plot(kind='barh', color='teal', grid=True, figsize=(8, 2))
# Labels name what is actually plotted: a median, grouped by region.
ax.set_xlabel('Median Total Number of Dead and Missing')
ax.set_ylabel('Region of Incident')
ax.set_title('Top 5 Regions of Incident by Median Total Number of Dead and Missing')
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder 
# Encoder instance reused by the categorical-encoding cell below.
le = LabelEncoder()

In [None]:
# Column dtypes of the cleaned frame (shows which columns are still non-numeric).
df.dtypes

In [None]:
# Preview the first rows of the current frame.
df.head()

# **Encode categorical columns**

In [None]:
# Columns replaced by integer codes. Each column is listed once; the original
# encoded 'Cause of Death' twice, which was redundant.
CATEGORICAL_COLUMNS = [
    'Incident Type',
    'Reported Month',
    'Region of Origin',
    'Region of Incident',
    'Country of Origin',
    'Cause of Death',
    'Migration route',
]

# `le` is refit for every column, so after the loop it only remembers the
# classes of the last column; codes follow the sorted order of each column's
# unique values.
for column in CATEGORICAL_COLUMNS:
    df[column] = le.fit_transform(df[column])


# **Heatmap after categorical encoding**

In [None]:
# Correlation matrix after label encoding. Restricted to numeric columns
# because 'Location of death' is not encoded above, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0.
Corr_Matrix = df.select_dtypes(include='number').corr()

# Set up the figure and plot the heatmap
plt.figure(figsize=(25, 25))
sns.heatmap(Corr_Matrix, annot=True, cmap='coolwarm', center=0)
plt.show()


In [None]:
# Preview the first rows after label encoding.
df.head()

# **Top 5 Most Positively Correlated**

In [None]:
print('Top 5 Most Positively Correlated to the Total Number of Dead and Missing')
# Drop the target's own entry first: its self-correlation of 1.0 would
# otherwise always take the top slot and leave room for only four features.
target_correlations = Corr_Matrix['Total Number of Dead and Missing'].drop(
    'Total Number of Dead and Missing'
)
target_correlations.sort_values(ascending=False).head(5)

# **Top 5 Most Negatively Correlated**

In [None]:
print('Top 5 Most Negatively Correlated to Total Number of Dead and Missing')
# Sort ascending (the default) and keep the five smallest correlations.
ascending_correlations = Corr_Matrix['Total Number of Dead and Missing'].sort_values()
ascending_correlations.iloc[:5]

# **MODEL**

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# **Train/test split**

In [None]:
TARGET_COLUMN = 'Total Number of Dead and Missing'

# Features: every column except the target and 'Location of death', further
# restricted to numeric dtypes. Any string column still present in `df`
# (for example 'Coordinates', which is never encoded) would make every
# estimator's fit() raise.
# NOTE(review): 'Number of Dead' stays in the features and is presumably a
# component of the target — confirm this leakage is intended.
X = (
    df.drop(columns=[TARGET_COLUMN, "Location of death"])
    .select_dtypes(include='number')
)
y = df[TARGET_COLUMN]

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display the shapes of the resulting datasets
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_data.shape)

In [None]:
# Candidate regressors, all with default hyper-parameters; the tree ensembles
# are seeded for reproducibility.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
}

# Start from -inf rather than 0: R2 can be negative, and with a 0 baseline
# no model would be selected when every candidate scores below zero.
best_model = None
best_r2 = float('-inf')

for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate the model on the held-out split
    r2 = r2_score(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Side-by-side preview of actual vs predicted target values; reset_index
    # keeps the original row label as an 'index' column.
    submit = y_test.to_frame(name='Actual').assign(Predicted=y_pred).reset_index()
    print(submit.head(8))

    if r2 > best_r2:
        best_r2 = r2
        best_model = model.__class__.__name__

    print(f'{model_name}:')
    print(f'R2 Score: {r2:.2f}')
    print(f'Mean Absolute Error (MAE): {mae:.2f}')
    print(f'Root Mean Squared Error (RMSE): {rmse:.2f}')
    print('----------------------------------------')

# The reported metric is R2, not classification accuracy.
print(f"The best performing model is: {best_model} with R2 score: {best_r2:.2f}")