In [19]:
import pandas as pd
import matplotlib.pyplot as plt

# Load dataset (replace 'your_dataset.csv' with the actual file path or URL)
dataset_path = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data.csv"
df = pd.read_csv(dataset_path)

df.head()


Unnamed: 0,Year,Country of origin,Country of origin (ISO),Country of asylum,Country of asylum (ISO),Refugees under UNHCR's mandate,Asylum-seekers,IDPs of concern to UNHCR,Other people in need of international protection,Stateless persons,Host Community,Others of concern
0,1951,Unknown,UNK,Australia,AUS,180000,0,0,-,0,-,0
1,1951,Unknown,UNK,Austria,AUT,282000,0,0,-,0,-,0
2,1951,Unknown,UNK,Belgium,BEL,55000,0,0,-,0,-,0
3,1951,Unknown,UNK,Canada,CAN,168511,0,0,-,0,-,0
4,1951,Unknown,UNK,Denmark,DNK,2000,0,0,-,0,-,0


Data Cleaning

In [26]:
import pandas as pd

# Function to replace non-numeric values with zero
def replace_non_numeric(data):
    if pd.api.types.is_numeric_dtype(data):
        return data.fillna(0)
    else:
        return 0

# Load the dataset
def process_csv(input_file, output_file):
    # Read the CSV file
    df = pd.read_csv(input_file)

    # Drop the "Country of origin (ISO)" and "Country of asylum (ISO)" columns
    df.drop(columns=['Country of origin (ISO)', 'Country of asylum (ISO)'], inplace=True)

    # Replace non-numeric values with zero in specified columns
    cols_to_replace_zero = ['Refugees under UNHCR\'s mandate', 'Asylum-seekers', 'IDPs of concern to UNHCR', 
                            'Other people in need of international protection', 'Stateless persons', 
                            'Host Community', 'Others of concern']
    for col in cols_to_replace_zero:
        df[col] = replace_non_numeric(df[col])

    # Move "Refugees under UNHCR's mandate" column to last position
    df = df[[col for col in df if col != "Refugees under UNHCR's mandate"] + ["Refugees under UNHCR's mandate"]]

    # Save the updated dataset to a new CSV file
    df.to_csv(output_file, index=False)
    print("Dataset processed and saved to", output_file)

# Example usage
input_file = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data.csv"  # Change this to the path of your input CSV file
output_file = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data cleaned.csv"  # Change this to the desired output CSV file path
process_csv(input_file, output_file)

df = pd.read_csv(output_file)

df.head()


Dataset processed and saved to G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data cleaned.csv


Unnamed: 0,Year,Country of origin,Country of asylum,Asylum-seekers,IDPs of concern to UNHCR,Other people in need of international protection,Stateless persons,Host Community,Others of concern,Refugees under UNHCR's mandate
0,1951,Unknown,Australia,0,0,0,0,0,0,180000
1,1951,Unknown,Austria,0,0,0,0,0,0,282000
2,1951,Unknown,Belgium,0,0,0,0,0,0,55000
3,1951,Unknown,Canada,0,0,0,0,0,0,168511
4,1951,Unknown,Denmark,0,0,0,0,0,0,2000


AUGMENTATION

In [36]:
import pandas as pd

# Load the dataset
input_file = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data cleaned.csv"  # Change this to the path of your input CSV file
output_file = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data cleaned_Augmented.csv"  # Change this to the desired output CSV file path
df = pd.read_csv(input_file)

# Perform one-hot encoding for "Country of origin" and "Country of asylum" columns
df = pd.get_dummies(df, columns=['Country of origin', 'Country of asylum'], drop_first=True, dummy_na=False)

# Move "Refugees under UNHCR's mandate" column to the rightmost position
column_order = [col for col in df.columns if col != "Refugees under UNHCR's mandate"] + ["Refugees under UNHCR's mandate"]
df = df[column_order]

# Replace True/False values with 1/0
df = df.replace({True: 1, False: 0})

# Save the updated dataset to a new CSV file
df.to_csv(output_file, index=False)
print("Dataset with one-hot encoding and binary encoding saved to", output_file)


Dataset with one-hot encoding and binary encoding saved to G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data cleaned_Augmented.csv


In [37]:
import pandas as pd

# Load the dataset
input_file = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data cleaned_Augmented.csv"
df = pd.read_csv(input_file)

# Get the number of rows and columns
num_rows, num_columns = df.shape

print("Number of rows:", num_rows)
print("Number of columns:", num_columns)


Number of rows: 125689
Number of columns: 413


In [38]:
import pandas as pd

# Load the encoded dataset (replace 'your_dataset.csv' with the actual file path or URL)
input_file = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data cleaned_Augmented.csv"
df_encoded = pd.read_csv(input_file)

# Calculate the correlation matrix
correlation_matrix = df_encoded.corr()

# Set a correlation threshold (adjust as needed)
correlation_threshold = 0.8

# Identify highly correlated variables
highly_correlated_vars = set()
for i in range(len(correlation_matrix.columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > correlation_threshold:
            colname = correlation_matrix.columns[i]
            highly_correlated_vars.add(colname)

# Print the columns to be dropped
print(f"Columns to be dropped due to high correlation: {highly_correlated_vars}")

# Count the number of columns to be dropped
num_columns_dropped = len(highly_correlated_vars)
print(f"Number of columns dropped: {num_columns_dropped}")

# Drop highly correlated variables from the dataset
df_filtered = df_encoded.drop(columns=highly_correlated_vars)

# Save the filtered dataset to a new CSV file
output_file = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_filtered.to_csv(output_file, index=False)
print("Filtered dataset saved to", output_file)



Columns to be dropped due to high correlation: set()
Number of columns dropped: 0
Filtered dataset saved to G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv


In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer  # Import the imputer
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import Callback, TensorBoard
import datetime
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train a Linear Regression model
linear_reg_model = LinearRegression()
linear_reg_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = linear_reg_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")


Mean Squared Error: 4301288995.995337
Root Mean Squared Error (RMSE): 65584.21300888908
R-squared (R2): 0.14162693395981607


Ridge Regression:

In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train a Ridge Regression model
ridge_reg_model = Ridge(alpha=1.0)  # You can adjust the regularization strength (alpha) if needed
ridge_reg_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = ridge_reg_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")


Mean Squared Error: 4301297376.0313425
Root Mean Squared Error (RMSE): 65584.27689645851
R-squared (R2): 0.141625261624568


Lasso Regression:

In [41]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train a Lasso Regression model
lasso_reg_model = Lasso(alpha=1.0)  # You can adjust the regularization strength (alpha) if needed
lasso_reg_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = lasso_reg_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")


Mean Squared Error: 4301447634.230946
Root Mean Squared Error (RMSE): 65585.42242168564
R-squared (R2): 0.14159527582458786


  model = cd_fast.enet_coordinate_descent(


In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train an ElasticNet Regression model
elastic_net_model = ElasticNet(alpha=1.0, l1_ratio=0.5)  # You can adjust alpha and l1_ratio as needed
elastic_net_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = elastic_net_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")


Mean Squared Error: 4465953410.611945
Root Mean Squared Error (RMSE): 66827.78920936967
R-squared (R2): 0.10876620347326504


Decision Tree Regression

In [43]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train a Decision Tree Regression model
decision_tree_model = DecisionTreeRegressor(max_depth=5)  # You can adjust the max_depth parameter as needed
decision_tree_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = decision_tree_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")


Mean Squared Error: 3715896262.471197
Root Mean Squared Error (RMSE): 60958.15173109497
R-squared (R2): 0.2584489740460769


Random Forest Regression:

In [44]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train a Random Forest Regression model
random_forest_model = RandomForestRegressor(n_estimators=100, max_depth=5)  # You can adjust parameters as needed
random_forest_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = random_forest_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")


Mean Squared Error: 3060083468.808533
Root Mean Squared Error (RMSE): 55318.02119389786
R-squared (R2): 0.38932416959064753


Gradient Boosting Regression model

In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train a Gradient Boosting Regression model
gradient_boosting_model = GradientBoostingRegressor(n_estimators=100, max_depth=5)  # You can adjust parameters as needed
gradient_boosting_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = gradient_boosting_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")


Mean Squared Error: 2067040105.3634045
Root Mean Squared Error (RMSE): 45464.7127491575
R-squared (R2): 0.5874977118438813


Support Vector Regression (SVR):

In [47]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train a Support Vector Regression model
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.1)  # You can adjust parameters as needed
svr_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = svr_model.predict(X_test_scaled)



K-Nearest Neighbors (KNN) Regression:

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Train a K-Nearest Neighbors Regression model
knn_model = KNeighborsRegressor(n_neighbors=5)  # You can adjust the number of neighbors (n_neighbors) as needed
knn_model.fit(X_train_scaled, y_train)

# Predictions on the test set
y_pred = knn_model.predict(X_test_scaled)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")


neural network regression

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense

# Load the dataset without outliers and with one-hot encoding
input_file_path_encoded = r"G:\My Drive\Internship project\United Nations Refugee Data.csv\United Nations Refugee Data filtered.csv"
df_encoded = pd.read_csv(input_file_path_encoded)

# Define features (X) and target variable (y)
X = df_encoded.drop(columns=['Refugees under UNHCR\'s mandate'])
y = df_encoded['Refugees under UNHCR\'s mandate']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values using SimpleImputer
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Standardize features using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Build the Neural Network model
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)))
model.add(Dense(64, activation='relu'))
model.add(Dense(1))  # Output layer

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
model.fit(X_train_scaled, y_train, epochs=100, batch_size=32, verbose=0)

# Predictions on the test set
y_pred = model.predict(X_test_scaled).flatten()

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate RMSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Calculate R2 score
r2 = r2_score(y_test, y_pred)
print(f"R-squared (R2): {r2}")
