In [None]:
Using Linear Regression - ( Best solution compared to other models )

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression  
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import string

# Define a function to remove punctuation from text
def remove_punctuation(text):
    if '%' in str(text):
        # Remove '%' and convert to float
        return float(str(text).replace('%', ''))
    else:
        exclude = set(string.punctuation) - set('.')
        return "".join([char for char in str(text) if char not in exclude])

# Load the data into a DataFrame
data = pd.read_csv('world-data-2023.csv')

# Data cleaning
# Replace missing values (if any) with appropriate values or strategies
data.fillna(0, inplace=True)  # For demonstration purposes, fill missing values with 0

# Convert numeric columns to the appropriate data types
numeric_columns = [
    'Density',
    'Agricultural Land( %)',
    'Armed Forces size',
    'Birth Rate',
    'Co2-Emissions',
    'CPI',
    'Fertility Rate',
    'Forested Area (%)',
    'Gasoline Price',
    'GDP',
    'Gross primary education enrollment (%)',
    'Gross tertiary education enrollment (%)',
    'Infant mortality',
    'Life expectancy',
    'Maternal mortality ratio',
    'Out of pocket health expenditure',
    'Physicians per thousand',
    'Population',
    'Population: Labor force participation (%)',
    'Tax revenue (%)',
    'Total tax rate',
    'Unemployment rate',
    'Land Area(Km2)',
    'Urban_population'
]

for column in numeric_columns:
    # Exclude the target column
    data[column] = data[column].apply(lambda x: remove_punctuation(x))
    data[column] = pd.to_numeric(data[column], errors='coerce')  # Convert to numeric

# Encode categorical variables (if any)
# You can use label encoding for categorical variables
label_encoder = LabelEncoder()
data['Currency-Code'] = data['Currency-Code'].astype(str)
data['Currency-Code'] = label_encoder.fit_transform(data['Currency-Code'])

# Set the target variable (Urban_population)
target = 'Urban_population'

data.dropna(subset=[target], inplace=True)

# Define your target variable and features
y = data[target]
X = data[numeric_columns]  # Use only numeric columns as features

# Replace missing values with the mean (you can choose another strategy if needed)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a regression model (Random Forest Regressor)
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Predict urbanization for all countries
all_countries = data[numeric_columns]  # Use all countries in the dataset for prediction
all_predictions = model.predict(all_countries)

# Create a DataFrame with the predicted urbanization percentages
predictions_df = pd.DataFrame({
    'Country': data['Country'],
    'Predicted_Urbanization': all_predictions
})

# Rank and select the top 20 urbanized countries
top_20_urbanized = predictions_df.sort_values(by='Predicted_Urbanization', ascending=False).head(20)

# Print the top 20 urbanized countries
print(top_20_urbanized)

Mean Absolute Error: 2.6604666688316226e-08




            Country  Predicted_Urbanization
36            China            8.429340e+08
77            India            4.710315e+08
185   United States            2.706630e+08
23           Brazil            1.832416e+08
78        Indonesia            1.515097e+08
85            Japan            1.157824e+08
143          Russia            1.076839e+08
126         Nigeria            1.028069e+08
110          Mexico            1.026269e+08
131        Pakistan            7.992776e+07
64          Germany            6.432484e+07
178          Turkey            6.309782e+07
79             Iran            6.250962e+07
13       Bangladesh            6.098742e+07
184  United Kingdom            5.590832e+07
60           France            5.412336e+07
138     Philippines            5.097590e+07
51            Egypt            4.289582e+07
83            Italy            4.265197e+07
161     South Korea            4.210672e+07


In [None]:
Using HistGradientBoostingRegressor

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
import string
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor

# Define a function to remove punctuation from text
def remove_punctuation(text):
    if '%' in str(text):
        # Remove '%' and convert to float
        return float(str(text).replace('%', ''))
    else:
        exclude = set(string.punctuation) - set('.')
        return "".join([char for char in str(text) if char not in exclude])

# Load the data into a DataFrame
data = pd.read_csv('world-data-2023.csv')

# Data cleaning
# Convert numeric columns to the appropriate data types
numeric_columns = [
    'Density',
    'Agricultural Land( %)',
    'Armed Forces size',
    'Birth Rate',
    'Co2-Emissions',
    'CPI',
    'Fertility Rate',
    'Forested Area (%)',
    'Gasoline Price',
    'GDP',
    'Gross primary education enrollment (%)',
    'Gross tertiary education enrollment (%)',
    'Infant mortality',
    'Life expectancy',
    'Maternal mortality ratio',
    'Out of pocket health expenditure',
    'Physicians per thousand',
    'Population',
    'Population: Labor force participation (%)',
    'Tax revenue (%)',
    'Total tax rate',
    'Unemployment rate',
    'Land Area(Km2)',
    'Urban_population'
]

for column in numeric_columns:
    # Exclude the target column
    data[column] = data[column].apply(lambda x: remove_punctuation(x))
    data[column] = pd.to_numeric(data[column], errors='coerce')  # Convert to numeric

# Encode categorical variables (if any)
# You can use label encoding for categorical variables
label_encoder = LabelEncoder()
data['Currency-Code'] = data['Currency-Code'].astype(str)
data['Currency-Code'] = label_encoder.fit_transform(data['Currency-Code'])

# Set the target variable (Urban_population)
target = 'Urban_population'

# Remove rows with missing values in the target column
data.dropna(subset=[target], inplace=True)

# Define your target variable and features
y = data[target]
X = data[numeric_columns]  # Use only numeric columns as features

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a regression model (Random Forest Regressor)
model = HistGradientBoostingRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Predict urbanization for all countries
all_countries = data[numeric_columns]  # Use all countries in the dataset for prediction
all_predictions = model.predict(all_countries)

# Create a DataFrame with the predicted urbanization percentages
predictions_df = pd.DataFrame({
    'Country': data['Country'],
    'Predicted_Urbanization': all_predictions
})

# Rank and select the top 20 urbanized countries
top_20_urbanized = predictions_df.sort_values(by='Predicted_Urbanization', ascending=False).head(20)

# Print the top 20 urbanized countries
print(top_20_urbanized)


In [None]:
Using RandomForestRegressor

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
import string

# Define a function to remove punctuation from text
def remove_punctuation(text):
    if '%' in str(text):
        # Remove '%' and convert to float
        return float(str(text).replace('%', ''))
    else:
        exclude = set(string.punctuation) - set('.')
        return "".join([char for char in str(text) if char not in exclude])

# Load the data into a DataFrame
data = pd.read_csv('world-data-2023.csv')

# Data cleaning
# Replace missing values (if any) with appropriate values or strategies
data.fillna(0, inplace=True)  # For demonstration purposes, fill missing values with 0

# Convert numeric columns to the appropriate data types
numeric_columns = [
    'Density',
    'Agricultural Land( %)',
    'Armed Forces size',
    'Birth Rate',
    'Co2-Emissions',
    'CPI',
    'Fertility Rate',
    'Forested Area (%)',
    'Gasoline Price',
    'GDP',
    'Gross primary education enrollment (%)',
    'Gross tertiary education enrollment (%)',
    'Infant mortality',
    'Life expectancy',
    'Maternal mortality ratio',
    'Out of pocket health expenditure',
    'Physicians per thousand',
    'Population',
    'Population: Labor force participation (%)',
    'Tax revenue (%)',
    'Total tax rate',
    'Unemployment rate',
    'Land Area(Km2)',
    'Urban_population'
]

for column in numeric_columns:
    # Exclude the target column
    data[column] = data[column].apply(lambda x: remove_punctuation(x))
    data[column] = pd.to_numeric(data[column], errors='coerce')  # Convert to numeric

# Encode categorical variables (if any)
# You can use label encoding for categorical variables
label_encoder = LabelEncoder()
data['Currency-Code'] = data['Currency-Code'].astype(str)
data['Currency-Code'] = label_encoder.fit_transform(data['Currency-Code'])

# Set the target variable (Urban_population)
target = 'Urban_population'

data.dropna(subset=[target], inplace=True)

# Define your target variable and features
y = data[target]
X = data[numeric_columns]  # Use only numeric columns as features

# Replace missing values with the mean (you can choose another strategy if needed)
imputer = SimpleImputer(strategy='mean')
X = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a regression model (Random Forest Regressor)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mae = mean_absolute_error(y_test, y_pred)
print(f'Mean Absolute Error: {mae}')

# Predict urbanization for all countries
all_countries = data[numeric_columns]  # Use all countries in the dataset for prediction
all_predictions = model.predict(all_countries)

# Create a DataFrame with the predicted urbanization percentages
predictions_df = pd.DataFrame({
    'Country': data['Country'],
    'Predicted_Urbanization': all_predictions
})

# Rank and select the top 20 urbanized countries
top_20_urbanized = predictions_df.sort_values(by='Predicted_Urbanization', ascending=False).head(20)

# Print the top 20 urbanized countries
print(top_20_urbanized)


Mean Absolute Error: 567097.4307692308




            Country  Predicted_Urbanization
36            China            6.351494e+08
77            India            4.477171e+08
185   United States            2.715210e+08
23           Brazil            1.653219e+08
78        Indonesia            1.460094e+08
85            Japan            1.241612e+08
143          Russia            1.211527e+08
126         Nigeria            1.059038e+08
110          Mexico            1.053268e+08
131        Pakistan            8.145463e+07
13       Bangladesh            6.827527e+07
64          Germany            6.513724e+07
79             Iran            6.329396e+07
184  United Kingdom            6.278351e+07
178          Turkey            6.134470e+07
60           France            6.017150e+07
138     Philippines            5.414353e+07
83            Italy            4.525440e+07
161     South Korea            4.509959e+07
51            Egypt            4.248702e+07
