## Population figures for countries, regions (e.g. Asia) and the world

Members: Alaica Pineda, Hans Lawrence Del Rosario

Dataset URL: https://datahub.io/core/population

### Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

In [None]:
# Raw CSV served from the datasets/population GitHub repository; read by pd.read_csv below.
dataset_url = 'https://raw.githubusercontent.com/datasets/population/refs/heads/main/data/population.csv'


## Exploratory Data Analysis

In [None]:
# Read the CSV at dataset_url into a DataFrame and preview its first rows.
population = pd.read_csv(dataset_url)
population.head()

In [None]:
# Column dtypes as inferred by read_csv.
population.dtypes

In [None]:
# Summary of the Year column, e.g. the span of years the data covers.
population['Year'].describe()

In [None]:
# Preview the last rows of the frame.
population.tail()

In [None]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
population.shape

In [None]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
population.info()

In [None]:
# Count of missing entries in each column.
missing_values = population.isna().sum()
missing_values

In [None]:
# Number of rows that exactly repeat an earlier row.
population.duplicated().sum()

In [None]:
# Distinct labels found in the 'Country Name' column.
unique_values = population['Country Name'].unique()
unique_values

### Summary Statistics

In [None]:
# Summary statistics of the numeric columns, transposed so each column becomes a row.
population.describe().T

In [None]:
# Summary of the object (string) columns.
population.describe(include="object")

In [None]:
# Convert Year to datetime in place; the plotting and modelling cells further
# down read the converted column straight from `population`.
population['Year'] = pd.to_datetime(population['Year'], format='%Y')

# 'Country Name' holds individual countries alongside regional aggregates and
# a world total, so summing every row per year counts the same people several
# times. Use the 'World' rows for the global series; if that label is absent,
# fall back to summing all rows.
world_rows = population[population['Country Name'] == 'World']
yearly_source = population if world_rows.empty else world_rows
yearly_values = yearly_source.groupby(yearly_source['Year'].dt.year)['Value'].sum()
yearly_values

## Visualization

In [None]:
# 1. Population growth over time (yearly_values is indexed by calendar year).
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(yearly_values.index, yearly_values.values)
ax.set_title('Global Population Growth Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Total Population')
plt.show()

# 2. Top 10 entries by peak population.
# NOTE(review): 'Country Name' also contains regional aggregates, so this
# ranking presumably includes non-country rows -- confirm and filter them out
# if only sovereign countries are wanted.
top_countries = population.groupby('Country Name')['Value'].max().nlargest(10)
ax = top_countries.plot(kind='bar', title='Top 10 Countries by Population')
ax.set_xlabel('Country Name')
ax.set_ylabel('Peak Population')
plt.show()

# 3. Population comparison for selected countries.
# The previous label 'USA' matched no rows; the dataset is expected to name the
# country 'United States' (check against population['Country Name'].unique()).
selected_names = ['India', 'China', 'United States']
selected_countries = population[population['Country Name'].isin(selected_names)]

fig, ax = plt.subplots(figsize=(10, 5))
for country, country_data in selected_countries.groupby('Country Name'):
    ax.plot(country_data['Year'], country_data['Value'], label=country)
ax.legend()
ax.set_title('Population Trends of Selected Countries')
ax.set_xlabel('Year')
ax.set_ylabel('Population')
plt.show()

In [None]:
# 'Country Name' entries to chart one by one (regional aggregates and single countries).
regions = [
    'East Asia & Pacific (excluding high income)',
    'Early-demographic dividend',
    'East Asia & Pacific',
    'Europe & Central Asia (excluding high income)',
    'Europe & Central Asia',
    'Ecuador',
    'Egypt, Arab Rep.',
    'Philippines',
]

# Draw a separate Year-vs-Value scatter figure for every entry in the list.
for region in regions:
    region_rows = population[population['Country Name'] == region]
    ax = sns.scatterplot(data=region_rows, x='Year', y='Value')
    ax.set_title(f'Scatter Plot of {region} vs Population')
    ax.set_xlabel('Year')
    ax.set_ylabel('Population')
    plt.show()


## Model Development

In [None]:
# Feature: the calendar year as a plain integer. `population['Year']` was
# converted to datetime earlier in the notebook; going through to_datetime here
# yields the same integer year whether the column is datetime or still numeric.
X = pd.to_datetime(population['Year'], format='%Y').dt.year.to_frame(name='Year')
# Target: the population count (continuous), so every model below is a regressor.
y = population['Value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# 1. Linear Regression
lr = LinearRegression().fit(X_train, y_train)

# 2. Decision Tree (seeded so a re-run reproduces the same tree)
dt = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# 3. Random Forest (seeded for the same reason)
rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)

# 4. Support Vector Machine
# Uses the support-vector regressor: a LogisticRegression classifier would
# treat every distinct population value as its own class.
svr = SVR().fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model Evaluation

In [None]:
# Drop incomplete rows and order each country's records chronologically so
# that consecutive rows within a group are consecutive observations.
population = population.dropna().sort_values(['Country Name', 'Year'])

# Year-over-year change computed *within* each country. A plain diff() over the
# whole frame would compare a country's first row with the previous country's
# last row.
value_change = population.groupby('Country Name')['Value'].diff()
population = population.assign(Growth=(value_change > 0).astype(int))

# The first record of each country has no previous value to compare with, so
# its label is undefined; keep those rows out of the modelling data.
growth_data = population[value_change.notna()]

# Feature: integer calendar year (works whether Year is datetime or numeric).
X = pd.to_datetime(growth_data['Year'], format='%Y').dt.year.to_frame(name='Year')
y = growth_data['Growth']

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the model; the feature is unscaled, so give the solver
# more iterations than the default to converge.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate evaluation metrics. zero_division=0 reports 0 instead of emitting a
# warning when a metric's denominator is empty (e.g. no positive predictions).
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

precision = precision_score(y_test, y_pred, zero_division=0)
print(f"Model Precision: {precision * 100:.2f}%")

recall = recall_score(y_test, y_pred, zero_division=0)
print(f"Model Recall: {recall * 100:.2f}%")


In [None]:
# Candidate models for the binary Growth label, fitted on the train/test split
# produced by the previous cell. The tree-based models are seeded so the
# reported metrics are reproducible across runs.
models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    # 100 trees is the scikit-learn default; stated explicitly for the reader.
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Linear Regression': LinearRegression()
}

# A dedicated loop variable keeps the fitted logistic `model` from the previous
# cell intact.
for name, estimator in models.items():
    estimator.fit(X_train, y_train)
    y_pred = estimator.predict(X_test)

    # Linear Regression outputs continuous scores; threshold them at 0.5 to get
    # class labels the classification metrics can consume.
    if isinstance(estimator, LinearRegression):
        y_pred_binary = (y_pred > 0.5).astype(int)
    else:
        y_pred_binary = y_pred

    # Calculate evaluation metrics (zero_division=0 avoids warnings when a
    # metric's denominator is empty).
    accuracy = accuracy_score(y_test, y_pred_binary)
    precision = precision_score(y_test, y_pred_binary, zero_division=0)
    recall = recall_score(y_test, y_pred_binary, zero_division=0)

    print(f"{name} - Model Accuracy: {accuracy * 100:.2f}%")
    print(f"{name} - Model Precision: {precision * 100:.2f}%")
    print(f"{name} - Model Recall: {recall * 100:.2f}%")