In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
from scipy import stats
from sklearn.linear_model import LinearRegression

In [None]:
# Load the population dataset into a DataFrame used by every later cell.
# NOTE(review): this is an absolute, machine-specific path; consider a
# configurable data directory so the notebook can run on other machines.
populationData = pd.read_csv("E:/Projects/python/EDA/GlobalMatrices/Population/population.csv")

#### Data Processing and Cleaning

In [None]:
# Check whether our data contains missing values

def findMissing(data=None):
    """Fill gaps in the ``Year`` and ``Population`` columns and return them.

    Previously the filled series were assigned to locals and thrown away, so
    calling this function had no effect at all. The filled series are now
    returned; the input frame itself is never modified.

    Parameters
    ----------
    data : pandas.DataFrame, optional
        Frame with ``Year`` and ``Population`` columns. Defaults to the
        notebook-level ``populationData``.

    Returns
    -------
    tuple of (pandas.Series, pandas.Series)
        ``(years, values)``. When either column contains a missing value,
        ``years`` is forward-filled and ``values`` is backward-filled;
        otherwise both columns are returned as they are.

    Notes
    -----
    A forward fill cannot repair a leading gap and a backward fill cannot
    repair a trailing gap, so such values stay missing.
    NOTE(review): the fills run over the whole column, so a gap next to a
    country boundary is filled from the neighbouring country's row - confirm
    that is acceptable or fill per country instead.
    """
    frame = populationData if data is None else data
    years = frame['Year']
    values = frame['Population']
    if years.isna().any() or values.isna().any():
        # Series.ffill()/bfill() replace the deprecated fillna(method=...).
        years = years.ffill()
        values = values.bfill()
    return years, values





def detectOutliers(threshold=3, data=None):
    """Return the rows whose ``Year`` and ``Population`` are not z-score outliers.

    Previously the filtered series were computed as bare expressions and
    discarded, so the function had no effect. The two columns were also
    filtered independently, which would have left them with different rows.
    A single combined mask is used now so rows stay aligned.

    Parameters
    ----------
    threshold : float, default 3
        A row is treated as an outlier when the absolute z-score of its
        ``Year`` or of its ``Population`` is greater than this value.
    data : pandas.DataFrame, optional
        Frame with ``Year`` and ``Population`` columns. Defaults to the
        notebook-level ``populationData``.

    Returns
    -------
    pandas.DataFrame
        The non-outlier rows of the input; the input is not modified.

    Notes
    -----
    Z-scores are computed over each whole column, not per country.
    Rows with a missing value get a NaN z-score, never exceed the threshold
    and are therefore kept.
    """
    frame = populationData if data is None else data

    def _absZ(column):
        # nan_policy='omit' keeps one missing value from turning every
        # z-score into NaN, which would silently disable the detection.
        numeric = frame[column].to_numpy(dtype=float, na_value=np.nan)
        return np.abs(stats.zscore(numeric, nan_policy='omit'))

    isOutlier = (_absZ('Year') > threshold) | (_absZ('Population') > threshold)
    return frame[~isOutlier]

# Run the missing-value and outlier helpers defined above against
# populationData; their return values are not captured here.
findMissing()
detectOutliers()

# Format 

def formatNumber(n):
    """Abbreviate a number with a K / M / B suffix.

    Parameters
    ----------
    n : int or float
        Value to format.

    Returns
    -------
    str or number
        ``"<x>B"`` for values of at least one billion, ``"<x>M"`` for at least
        one million and ``"<x>K"`` for at least one thousand, where ``<x>`` is
        the scaled value rounded to zero decimals. Values below 1,000
        (including negatives) are returned unchanged, not as a string.

    Notes
    -----
    The hundred-thousands range used to produce a stray ``"tK"`` suffix
    (for example ``"500tK"``); it now shares the plain ``"K"`` branch.
    Because of the zero-decimal rounding, a value just under a boundary can
    render as ``"1000K"`` or ``"1000M"``.
    """
    if n >= 1_000_000_000:  # Billions
        return f"{n / 1_000_000_000:.0f}B"
    if n >= 1_000_000:  # Millions
        return f"{n / 1_000_000:.0f}M"
    if n >= 1_000:  # Thousands, including hundred-thousands
        return f"{n / 1_000:.0f}K"
    # Less than 1,000: handed back as-is
    return n


### Increase in population every 10 years across each country, with a linear representation

In [None]:
# Ensuring Correct Columns

# Abbreviations used in this cell:
#   tPC = Total Population Change
#   fY  = first Year
#   lY  = last Year

# Strip stray whitespace from the column headers so the lookups below match.
populationData.columns = populationData.columns.str.strip()

# Converting and ensuring to make it numeric
populationData['Year'] = pd.to_numeric(populationData['Year'])

# Population per country in the first (1960) and last (2023) year compared
fY = populationData[populationData['Year'] == 1960].set_index('Country Name')['Population']
lY = populationData[populationData['Year'] == 2023].set_index('Country Name')['Population']

# Align both years on country name; a country missing from either year gets NaN.
tPC = pd.DataFrame({'Population-1960': fY, 'Population-2023': lY}).reset_index()

# Total increase
tPC['Total Increase'] = tPC['Population-2023'] - tPC['Population-1960']

# Percentage increase: the ratio is scaled by 100 so that the "%" suffix
# applied below is correct (a tripled population now reads "200.0%", not "2.0%").
tPC['Growth Rate'] = tPC['Total Increase'] / tPC['Population-1960'] * 100

# Formatting Data
tPC['Growth Rate'] = tPC['Growth Rate'].apply(lambda x: f"{x:.1f}%")

print("\nTotal Population Change (1960-2023):")
print(tPC)



### Visualization

In [121]:
# Plotting Trend Line

"""

cD = country Data
sY = selected Years
fY = future Years
pP = predict Population
fD = future Data



"""




def predictPopulation(country):
    """Return sampled history for ``country`` plus a linear forecast for 2024-2030.

    The rows of ``populationData`` for the country are reduced to every tenth
    year from 1960 to 2020 plus 2021, 2022 and 2023. A ``LinearRegression`` of
    population on year is fitted on those rows and used to predict 2024
    through 2030.

    Parameters
    ----------
    country : str
        Value to match against the ``Country Name`` column.

    Returns
    -------
    pandas.DataFrame
        The sampled historical rows (``Predicted`` is False) followed by the
        forecast rows (``Predicted`` is True). Forecast rows only carry
        ``Year``, ``Population``, ``Country Name`` and ``Predicted``; any other
        column of ``populationData`` is NaN for them. When no sampled row has
        both a year and a population, only the historical rows are returned,
        because there is nothing to fit the model on.
    """
    cD = populationData[populationData['Country Name'] == country].sort_values('Year')

    # Selecting the 10 year gap and the last 3 years
    sY = list(range(1960, 2024, 10)) + [2021, 2022, 2023]
    # .copy() so that adding the "Predicted" column writes to an independent
    # frame instead of a filtered view of populationData.
    cD = cD[cD['Year'].isin(sY)].copy()
    cD["Predicted"] = False

    # The regression cannot be fitted on missing values or on zero rows.
    train = cD.dropna(subset=['Year', 'Population'])
    if train.empty:
        return cD

    XTrain = train["Year"].values.reshape(-1, 1)
    yTrain = train["Population"].values
    model = LinearRegression()
    model.fit(XTrain, yTrain)

    # Predicting the population for 2024 - 2030 (the range end is exclusive)
    fY = np.arange(2024, 2031).reshape(-1, 1)
    pP = model.predict(fY)

    # Appending the predicted data to the historical data
    fD = pd.DataFrame({'Year': fY.flatten(), 'Population': pP, "Country Name": country, "Predicted": True})
    finalData = pd.concat([cD, fD])

    return finalData
def plotPopulation():
    """Show an interactive population chart with a country dropdown.

    Builds the history-plus-forecast frame for every country in
    ``populationData`` via ``predictPopulation``, draws the first country with
    historical points in blue and predicted points in red, and adds a dropdown
    that swaps in another country's data and updates the title.

    Side effects: displays the figure and writes it to ``population.json`` in
    the current working directory. Returns nothing.
    """
    countries = populationData['Country Name'].unique()
    totalData = pd.concat([predictPopulation(country) for country in countries])
    initialCountry = countries[0]
    initialData = totalData[totalData["Country Name"] == initialCountry]

    def _tracesFor(country):
        """Return ([historical x, predicted x], [historical y, predicted y])."""
        rows = totalData[totalData["Country Name"] == country]
        predicted = rows["Predicted"].astype(bool)
        parts = (rows[~predicted], rows[predicted])
        return (
            [part["Year"].tolist() for part in parts],
            [part["Population"].tolist() for part in parts],
        )

    # Plotting Line Chart
    # color= is what makes color_discrete_map take effect. It yields one trace
    # per "Predicted" value, in order of appearance: historical rows come first
    # in the frame, so trace 0 is historical and trace 1 is the forecast.
    fig = px.line(
        initialData,
        x='Year',
        y='Population',
        color="Predicted",
        title=f"<b>Population of {initialCountry}</b>",
        markers=True,
        labels={"Population": "<b>Population</b>", "Year": "<b>Year</b>"},
        color_discrete_map={False: "blue", True: "red"},
    )
    fig.update_traces(hoverinfo="x+y", mode="lines+markers")

    # DropDown to Select Country
    dropDownButtons = []
    for c in countries:
        xByTrace, yByTrace = _tracesFor(c)
        dropDownButtons.append({
            "label": c,
            "method": "update",
            "args": [
                # One x/y list per trace, in the same historical/predicted
                # order as the traces; the trace colours are left untouched.
                {"x": xByTrace, "y": yByTrace},
                # "title.text" is the layout attribute path for the title.
                {"title.text": f"<b>Population of {c}</b>"},
            ],
        })

    fig.update_layout(
        updatemenus=[{
            "buttons": dropDownButtons,
            "direction": "down",
            "showactive": True,
            "x": 0.15,
            "xanchor": "left",
            "y": 1.15,
            "yanchor": "top",
        }],
        title_text=f"<b>Population of {initialCountry}</b>",
        title_x=0.5,
        title_xanchor="center"
    )
    fig.show()
    fig.write_json("population.json")



# Build and display the chart; this also writes population.json to the working directory.
plotPopulation()
