In [None]:
!pip install pandas numpy requests beautifulsoup4 matplotlib seaborn scikit-learn

In [None]:
!pip install kaggle

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [None]:
import kagglehub

# Kaggle dataset handle passed to kagglehub.
DATASET_HANDLE = "nelgiriyewithana/countries-of-the-world-2023"

# Download the latest version and keep the returned location in `path`.
path = kagglehub.dataset_download(DATASET_HANDLE)
print("Path to dataset files:", path)

In [None]:
import pandas as pd
import os

# define the path
dspath = "/root/.cache/kagglehub/datasets/nelgiriyewithana/countries-of-the-world-2023/versions/1"

#list file to check the correct csv file name
os.listdir(dspath)

# load the dataset
df = pd.read_csv(os.path.join(dspath,'world-data-2023.csv'))

# display the first 5 rows
df.head()

In [None]:
df.columns

In [None]:
# extract the meaningful features

# country - name of country to identify
# population - helps to compare country size and population
# GDP - Gross Domestic Product (total value of goods/services) - measures economic strength
# Life expectancy - Average lifespan of people in the country - Indicates health & living conditions
# Birth Rate - Number of births per 1,000 people per year - Shows population growth trends
# Infant mortality - Number of infant deaths per 1,000 live births - Indicates healthcare quality
# Unemployment rate - Percentage of people unemployed in the labor force - Measures job availability & economic stability
# Tax revenue (%) - Percentage of GDP collected as tax revenue - Shows government income & economic policies
# Urban_population - Percentage of people living in urban areas - Helps analyze rural vs urban population
# Latitude, Longitude - useful for location based visualizations or for mapping data


columns_to_keep = [
    "Country", "Population", "GDP", "Life expectancy", "Birth Rate", "Infant mortality",
    "Unemployment rate", "Tax revenue (%)", "Urban_population", "Latitude", "Longitude"
]

df = df[columns_to_keep]
df

In [None]:
df.info()

In [None]:
df['Population'] = df["Population"].astype(str)
df["Population"] = df["Population"].str.replace(",", "").str.strip()  # Remove commas and spaces

In [None]:
df.isnull().sum()

In [None]:
df.info()

In [None]:
df["Population"] = pd.to_numeric(df["Population"], errors="coerce")

In [None]:
df

In [None]:
df['GDP'] = df['GDP'].astype(str)
df['GDP'] = df['GDP'].str.replace("$", "").str.strip()
df['GDP'] = df['GDP'].str.replace(",", "").str.strip()
df['GDP'] = pd.to_numeric(df['GDP'], errors = 'coerce')

In [None]:
df.info()

In [None]:
df['Unemployment rate'] = df['Unemployment rate'].astype(str)
df['Unemployment rate'] = df['Unemployment rate'].str.replace("%", "").str.strip()
df['Unemployment rate'] = pd.to_numeric(df['Unemployment rate'], errors = 'coerce')

In [None]:
df['Tax revenue (%)'] = df['Tax revenue (%)'].astype(str)
df['Tax revenue (%)'] = df['Tax revenue (%)'].str.replace("%", "").str.strip()
df['Tax revenue (%)'] = pd.to_numeric(df['Tax revenue (%)'], errors = 'coerce')

In [None]:
df

In [None]:
df['Urban_population'] = df['Urban_population'].astype(str)
df['Urban_population'] = df['Urban_population'].str.replace(",", "").str.strip()
df['Urban_population'] = pd.to_numeric(df['Urban_population'], errors = 'coerce')

In [None]:
df.info()

In [None]:
df.isnull().sum()

In [None]:
df[df.isnull().any(axis=1)]

In [None]:
df['Population'] = df['Population'].fillna(df['Population'].mean())

In [None]:
df.loc[df['Country'] == 'Japan', 'Life expectancy']

In [None]:
# Order countries from lowest to highest life expectancy
df_sorted = df.sort_values(by='Life expectancy', ascending=True)

# One bar per country on a very wide canvas
plt.figure(figsize=(50, 6))
sns.barplot(data=df_sorted, x='Country', y='Life expectancy', palette='viridis')

plt.title('Life Expectancy by Country (Sorted)')
plt.xlabel('Country')
plt.ylabel('Life Expectancy')
plt.xticks(rotation=90)
plt.show()

In [None]:
# Remove the row labelled 150, then renumber the index from 0.
# NOTE(review): 150 is a hard-coded index label; which country it refers to is
# not visible in this cell, and because the index is renumbered afterwards,
# re-running the cell removes a further row.
df = df.drop(index=150).reset_index(drop=True)

In [None]:
df['Unemployment rate'] = df['Unemployment rate'].fillna(df['Unemployment rate'].mean())
df['GDP'] = df['GDP'].fillna(df['GDP'].mode())

In [None]:

df.isnull().sum()

In [None]:
df[df['Life expectancy'].isnull()]

In [None]:
# Hand-entered life expectancy values, keyed by row label.
life_expectancy_fixes = {
    3: 83.5,
    56: 58.9,
    73: 84.16,
    113: 83.5,
    120: 60.6,
    128: 75.7,
    133: 73.44,
    179: 66.9,
    180: 66.9,
}
for row_label, value in life_expectancy_fixes.items():
    df.loc[row_label, 'Life expectancy'] = value

In [None]:
df_sorted = df.sort_values(by='Birth Rate', ascending=True)
plt.figure(figsize=(150, 6))
sns.barplot(x='Country', y='Birth Rate', data=df_sorted, palette='viridis')
plt.show()

In [None]:
df[df['Birth Rate'].isnull()]

In [None]:
# Hand-entered birth rate values, keyed by row label.
birth_rate_fixes = {
    56: 26.1,
    73: 36.29,
    120: 24.7,
    128: 11.1,
    133: 30.9,
    179: 23.3,
    180: 23.3,
}
for row_label, value in birth_rate_fixes.items():
    df.loc[row_label, 'Birth Rate'] = value

In [None]:
plt.figure(figsize = (50, 6))
sns.barplot(x = 'Country', y = 'Infant mortality', data = df)
plt.plot()

In [None]:
df[df['Infant mortality'].isnull()]

In [None]:
df.loc[56, 'Infant mortality'] = 36.7
df.loc[73, 'Infant mortality'] = 517
df.loc[98, 'Infant mortality'] = 3.9
df.loc[120, 'Infant mortality'] = 7.6
df.loc[128, 'Infant mortality'] = 9.22
df.loc[133, 'Infant mortality'] = 12

In [None]:
df[df['Tax revenue (%)'].isnull()]

In [None]:
# Bar chart of tax revenue (%), one bar per country
plt.figure(figsize=(50, 7))
sns.barplot(data=df, x='Country', y='Tax revenue (%)')
plt.show()

In [None]:
df.loc[3, 'Tax revenue (%)'] = 10
df.loc[24, 'Tax revenue (%)'] = 14
df.loc[34, 'Tax revenue (%)'] = 10.7
df.loc[38, 'Tax revenue (%)'] = 30
df.loc[42, 'Tax revenue (%)'] = 40.4
df.loc[47, 'Tax revenue (%)'] = 11
df.loc[50, 'Tax revenue (%)'] = 4.36
df.loc[54, 'Tax revenue (%)'] = 2
df.loc[71, 'Tax revenue (%)'] = 32.2
df.loc[72, 'Tax revenue (%)'] = 5.5
df.loc[73, 'Tax revenue (%)'] = 0
df.loc[97, 'Tax revenue (%)'] = 10
df.loc[98, 'Tax revenue (%)'] = 20
df.loc[108, 'Tax revenue (%)'] = 40
df.loc[113, 'Tax revenue (%)'] = 1
df.loc[115, 'Tax revenue (%)'] = 22.8
df.loc[120, 'Tax revenue (%)'] = 29.2
df.loc[127, 'Tax revenue (%)'] = 32
df.loc[128, 'Tax revenue (%)'] = 15
df.loc[133, 'Tax revenue (%)'] = 20
df.loc[134, 'Tax revenue (%)'] = 15
df.loc[162, 'Tax revenue (%)'] = 24.8
df.loc[179, 'Tax revenue (%)'] = 11.45
df.loc[180, 'Tax revenue (%)'] = 136.9
df.loc[189, 'Tax revenue (%)'] = 34
df.loc[191, 'Tax revenue (%)'] = 15

In [None]:
df[df['Urban_population'].isnull()]

In [None]:
df.loc[73, 'Urban_population'] = 825
df.loc[128, 'Urban_population'] = 1258951
df.loc[133, 'Urban_population'] = 4707958

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
plt.figure(figsize=(10, 5))
sns.boxplot(x = df["GDP"])
plt.show()

In [None]:
plt.figure(figsize=(10, 5))
sns.violinplot(x = df["GDP"])
plt.show()

In [None]:
import numpy as np

# Store log(1 + GDP) as a new column and plot its distribution.
gdp_log = np.log1p(df["GDP"])
df["GDP_log"] = gdp_log

plt.figure(figsize=(10, 5))
sns.violinplot(data=df, x="GDP_log")
plt.show()

In [None]:
# Keep only rows whose GDP lies between the 5th and 95th percentiles
# (inclusive). Rows with a missing GDP fail both comparisons and are dropped.
lower = df["GDP"].quantile(0.05)
upper = df["GDP"].quantile(0.95)

# .copy() gives an independent frame, so the column assignments in later
# cells (GDP_log, log_im) do not raise SettingWithCopyWarning on a filtered
# slice.
df = df[(df["GDP"] >= lower) & (df["GDP"] <= upper)].copy()

In [None]:
# Recompute log(1 + GDP) on the current frame and plot its distribution.
gdp_log = np.log1p(df["GDP"])
df["GDP_log"] = gdp_log

plt.figure(figsize=(10, 5))
sns.violinplot(data=df, x="GDP_log")
plt.show()

In [None]:
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
sns.histplot(df["GDP"], bins = 30, kde = True)
plt.title("original GDP distribution")

plt.subplot(1, 2, 2)
sns.histplot(df["GDP_log"], bins = 30, kde = True)
plt.title("log-transformed GDP distribution")

plt.show()

In [None]:
plt.figure(figsize=(8, 5))
sns.boxplot(x=df["GDP_log"])
plt.title("Boxplot of Log-Transformed GDP")
plt.show()

In [None]:
# Violin plot of infant mortality
plt.figure(figsize= (12, 6))
sns.violinplot(x = df['Infant mortality'])
# plt.show() renders the figure. The previous plt.plot() with no arguments
# only added an empty line series and echoed `[]` as the cell output.
plt.show()

In [None]:
# Store log(1 + infant mortality) as a new column and plot its distribution.
df["log_im"] = np.log1p(df['Infant mortality'])

plt.figure(figsize = (12, 6))
sns.violinplot(x = df['log_im'])
# plt.show() renders the figure. The previous plt.plot() with no arguments
# only added an empty line series and echoed `[]` as the cell output.
plt.show()

In [None]:
# Keep rows whose infant mortality lies within 1.5 * IQR of the quartiles.
infant_mortality = df['Infant mortality']
lower = infant_mortality.quantile(0.25)
upper = infant_mortality.quantile(0.75)
iqr = upper - lower

low_fence = lower - 1.5 * iqr
high_fence = upper + 1.5 * iqr
within_fences = (infant_mortality >= low_fence) & (infant_mortality <= high_fence)
df = df[within_fences]

In [None]:
# Violin plot of log-transformed infant mortality for the current frame
plt.figure(figsize = (12, 6))
sns.violinplot(x = df['log_im'])
# plt.show() renders the figure. The previous plt.plot() with no arguments
# only added an empty line series and echoed `[]` as the cell output.
plt.show()

In [None]:
# Side-by-side histograms: raw infant mortality (left) and log-transformed (right)
plt.figure(figsize=(12, 6))

infant_mortality_panels = [
    ('Infant mortality', "original Infant mortality distribution"),
    ("log_im", "log-transformed infant mortality distribution"),
]
for position, (column, panel_title) in enumerate(infant_mortality_panels, start=1):
    plt.subplot(1, 2, position)
    sns.histplot(df[column], bins = 30, kde = True)
    plt.title(panel_title)

plt.show()

In [None]:
plt.figure(figsize = (16, 8))
sns.histplot(x = np.log1p(df['Population']))
plt.show()

In [None]:
df.columns

In [None]:
import plotly.express as px

# The map is titled "GDP per Capita", but it was coloured by total GDP.
# Derive the per-capita figure so the colour scale matches the title.
# A copy is used so the working frame `df` keeps its existing columns.
# NOTE: Population was mean-imputed earlier for rows where it was missing,
# so the per-capita value for those rows is only approximate.
geo_df = df.copy()
geo_df["GDP per capita"] = geo_df["GDP"] / geo_df["Population"]

fig = px.scatter_geo(geo_df,
                     lat="Latitude", lon="Longitude",  # Marker position
                     size="Population",  # Bubble size based on population
                     color="GDP per capita",  # Color based on GDP per capita
                     hover_name="Country",
                     projection="natural earth",  # Map projection type
                     title="Countries by GDP per Capita")

fig.show()