In [None]:
! pip install plotly
! pip install Streamlit
! pip install folium
! pip install fuzzywuzzy
! pip install python-Levenshtein
! pip install pycountry-convert
! pip install streamlit-folium
! pip install branca
! pip install joblib


In [1]:
# import pandas for structuring the data
import pandas as pd

# import numpy for numerical analysis
import numpy as np

# import libs for diagrams inline with the text
import matplotlib.pyplot as plt
import os
import seaborn as sns

# other utilities
from sklearn import datasets, preprocessing, metrics

In [2]:
# for visualisation
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.io as pio
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm

# for diagramming 
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import seaborn as sns

# For serialization and deserialization of data from/to file
import pickle

In [3]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer


In [4]:
import folium

In [5]:
# Load the two source CSV files into data frames.
# The paths are built with os.path.join relative to the working directory (folder
# 'DataSæt'), so the notebook does not depend on one machine's absolute layout.
dataset_path = os.path.join('DataSæt', 'global air pollution dataset.csv')
dataset_path2 = os.path.join('DataSæt', '2017_-_Cities_Community_Wide_Emissions.csv')

# df  <- 'global air pollution dataset.csv'
# df2 <- '2017_-_Cities_Community_Wide_Emissions.csv'
df = pd.read_csv(dataset_path)
df2 = pd.read_csv(dataset_path2)


In [6]:
# (rows, columns) of the frame read from dataset_path
df.shape

(23463, 12)

In [7]:
# (rows, columns) of the frame read from dataset_path2
df2.shape

(229, 31)

In [8]:
# Harmonise country names so that both data sets spell each country the same way.
correction_mapping = {
    "United States of America": "USA",
    "Viet Nam": "Vietnam",
    "Russian Federation": "Russia",
    "United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
    "Bolivia (Plurinational State of)": "Bolivia",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    "Iran (Islamic Republic of)": "Iran",
    "Syrian Arab Republic": "Syria",
    "Republic of Korea": "South Korea",
    "Lao People's Democratic Republic": "Laos",
    # Add other corrections as needed
}

# Strip surrounding whitespace BEFORE applying the mapping. Doing it afterwards means a
# padded value such as "Viet Nam " never matches its key and stays uncorrected.
df['Country'] = df['Country'].str.strip().replace(correction_mapping)
df2['Country'] = df2['Country'].str.strip().replace(correction_mapping)

# Join the two frames on the harmonised country name. Both frames carry a 'City'
# column, which pandas disambiguates with the (explicitly stated) suffixes below.
# NOTE(review): joining on 'Country' alone is a many-to-many join -- every row of `df`
# is paired with every row of `df2` that has the same country, so the city-level
# columns coming from `df2` (emissions, population, GDP, location, ...) do not
# necessarily describe the city named in 'City'. Confirm this is intended.
df_merged = pd.merge(df, df2, on='Country', how='inner', suffixes=('_x', '_y'))

# Keep the city name from `df` as 'City' and discard the city name that came from `df2`.
df_merged = df_merged.rename(columns={'City_x': 'City'}).drop(columns=['City_y'])

# Rearrange the columns. Some of the source column names contain invisible characters;
# the names below must stay exactly as they are.
column_order = ['Country', 'City', 'AQI Value', 'AQI Category', 'CO AQI Value', 'CO AQI Category', 'Ozone AQI Value', 'Ozone AQI Category', 'NO2 AQI Value', 'NO2 AQI Category', 'PM2.5 AQI Value', 'PM2.5 AQI Category', 'Account number', 'Organization', 'Region', 'C40', 'Access', 'Reporting year', 'Accounting year', 'Boundary', 'Protocol', 'Protocol column', 'Gases included', 'Total emissions (metric tonnes CO2e)', 'Total Scope 1 Emissions (metric tonnes CO2e)', 'Total Scope 2 Emissions (metric tonnes CO2e)', 'Comment', 'Increase/Decrease from last year', 'Reason for increase/decrease in emissions', 'Population', 'Population year', 'GDP', 'GDP Currency', 'GDP Year', 'GDP Source', 'Average annual temperature (in Celsius)​', '​Average altitude (m)', '​Land area (in square km)', 'City Location', 'Country Location']
# Reorder the DataFrame columns
df_merged = df_merged[column_order]


In [9]:
# (rows, columns) of the merged frame
df_merged.shape

(288622, 40)

In [10]:
# Number of missing values in each column of the merged frame
df_merged.isna().sum()

Country                                              0
City                                                 1
AQI Value                                            0
AQI Category                                         0
CO AQI Value                                         0
CO AQI Category                                      0
Ozone AQI Value                                      0
Ozone AQI Category                                   0
NO2 AQI Value                                        0
NO2 AQI Category                                     0
PM2.5 AQI Value                                      0
PM2.5 AQI Category                                   0
Account number                                       0
Organization                                         0
Region                                               0
C40                                             237710
Access                                               0
Reporting year                                       0
Accounting

In [11]:
# Columns removed from the merged frame before the analysis, dropped in one call.
columns_to_remove = [
    'Gases included',
    'Protocol column',
    'Comment',
    'Total Scope 1 Emissions (metric tonnes CO2e)',
    'Total Scope 2 Emissions (metric tonnes CO2e)',
    'Account number',
    'Organization',
    'Accounting year',
    'Boundary',
    'Protocol',
    'Increase/Decrease from last year',
    'Reason for increase/decrease in emissions',
    'Population year',
    'GDP Currency',
    'GDP Source',
    'Access',
]
df_merged.drop(columns=columns_to_remove, inplace=True)



In [12]:
# Normalise 'C40' to the strings 'True' / 'False'.
# Missing entries are filled with 'False' first; after that a cell counts as 'True'
# exactly when its text contains the marker "C40".
c40_as_text = df_merged['C40'].fillna('False').astype(str)
contains_marker = c40_as_text.str.contains('C40', regex=False)
df_merged['C40'] = contains_marker.map({True: 'True', False: 'False'})


In [13]:
# Drop every row that still has a missing value in at least one column.
# dropna() returns a new frame; the name df_merged is rebound to it.
df_merged = df_merged.dropna()

In [14]:
# Sanity check: count the missing values left in each column
df_merged.isna().sum()

Country                                     0
City                                        0
AQI Value                                   0
AQI Category                                0
CO AQI Value                                0
CO AQI Category                             0
Ozone AQI Value                             0
Ozone AQI Category                          0
NO2 AQI Value                               0
NO2 AQI Category                            0
PM2.5 AQI Value                             0
PM2.5 AQI Category                          0
Region                                      0
C40                                         0
Reporting year                              0
Total emissions (metric tonnes CO2e)        0
Population                                  0
GDP                                         0
GDP Year                                    0
Average annual temperature (in Celsius)​    0
​Average altitude (m)                       0
​Land area (in square km)         

In [15]:
# Number of non-null values in each column of the merged frame
df_merged.count()

Country                                     195190
City                                        195190
AQI Value                                   195190
AQI Category                                195190
CO AQI Value                                195190
CO AQI Category                             195190
Ozone AQI Value                             195190
Ozone AQI Category                          195190
NO2 AQI Value                               195190
NO2 AQI Category                            195190
PM2.5 AQI Value                             195190
PM2.5 AQI Category                          195190
Region                                      195190
C40                                         195190
Reporting year                              195190
Total emissions (metric tonnes CO2e)        195190
Population                                  195190
GDP                                         195190
GDP Year                                    195190
Average annual temperature (in 

In [16]:
# The two location columns are matched against text of the form "(<lat>, <lon>)".
# The two captured groups (still strings at this point) become separate
# latitude / longitude columns.
coordinate_pattern = r'\(([^,]+), ([^)]+)\)'
city_coordinates = df_merged['City Location'].str.extract(coordinate_pattern)
country_coordinates = df_merged['Country Location'].str.extract(coordinate_pattern)

df_merged[['City Latitude', 'City Longitude']] = city_coordinates
df_merged[['Country Latitude', 'Country Longitude']] = country_coordinates

# Show the first rows to check the new columns
df_merged.head()

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,...,GDP Year,Average annual temperature (in Celsius)​,​Average altitude (m),​Land area (in square km),City Location,Country Location,City Latitude,City Longitude,Country Latitude,Country Longitude
1241,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,...,2012.0,26.0,8.0,692.0,"(-12.97304, -38.502304)","(-14.235004, -51.92528)",-12.97304,-38.502304,-14.235004,-51.92528
1242,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,...,2013.0,23.0,5.0,133.1,"(-22.892857, -43.118381)","(-14.235004, -51.92528)",-22.892857,-43.118381,-14.235004,-51.92528
1243,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,...,2013.0,20.0,3.0,438.0,"(-27.5949884, -48.5481743)","(-14.235004, -51.92528)",-27.5949884,-48.5481743,-14.235004,-51.92528
1244,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,...,2014.0,21.0,900.0,331.0,"(-19.916681, -43.934493)","(-14.235004, -51.92528)",-19.916681,-43.934493,-14.235004,-51.92528
1246,Brazil,Presidente Dutra,41,Good,1,Good,5,Good,1,Good,...,2010.0,23.2,749.0,739.0,"(-16.6868912, -49.2647943)","(-14.235004, -51.92528)",-16.6868912,-49.2647943,-14.235004,-51.92528


In [17]:
# Parse the extracted coordinate strings as numbers; anything that cannot be parsed
# becomes NaN (errors='coerce').
coordinate_columns = ['City Latitude', 'City Longitude', 'Country Latitude', 'Country Longitude']
for coordinate_column in coordinate_columns:
    df_merged[coordinate_column] = pd.to_numeric(df_merged[coordinate_column], errors='coerce')

In [18]:
# Remove the two raw "(lat, lon)" text columns in a single call.
df_merged.drop(columns=['City Location', 'Country Location'], inplace=True)

In [19]:
# Turn the textual flag ('True' / 'False') into real booleans.
c40_flag = df_merged['C40'].map({'True': True, 'False': False})
df_merged['C40'] = c40_flag

# Two complementary 0/1 indicator columns: C40_True is 1 where the flag is True,
# C40_False is 1 where it is False.
df_merged['C40_True'] = c40_flag.astype(int)
df_merged['C40_False'] = 1 - df_merged['C40_True']



In [20]:
# Remove the original 'C40' column from the merged frame (in place).
df_merged.drop(columns=['C40'], inplace=True)

In [21]:
# From here on `df` refers to the merged frame. This rebinds the name that previously
# held the frame read from dataset_path. No copy is made: `df` and `df_merged` are the
# same object, so changes made through one name are visible through the other.
df = df_merged

In [22]:
# Show one randomly chosen row
df.sample(n=1)

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,...,GDP Year,Average annual temperature (in Celsius)​,​Average altitude (m),​Land area (in square km),City Latitude,City Longitude,Country Latitude,Country Longitude,C40_True,C40_False
81588,USA,Defiance,52,Moderate,1,Good,22,Good,7,Good,...,2014.0,11.0,15.0,3692.0,47.6062,-122.3321,37.09024,-95.712891,1,0


In [23]:
# Summary of the frame: index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 195190 entries, 1241 to 288621
Data columns (total 27 columns):
 #   Column                                    Non-Null Count   Dtype  
---  ------                                    --------------   -----  
 0   Country                                   195190 non-null  object 
 1   City                                      195190 non-null  object 
 2   AQI Value                                 195190 non-null  int64  
 3   AQI Category                              195190 non-null  object 
 4   CO AQI Value                              195190 non-null  int64  
 5   CO AQI Category                           195190 non-null  object 
 6   Ozone AQI Value                           195190 non-null  int64  
 7   Ozone AQI Category                        195190 non-null  object 
 8   NO2 AQI Value                             195190 non-null  int64  
 9   NO2 AQI Category                          195190 non-null  object 
 10  PM2.5 AQI Value       

In [24]:
import pycountry_convert as pc

#applying continent to the dataset for future use of folium mapping
def country_to_continent(country_name):
    """Return the continent name for ``country_name``, or ``None`` if it cannot be resolved.

    The lookup goes through three ``pycountry_convert`` steps:
    country name -> ISO alpha-2 code -> continent code -> continent name.

    Parameters
    ----------
    country_name
        Country name as it appears in the 'Country' column.

    Returns
    -------
    str or None
        The continent name, or ``None`` when any lookup step raises an ordinary
        exception (for example an unknown country name).
    """
    try:
        alpha2_code = pc.country_name_to_country_alpha2(country_name)
        continent_code = pc.country_alpha2_to_continent_code(alpha2_code)
        return pc.convert_continent_code_to_continent_name(continent_code)
    except Exception:
        # Best effort: a country that cannot be resolved yields None.
        # `Exception` (rather than a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long `.apply` over the frame can still be interrupted.
        return None

# Add a 'Continent' column. Each distinct country is resolved once and the result is
# mapped back onto the rows, instead of repeating the same lookup for every row.
# Countries that cannot be resolved end up with a missing value.
continent_by_country = {
    country: country_to_continent(country) for country in df['Country'].unique()
}
df['Continent'] = df['Country'].map(continent_by_country)

# One frame per continent
north_american_countries_df = df[df['Continent'] == 'North America']
south_american_countries_df = df[df['Continent'] == 'South America']
asian_countries_df = df[df['Continent'] == 'Asia']
african_countries_df = df[df['Continent'] == 'Africa']
oceania_countries_df = df[df['Continent'] == 'Oceania']
Europe_df = df[df['Continent'] == 'Europe']






In [25]:
# Number of non-null values in each column
df.count()

Country                                     195190
City                                        195190
AQI Value                                   195190
AQI Category                                195190
CO AQI Value                                195190
CO AQI Category                             195190
Ozone AQI Value                             195190
Ozone AQI Category                          195190
NO2 AQI Value                               195190
NO2 AQI Category                            195190
PM2.5 AQI Value                             195190
PM2.5 AQI Category                          195190
Region                                      195190
Reporting year                              195190
Total emissions (metric tonnes CO2e)        195190
Population                                  195190
GDP                                         195190
GDP Year                                    195190
Average annual temperature (in Celsius)​    195190
​Average altitude (m)          

In [26]:
# Show ten randomly chosen rows
df.sample(n=10)

Unnamed: 0,Country,City,AQI Value,AQI Category,CO AQI Value,CO AQI Category,Ozone AQI Value,Ozone AQI Category,NO2 AQI Value,NO2 AQI Category,...,Average annual temperature (in Celsius)​,​Average altitude (m),​Land area (in square km),City Latitude,City Longitude,Country Latitude,Country Longitude,C40_True,C40_False,Continent
22487,Brazil,Angatuba,18,Good,1,Good,8,Good,1,Good,...,21.0,1172.0,5780.0,-15.794229,-47.882166,-14.235004,-51.92528,0,1,South America
16354,Brazil,Palmas,163,Unhealthy,6,Good,4,Good,3,Good,...,18.0,935.0,434.0,-25.431063,-49.264693,-14.235004,-51.92528,1,0,South America
192378,USA,Speedway,56,Moderate,1,Good,39,Good,2,Good,...,32.0,292.0,132.0,34.0007,-81.0348,37.09024,-95.712891,0,1,North America
70523,USA,Brooklyn Center,51,Moderate,1,Good,31,Good,6,Good,...,6.5,2134.0,165.0,35.1992,-111.6311,37.09024,-95.712891,0,1,North America
97924,USA,Colchester,39,Good,1,Good,24,Good,9,Good,...,6.0,2405.0,20.0,39.195,-106.837,37.09024,-95.712891,0,1,North America
49690,USA,Annandale,50,Good,1,Good,50,Good,1,Good,...,13.0,168.0,124.0,39.1031,-84.512,37.09024,-95.712891,0,1,North America
185831,USA,Topsham,46,Good,1,Good,21,Good,6,Good,...,10.4,199.0,201.0,41.4993,-81.6944,37.09024,-95.712891,0,1,North America
273960,United Kingdom,Carluke,31,Good,1,Good,15,Good,8,Good,...,9.7,65.0,140.0,51.481581,-3.17909,55.378051,-3.435973,0,1,Europe
88928,USA,Gautier,51,Moderate,1,Good,42,Good,1,Good,...,15.0,15.0,40.0,37.72493,-122.156077,37.09024,-95.712891,0,1,North America
218024,USA,Fernley,34,Good,1,Good,34,Good,3,Good,...,6.5,2134.0,165.0,35.1992,-111.6311,37.09024,-95.712891,0,1,North America


In [27]:
# Keep a single row per city name: the first occurrence wins, later rows with the same
# 'City' value are discarded.
# NOTE(review): the key is the city name alone, so equally named cities in different
# countries collapse into one row -- confirm whether ['Country', 'City'] was intended.
df = df.drop_duplicates(subset=['City'], keep='first')


In [28]:
# Number of non-null values in each column after removing duplicate cities
df.count()

Country                                     13408
City                                        13408
AQI Value                                   13408
AQI Category                                13408
CO AQI Value                                13408
CO AQI Category                             13408
Ozone AQI Value                             13408
Ozone AQI Category                          13408
NO2 AQI Value                               13408
NO2 AQI Category                            13408
PM2.5 AQI Value                             13408
PM2.5 AQI Category                          13408
Region                                      13408
Reporting year                              13408
Total emissions (metric tonnes CO2e)        13408
Population                                  13408
GDP                                         13408
GDP Year                                    13408
Average annual temperature (in Celsius)​    13408
​Average altitude (m)                       13408


In [29]:
# Save df to a pickle file (relative path, i.e. the current working directory) so it
# can be used by the Streamlit app. Only unpickle files that come from a trusted source.
df.to_pickle("dataframe.pkl")

In [None]:
    import pandas as pd
    import joblib
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import accuracy_score, classification_report
    from sklearn.pipeline import Pipeline
    from sklearn.compose import ColumnTransformer
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.impute import SimpleImputer
    from sklearn.feature_selection import RFECV

    # Random-forest classifier for the C40 flag, with recursive feature elimination
    # (RFECV) and a grid search over the forest's hyper-parameters.
    # `df` is the cleaned frame built earlier in this notebook.

    # Separate the features and the target. Both indicator columns are removed from the
    # features ('C40_True' is the target), together with the identifier columns.
    X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
    y = df['C40_True']

    # Hold out the test set BEFORE anything is fitted. If the search were fitted on all
    # of X and the split made afterwards, the "test" rows would already have been used
    # for training and the reported test metrics would be inflated.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

    # Identifying numeric and categorical features
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object']).columns.tolist()

    # Preprocessing pipelines for both numeric and categorical data
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

    # Feature selection integrated within the classifier pipeline:
    # preprocessing -> RFECV (driven by a random forest) -> final random forest.
    rf = RandomForestClassifier(random_state=42)
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                            ('feature_selection', RFECV(estimator=rf, step=1, cv=5, scoring='accuracy')),
                            ('classifier', RandomForestClassifier(random_state=42))])

    # Hyperparameter tuning setup for the classifier after feature selection
    param_grid = {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [None, 10, 20]
        # Add more parameters here if needed
    }

    # Initialize GridSearchCV with the pipeline
    grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

    # Fit GridSearchCV on the training part only
    grid_search.fit(X_train, y_train)

    # Best hyperparameters and score
    print("Best parameters:", grid_search.best_params_)
    print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

    # Evaluate on the held-out test set, which the search has never seen
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    print(f"Test set accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

    # Save the best model (the pipeline refitted on the training part with the best parameters)
    joblib.dump(best_model, 'best_rf_model.joblib')

In [None]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV

# Faster variant of the C40 classifier: preprocessing + random forest (no feature
# selection step), tuned with a small randomized search.
# `df` is the cleaned frame built earlier in this notebook.

# Separate the features and the target
X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
y = df['C40_True']

# Hold out the test set BEFORE anything is fitted, so the test metrics below are
# computed on rows the model has never seen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Identifying numeric and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

# Preprocessing pipelines for both numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])

# Preprocessing followed directly by the classifier
rf = RandomForestClassifier(random_state=42)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', rf)])

# Hyperparameter search space for the classifier
param_distributions = {
    'classifier__n_estimators': [100, 150],  # Reduced number of options for speed
    'classifier__max_depth': [None, 10],  # Simplified to speed up
    # Simplify other parameters as needed
}

# The search space above has only 2 x 2 = 4 combinations, so 4 iterations already
# cover it completely; asking for more only triggers a scikit-learn warning.
random_search = RandomizedSearchCV(pipeline, param_distributions, n_iter=4, cv=5, scoring='accuracy', n_jobs=-1, random_state=42)

# Fit RandomizedSearchCV on the training part only
random_search.fit(X_train, y_train)

# Best hyperparameters and score
print("Best parameters:", random_search.best_params_)
print("Best cross-validation score: {:.2f}".format(random_search.best_score_))

# Evaluate on the held-out test set
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Test set accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Save the best model (the pipeline refitted on the training part with the best parameters)
joblib.dump(best_model, 'best_rf_model.joblib')


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import joblib

# Random-forest classifier for the C40 flag, tuned with a wider randomized search,
# followed by a report of the fitted forest's feature importances.
# `df` is the cleaned frame built earlier in this notebook.
X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
y = df['C40_True']

numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(exclude=['int64', 'float64']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])

rf = RandomForestClassifier(random_state=42)

pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('classifier', rf)])

param_distributions = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Setting up RandomizedSearchCV (100 of the 3 * 4 * 3 * 3 = 108 combinations are tried)
random_search = RandomizedSearchCV(pipeline, param_distributions=param_distributions, n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit the model
random_search.fit(X, y)

print("Best parameters:", random_search.best_params_)
print("Best score:", random_search.best_score_)

# Get the best model
best_model = random_search.best_estimator_

# Feature importances.
# The forest is trained on the OUTPUT of the preprocessor, where every categorical
# column has been expanded into one column per category. The importances therefore have
# to be labelled with the preprocessor's output names; pairing them with the raw column
# names would mislabel them and silently drop the surplus entries.
# The output names carry the transformer prefix ('num__' / 'cat__').
importances = best_model.named_steps['classifier'].feature_importances_
features = best_model.named_steps['preprocessor'].get_feature_names_out().tolist()
feature_importance_dict = dict(zip(features, importances))
print("Feature importances:", feature_importance_dict)

# Save the best model
joblib.dump(best_model, 'best_rf_model_with_cv_and_regularization.joblib')


In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Random-forest classifier for the C40 flag, built with make_pipeline and tuned with a
# grid search.
# `df` is the cleaned frame built earlier in this notebook. It must NOT be reassigned
# here: a placeholder such as `df = ...` binds `df` to the Ellipsis object, which makes
# the next statement fail and breaks every later cell that uses `df`.

# Separate the features and the target ('C40_True' is the target variable)
X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
y = df['C40_True']

# Split the dataset into training and testing sets BEFORE anything is fitted, so the
# test metrics below are computed on rows the model has never seen.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Identifying numeric and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# Preprocessing pipelines for both numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Create a full pipeline with both preprocessing and the classifier.
# make_pipeline names each step after its lower-cased class name, hence the
# 'randomforestclassifier__' prefix in the parameter grid.
full_pipeline = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))

# Hyperparameter tuning setup
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20],
    # You can add more parameters here
}

# Initialize GridSearchCV
grid_search = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV on the training part only
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Evaluate the best model found by GridSearchCV on the held-out test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Test set accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Save the best model for later use
joblib.dump(best_model, 'best_rf_model.joblib')


In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Random-forest classifier for the C40 flag with recursive feature elimination driven
# by a logistic regression, followed by a feature-importance report.
# `df` is the cleaned frame built earlier in this notebook.

# Separate the features and the target variable
X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
y = df['C40_True']

# Identifying numeric and categorical features
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# Preprocessing pipelines for both numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Feature selection integrated within the classifier pipeline.
# max_iter is raised above the default, matching the cross-validated variant of this
# pipeline later in the notebook, to give the solver room to converge.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', RFECV(estimator=LogisticRegression(max_iter=1000), step=1, cv=5, scoring='f1')),
    ('classifier', RandomForestClassifier(random_state=42))])

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Training the model
pipeline.fit(X_train, y_train)

# Extracting feature names.
# The preprocessor emits the numeric columns first and the one-hot encoded categorical
# columns after them (the order of its `transformers` list).
numeric_feature_names = numeric_features
categorical_feature_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out()
all_feature_names = list(numeric_feature_names) + list(categorical_feature_names)

# The classifier only sees the columns RFECV kept, so the names must be filtered with
# the selector's boolean mask. Without this the importances would be paired with the
# wrong names whenever RFECV removes at least one feature.
selected_mask = pipeline.named_steps['feature_selection'].support_
feature_names = [name for name, kept in zip(all_feature_names, selected_mask) if kept]

# Extracting feature importances
importances = pipeline.named_steps['classifier'].feature_importances_
feature_importances = pd.DataFrame(sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True), columns=['Feature', 'Importance'])

# Display feature importances
print(feature_importances.head())

# Evaluating the model
y_pred = pipeline.predict(X_test)
print(f"Model F1-score: {f1_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Saving the model and feature importances for later use
joblib.dump(pipeline, 'finalized_model.joblib')
joblib.dump(feature_importances, 'feature_importances.joblib')


In [None]:
# NOTE(review): despite their names, these two variables hold the CSV path strings
# (dataset_path / dataset_path2), not DataFrames -- confirm whether pd.read_csv(...)
# was intended here.
test_df = dataset_path
test_df2 = dataset_path2

In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression

# Features are every column of df except the two C40 indicator columns and the
# Country/City/Continent labels; the target is the 'C40_True' column.
X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
y = df['C40_True']

# Identifying numeric and categorical features by dtype
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# Numeric columns: median imputation, then standardisation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Feature selection integrated within the classifier pipeline: the forest only
# ever sees the columns RFECV keeps.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', RFECV(estimator=LogisticRegression(max_iter=1000), step=1, cv=5, scoring='f1')),
    ('classifier', RandomForestClassifier(random_state=42))])

# Performing cross-validation (each fold fits its own clone of the pipeline)
cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='f1_macro')
print(f"Mean F1-score from CV: {cv_scores.mean()} ± {cv_scores.std()}")

# Splitting data for final training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Training the model on the training set only
pipeline.fit(X_train, y_train)

# Names of every column emitted by the fitted preprocessor. The fitted clone
# stored inside the pipeline must be used: `categorical_transformer` itself is
# never fitted because ColumnTransformer works on copies.
fitted_preprocessor = pipeline.named_steps['preprocessor']
if categorical_features:
    fitted_categorical = fitted_preprocessor.named_transformers_['cat']
    try:
        categorical_feature_names = fitted_categorical.get_feature_names_out(categorical_features)
    except AttributeError:  # older scikit-learn releases without get_feature_names_out
        categorical_feature_names = fitted_categorical.named_steps['onehot'].get_feature_names(categorical_features)
else:
    # No categorical columns were selected, so there is no fitted 'cat' transformer to query
    categorical_feature_names = []
all_feature_names = numeric_features + list(categorical_feature_names)

# The classifier's importances only cover the RFECV-selected columns; filter
# the names with the same mask so each importance is paired with its own name.
support_mask = pipeline.named_steps['feature_selection'].support_
feature_names = [name for name, kept in zip(all_feature_names, support_mask) if kept]

# Extracting and displaying feature importances, most important first
importances = pipeline.named_steps['classifier'].feature_importances_
feature_importances = pd.DataFrame(sorted(zip(feature_names, importances), key=lambda x: x[1], reverse=True), columns=['Feature', 'Importance'])
print(feature_importances.head())

# Evaluating the model on the test set
y_pred = pipeline.predict(X_test)
print(f"Model F1-score on the test set: {f1_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Saving the fitted model and feature importances for later use
joblib.dump(pipeline, 'finalized_model.joblib')
joblib.dump(feature_importances, 'feature_importances.joblib')


In [None]:
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# Features are every column of df except the two C40 indicator columns and the
# Country/City/Continent labels; the target is the 'C40_True' column.
X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
y = df['C40_True']

# Split BEFORE tuning so the test rows stay unseen by the grid search.
# Fitting the search on all of X and then scoring on a subset of the same rows
# would report an optimistic, leaked test accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Identifying numeric and categorical features by dtype
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# Numeric columns: median imputation, then standardisation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Full pipeline: preprocessing followed by the classifier. make_pipeline names
# the last step 'randomforestclassifier', which is the prefix used in param_grid.
full_pipeline = make_pipeline(preprocessor, RandomForestClassifier(random_state=42))

# Hyperparameter tuning setup
param_grid = {
    'randomforestclassifier__n_estimators': [100, 200],
    'randomforestclassifier__max_depth': [None, 10, 20],
    # You can add more parameters here
}

# 5-fold grid search scored on accuracy, using all available cores
grid_search = GridSearchCV(full_pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Tune on the training rows only
grid_search.fit(X_train, y_train)
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# Evaluate the best model found by GridSearchCV on the untouched test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(f"Test set accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Save the best model for later use
joblib.dump(best_model, 'best_rf_model.joblib')


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
import pandas as pd
import joblib

# Features are every column of df except the two C40 indicator columns and the
# Country/City/Continent labels; the target is the 'C40_True' column.
X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
y = df['C40_True']

# Split the feature columns by dtype
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# Numeric columns: median imputation, then standardisation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Adjusted RandomForestClassifier parameters to prevent overfitting
rf = RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_depth=10, random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', RFECV(estimator=LogisticRegression(max_iter=1000), step=1, cv=10, scoring='accuracy')),
    ('classifier', rf)])

# Perform cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", scores)
print("Mean Accuracy:", scores.mean())

# cross_val_score fits clones and leaves `pipeline` itself unfitted, so fit it
# here; otherwise the saved file would hold a model that cannot predict.
pipeline.fit(X, y)

# Save the fitted model
joblib.dump(pipeline, 'less_overfitting_model.joblib')


In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
import pandas as pd
import joblib

# Features are every column of df except the two C40 indicator columns and the
# Country/City/Continent labels; the target is the 'C40_True' column.
X = df.drop(['C40_True', 'C40_False', 'Country', 'City', 'Continent'], axis=1)
y = df['C40_True']

# Split the feature columns by dtype
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(exclude=['int64', 'float64']).columns.tolist()

# Numeric columns: median imputation, then standardisation
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the literal 'missing', then one-hot encode
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Adjusted RandomForestClassifier parameters to prevent overfitting
rf = RandomForestClassifier(n_estimators=100, min_samples_split=5, min_samples_leaf=2, max_depth=10, random_state=42)

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', RFECV(estimator=LogisticRegression(max_iter=1000), step=1, cv=10, scoring='accuracy')),
    ('classifier', rf)])

# Perform cross-validation
scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
print("Cross-Validation Accuracy Scores:", scores)
print("Mean Accuracy:", scores.mean())

# cross_val_score only fits clones; fit the pipeline itself so that the file
# written below contains a usable, trained model rather than an unfitted one.
pipeline.fit(X, y)

# Saving the fitted model for later use
joblib.dump(pipeline, 'less_overfitting_model.joblib')


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib

# Target column, then the feature frame: both C40 columns and the
# Country/City/Continent labels are removed (absent columns are skipped).
y = df['C40_True']
X = df.drop(columns=['C40_True', 'C40_False', 'Country', 'City', 'Continent'], errors='ignore')

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Column groups, chosen by dtype
numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X.select_dtypes(include=['object']).columns

# Numeric columns: median imputation followed by standard scaling
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Object columns: constant 'missing' fill followed by one-hot encoding
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each sub-pipeline to its own column group
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Preprocessing plus a 100-tree random forest, trained on the training split
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])
pipeline.fit(X_train, y_train)

# Pair every preprocessed column name with the forest's importance for it
fitted_steps = pipeline.named_steps
importances = fitted_steps['classifier'].feature_importances_
encoded_names = fitted_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out()
feature_names = numeric_features.tolist() + list(encoded_names)
feature_importances = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Persist the fitted pipeline and the ranked importances
joblib.dump(pipeline, 'finalized_model.joblib')
joblib.dump(feature_importances, 'feature_importances.joblib')


print(feature_importances.head())


In [None]:
# Number of non-missing values in each column of df
df.count()

In [None]:
# Correlations are only defined for numeric data, so keep the float/int
# columns and discard every row that still has a missing value.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
df_cleaned = numeric_df.dropna()

# Pairwise correlation between all remaining columns
corr_matrix = df_cleaned.corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [None]:
# Scatter-matrix of every pair of float/int columns in df
numeric_columns = df.select_dtypes(include=['float64', 'int64'])
sns.pairplot(numeric_columns)
plt.show()


In [None]:
# Overall AQI against the PM2.5 sub-index, one green point per row of df
plt.scatter(df['AQI Value'], df['PM2.5 AQI Value'], color='green')
plt.xlabel('AQI Value')
plt.ylabel('PM2.5 AQI Value')
plt.show()

In [None]:
# Density-normalised histogram with a KDE overlay for 'AQI Value'.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(df['AQI Value'], kde=True, stat='density', label='AQI Value')

In [None]:
# Density-normalised histogram with a KDE overlay for 'PM2.5 AQI Value'.
# sns.distplot is deprecated; histplot(kde=True, stat='density') is its replacement.
sns.histplot(df['PM2.5 AQI Value'], kde=True, stat='density', label='PM2.5 AQI Value')

In [None]:
# Mean 'AQI Value' per country, ordered from lowest to highest
country_aqi_means = df.groupby('Country')['AQI Value'].mean().sort_values()

# Tall horizontal bar chart so every country label stays readable
fig, ax = plt.subplots(figsize=(15, 25))
ax.barh(country_aqi_means.index, country_aqi_means.values, color='skyblue')
ax.set_xlabel('Average AQI Value')
ax.set_ylabel('Country')
ax.set_title('Average AQI Value by Country')
fig.tight_layout()  # keep labels inside the figure area

plt.show()

In [None]:
# Mean 'PM2.5 AQI Value' per country, ordered from lowest to highest
country_pm25_means = df.groupby('Country')['PM2.5 AQI Value'].mean().sort_values()

# Tall horizontal bar chart, one bar per country
fig, ax = plt.subplots(figsize=(15, 25))
ax.barh(country_pm25_means.index, country_pm25_means.values, color='skyblue')
ax.set(
    xlabel='Average PM2.5 AQI Value',
    ylabel='Country',
    title='Average PM2.5 AQI Value by Country',
)
fig.tight_layout()  # keep labels inside the figure area
plt.show()


In [None]:
# Predictor and response as column vectors of shape (n_rows, 1)
X = df['AQI Value'].to_numpy().reshape(-1, 1)
y = df['PM2.5 AQI Value'].to_numpy().reshape(-1, 1)

In [None]:
# Scatter of the full regression data set: X on the horizontal axis, y on the vertical
plt.scatter(X, y, color='blue')
plt.xlabel('AQI Value')
plt.ylabel('PM2.5 AQI Value')
plt.show()

In [None]:
# Reserve 15% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=123)

In [None]:
# Print the shape of each subset, in the order: X_train, y_train, X_test, y_test
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

In [None]:
# Unfitted ordinary-least-squares regressor with default settings
myreg = linear_model.LinearRegression()

In [None]:
# Train on the training split; fit() returns the estimator itself, which the
# notebook then displays as the cell output.
myreg.fit(X_train, y_train)

In [None]:
# Slope (a) and intercept (b) learned by the fitted regression
a, b = myreg.coef_, myreg.intercept_

In [None]:
# Display the fitted coefficient array taken from myreg.coef_
a

In [None]:
# Display the fitted intercept taken from myreg.intercept_
b

In [None]:
# Predict the response for the held-out test inputs and display the result
y_predicted = myreg.predict(X_test)
y_predicted

In [None]:
# Display the actual test-set responses for comparison with y_predicted
y_test

In [None]:
# Visualise the Linear Regression: all observations, the fitted line evaluated
# on the training inputs (a*x + b), and the predictions for the test inputs.
plt.title('Linear Regression')
plt.scatter(X, y, color='green', label='Observations')
plt.plot(X_train, a*X_train + b, color='blue', label='Fitted line (train)')
plt.plot(X_test, y_predicted, color='orange', label='Predictions (test)')
# X holds 'AQI Value' and y holds 'PM2.5 AQI Value'; the previous
# 'length'/'age' labels were left over from a different data set.
plt.xlabel('AQI Value')
plt.ylabel('PM2.5 AQI Value')
plt.legend()
plt.show()


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the regression on the held-out rows
y_pred = myreg.predict(X_test)
r_squared = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # root of the mean squared error

print("R^2: ", r_squared)
print("RMSE: ", rmse)

In [None]:
# Interactive scatter of AQI against PM2.5 AQI, one colour per country
fig = px.scatter(
    data_frame=df,
    x='AQI Value',
    y='PM2.5 AQI Value',
    color='Country',
    title='AQI Value vs PM2.5 AQI Value',
)
fig.show()


In [None]:
# Divide the data into 5 clusters using the KMeans algorithm.
# random_state is fixed so the cluster ids are the same on every run; without
# it the labels (and the colours in the plots that use them) change per execution.
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(df[['AQI Value', 'PM2.5 AQI Value']])
# Store each row's cluster id in a new 'cluster' column of df
df['cluster'] = kmeans.predict(df[['AQI Value', 'PM2.5 AQI Value']])
df.sample(10)



In [None]:
# Same AQI vs PM2.5 scatter, now coloured by the 'cluster' column
fig = px.scatter(
    data_frame=df,
    x='AQI Value',
    y='PM2.5 AQI Value',
    color='cluster',
    title='AQI Value vs PM2.5 AQI Value',
)
fig.show()


In [None]:
# Cluster df_filtered (defined outside this cell) into 5 groups on the two
# AQI columns and record each row's label in a 'cluster' column.
cluster_columns = ['AQI Value', 'PM2.5 AQI Value']
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(df_filtered[cluster_columns])
df_filtered['cluster'] = kmeans.labels_

# Cluster centres: one row per cluster, columns in the order of cluster_columns
centroids = kmeans.cluster_centers_
print("Centroids:\n", centroids)

# Points coloured by cluster, with the centres drawn on top in red
plt.figure(figsize=(10, 6))
sns.scatterplot(x='AQI Value', y='PM2.5 AQI Value', hue='cluster', palette='viridis', data=df_filtered)
centroid_x, centroid_y = centroids[:, 0], centroids[:, 1]
plt.scatter(centroid_x, centroid_y, s=100, c='red', label='Centroids')
plt.legend()
plt.show()


In [None]:
# Country and population columns only
population_data = df.loc[:, ['Country', 'Population']]

# Sum of 'Population' over all rows of each country
population_by_country = population_data.groupby('Country', as_index=False)['Population'].sum()

# One-column table indexed by country, the layout the heatmap expects
pivot_population = population_by_country.set_index('Country')

# Single-column annotated heatmap of the totals
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data=pivot_population, cmap='YlGnBu', annot=True, fmt=',.0f', linewidths=.5, ax=ax)
ax.set_title('Population by Country')
ax.set_xlabel('population')
ax.set_ylabel('Country')
plt.show()

In [None]:
# Split 'Country Location' into two float columns: the text is stripped of its
# parentheses and split on ', '; the first part is stored as 'Latitude' and the
# second as 'Longitude'.
df[['Latitude', 'Longitude']] = df['Country Location'].str.strip('()').str.split(', ', expand=True).astype(float)

# Creating a 3D scatter plot: position on the map in x/y, population in z.
# The z axis previously plotted 'Population year', which contradicted the
# 'Population' axis title and the chart title.
scatter_plot = go.Scatter3d(
    x=df['Longitude'],
    y=df['Latitude'],
    z=df['Population'],
    mode='markers',
    marker=dict(
        size=5,
        color='blue',                # single fixed colour for every marker
        opacity=0.8
    )
)

# Setting layout: chart title and axis titles
layout = go.Layout(
    title='3D Population Map',
    scene=dict(
        xaxis=dict(title='Country Longitude'),
        yaxis=dict(title='Country Latitude'),
        zaxis=dict(title='Population')
    )
)

# Combining data and layout into a figure
fig = go.Figure(data=[scatter_plot], layout=layout)

# Show the figure
fig.show()

In [None]:
# Display one randomly chosen row of df
df.sample()

In [None]:
# Rows of df whose Country is exactly 'Russian Federation'
filtered_df = df.loc[df['Country'] == 'Russian Federation']

In [None]:
# Display one randomly chosen row of filtered_df
filtered_df.sample()

In [None]:
# Rows of df whose Country is exactly 'USA'.
# NOTE(review): the variable is named df_europe but the filter selects USA rows — confirm which was intended.
df_europe = df.loc[df['Country'] == 'USA']

In [None]:
# Display 50 randomly chosen rows of df_europe
df_europe.sample(50)

In [None]:
# Reproducible random sample of at most 500 rows, drawn without replacement.
# The size is capped at the number of available rows because asking for more
# rows than exist with replace=False raises a ValueError.
df_sampled = df_europe.sample(n=min(500, len(df_europe)), replace=False, random_state=42)

In [None]:
# Number of non-missing values in each column of the sample
df_sampled.count()

In [None]:
import folium
import json

# Load the GeoJSON data from a local file.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# 'DataSæt' paths used for the CSV files — consider moving the file next to them.
with open(r"C:\Users\chz\Documents\BI Exercise\Datasæt\Eumap.json", 'r', encoding='utf-8') as f:
    geojson_data = json.load(f)

# Centre the map on the mean coordinates of the sampled rows
m = folium.Map(location=[df_sampled['Country Latitude'].mean(), df_sampled['Country Longitude'].mean()], zoom_start=3)

# Add one marker per sampled row. The markers use the same df_sampled frame the
# map is centred on (looping over all of df put a marker on the map for every
# row of the full data set), and rows without coordinates are skipped because
# folium rejects NaN locations.
marker_rows = df_sampled.dropna(subset=['Country Latitude', 'Country Longitude'])
for index, row in marker_rows.iterrows():
    folium.Marker([row['Country Latitude'], row['Country Longitude']], popup=row['Population']).add_to(m)

# Add polygon overlays for countries using the loaded GeoJSON data
folium.GeoJson(data=geojson_data).add_to(m)

# Save the map as a standalone HTML file
m.save('map.html')


In [None]:
# Render the folium map as the cell output
m