In [10]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score

In [5]:
df = pd.read_csv('dataNameEncoded.csv')

In [6]:
df.head(2)

Unnamed: 0,name_0,name_1,name_2,name_3,name_4,name_5,name_6,name_7,health_expenditure,who_region,...,year,population,life_expectancy,health_life_expectancy,Number of new HIV infections,Suicide deaths,Adult obesity%,Tobacco use%,Alcohol consumption,Prevalence of hypertension%
0,0,0,0,0,0,0,0,1,21.83,2,...,2000,0.0,55.0,46.8,0.0,4.1,2.1,20.0,0.0,44.249837
1,0,0,0,0,0,0,0,1,21.83,2,...,2001,0.0,55.49,47.23,0.0,4.9,2.1,20.0,0.0,35.839694


In [14]:
target_indicators = [ 'Adult obesity%',
                     'Tobacco use%', 'Alcohol consumption','Number of new HIV infections', 'Suicide deaths', 'Prevalence of hypertension%']


In [15]:
feature_columns = ['health_expenditure', 'who_region', 'world_bank_income_level',
                   'population growth rate%', 'year', 'population', 'life_expectancy', 
                   'health_life_expectancy'] + list(df.columns[df.columns.str.startswith('name_')])


In [22]:

for indicator in target_indicators:
    indicator_data = df.dropna(subset=[indicator])
    X = indicator_data[feature_columns]
    y = indicator_data[indicator]

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    models = {
        "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
        "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
        "NeuralNetwork": MLPRegressor(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42)
    }
    
    best_score = float('-inf')
    best_model = None
    best_model_name = ""
    
    for model_name, model in models.items():
        model.fit(X_scaled, y)
        
        y_pred = model.predict(X_scaled)
        score = r2_score(y, y_pred)
        
        print(f"{model_name} R^2 score for {indicator}: {score}")
        
        if score > best_score:
            best_score = score
            best_model = model
            best_model_name = model_name

    print(f"Best model for {indicator}: {best_model_name} with R^2 score of {best_score}\n")
    
    with open(f'../models/pklFiles/scaler_{indicator}.pkl', 'wb') as file:
        pickle.dump(scaler, file)
    with open(f'../models//pklFiles/best_model_{indicator}.pkl', 'wb') as file:
        pickle.dump(best_model, file)


RandomForest R^2 score for Adult obesity%: 0.8428389913535146
GradientBoosting R^2 score for Adult obesity%: 0.08267773364101139
NeuralNetwork R^2 score for Adult obesity%: 0.24632928661664122
Best model for Adult obesity%: RandomForest with R^2 score of 0.8428389913535146

RandomForest R^2 score for Tobacco use%: 0.8447471626619104
GradientBoosting R^2 score for Tobacco use%: 0.10535818176160816
NeuralNetwork R^2 score for Tobacco use%: 0.17474696496782516
Best model for Tobacco use%: RandomForest with R^2 score of 0.8447471626619104

RandomForest R^2 score for Alcohol consumption: 0.8431718472325553
GradientBoosting R^2 score for Alcohol consumption: 0.11094228102738679
NeuralNetwork R^2 score for Alcohol consumption: 0.29657409210857155
Best model for Alcohol consumption: RandomForest with R^2 score of 0.8431718472325553

RandomForest R^2 score for Number of new HIV infections: 0.9965105613931504
GradientBoosting R^2 score for Number of new HIV infections: 0.9101183519448056
NeuralN

In [None]:
#### gettting prediction


import pickle
import pandas as pd

# Load encoder
with open('../encoders/country_binary_encoder.pkl', 'rb') as file:
    binary_encoder = pickle.load(file)

# Load scaler and model for a specific indicator (e.g., 'Suicide deaths')
indicator = 'Suicide deaths'
with open(f'../models/scaler_{indicator}.pkl', 'rb') as file:
    scaler = pickle.load(file)
with open(f'../models/model_{indicator}.pkl', 'rb') as file:
    model = pickle.load(file)


# Example input data
input_data = {
    'name': 'Afghanistan',
    'health_expenditure': 22.0,
    'who_region': 2,
    'world_bank_income_level': 1,
    'population growth rate%': 2.5,
    'year': 2025,
    'population': 39000000,
    'life_expectancy': 60.0,
    'health_life_expectancy': 50.0
}

# Convert to DataFrame
input_df = pd.DataFrame([input_data])


# Apply the binary encoder to the 'name' column
input_df_encoded = binary_encoder.transform(input_df)

# Extract feature columns for the model
feature_columns = [
    'health_expenditure', 'who_region', 'world_bank_income_level',
    'population growth rate%', 'year', 'population', 'life_expectancy', 
    'health_life_expectancy'
] + list(input_df_encoded.columns[input_df_encoded.columns.str.startswith('name_')])

# Scale the data
input_scaled = scaler.transform(input_df_encoded[feature_columns])


# Predict the target indicator (e.g., 'Suicide deaths') for the input data
prediction = model.predict(input_scaled)

print(f"Predicted {indicator}: {prediction[0]}")
