In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import joblib
import re
import warnings
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline


In [2]:
# Load the 2019 report from the working directory and show it as the cell output.
data = pd.read_csv(filepath_or_buffer='2019.csv')
data

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,Finland,7.769,1.340,1.587,0.986,0.596,0.153,0.393
1,2,Denmark,7.600,1.383,1.573,0.996,0.592,0.252,0.410
2,3,Norway,7.554,1.488,1.582,1.028,0.603,0.271,0.341
3,4,Iceland,7.494,1.380,1.624,1.026,0.591,0.354,0.118
4,5,Netherlands,7.488,1.396,1.522,0.999,0.557,0.322,0.298
...,...,...,...,...,...,...,...,...,...
151,152,Rwanda,3.334,0.359,0.711,0.614,0.555,0.217,0.411
152,153,Tanzania,3.231,0.476,0.885,0.499,0.417,0.276,0.147
153,154,Afghanistan,3.203,0.350,0.517,0.361,0.000,0.158,0.025
154,155,Central African Republic,3.083,0.026,0.000,0.105,0.225,0.235,0.035


In [3]:
# Inspect column dtypes and non-null counts before any preprocessing.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156 entries, 0 to 155
Data columns (total 9 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Overall rank                  156 non-null    int64  
 1   Country or region             156 non-null    object 
 2   Score                         156 non-null    float64
 3   GDP per capita                156 non-null    float64
 4   Social support                156 non-null    float64
 5   Healthy life expectancy       156 non-null    float64
 6   Freedom to make life choices  156 non-null    float64
 7   Generosity                    156 non-null    float64
 8   Perceptions of corruption     156 non-null    float64
dtypes: float64(7), int64(1), object(1)
memory usage: 11.1+ KB


In [4]:
# Count the distinct values in the country column.
data['Country or region'].nunique()

156

In [6]:
# Replace the country names with integer codes so the column is numeric.
# NOTE(review): the column has 156 distinct values across 156 rows, so each
# code identifies exactly one row — confirm this feature is worth keeping.
le = LabelEncoder()
le.fit(data['Country or region'])
data['Country or region'] = le.transform(data['Country or region'])

In [7]:
# Peek at the first three rows to check the encoded country column.
data.head(3)

Unnamed: 0,Overall rank,Country or region,Score,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption
0,1,43,7.769,1.34,1.587,0.986,0.596,0.153,0.393
1,2,36,7.6,1.383,1.573,0.996,0.592,0.252,0.41
2,3,105,7.554,1.488,1.582,1.028,0.603,0.271,0.341


In [8]:
# Remove the rank column before building the feature matrix.
# NOTE(review): in the displayed rows the rank follows descending Score, so it
# would presumably leak the target — confirm.
# Non-inplace + errors='ignore' keeps this cell safe to re-run: the original
# in-place drop raised KeyError on a second execution.
data = data.drop(columns=['Overall rank'], errors='ignore')

In [9]:
# Target is the happiness score; every remaining column is a feature.
y = data['Score']
X = data.drop(columns=['Score'])

In [10]:
# Hold out 20% of the rows. The fixed random_state makes the split (and every
# score derived from it) reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [21]:
# Shared preprocessing: fill missing values with the column median, then
# standardise each feature.
preprocessor = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('Scale', StandardScaler()),
    ]
)

In [22]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor

In [None]:
# NOTE(review): this cell redefines `preprocessor`, which an earlier executed
# cell already built. It was never executed, yet on Restart & Run All it would
# override that earlier definition. It is kept identical to the executed
# version (median imputation, step name 'Scale') so a full re-run rebuilds the
# same pipeline that was trained and saved. Consider deleting this cell.
preprocessor = Pipeline([
    ('imputer',SimpleImputer(strategy='median')),
    ('Scale',StandardScaler())
])

In [None]:
# Scratch cell: builds a default DecisionTreeRegressor only to display it;
# the instance is not assigned to any name.
DecisionTreeRegressor()

In [23]:
# Candidate regressors, keyed by a short label. Each value is a pair of
# (estimator, param_grid); grid keys use the 'model__' prefix because the
# estimator is placed in a Pipeline step named 'model'.
# Every estimator is seeded so repeated runs of the grid search produce the
# same scores and pick the same winner.
models = {
    'DT': (
        DecisionTreeRegressor(random_state=42),
        {
            'model__max_depth': [3, 5, None],
        },
    ),
    'RF': (
        RandomForestRegressor(random_state=42),
        {
            'model__max_depth': [5, None],
            'model__n_estimators': [100, 200],
        },
    ),
    'XGB': (
        XGBRegressor(random_state=42),
        {
            'model__max_depth': [3, 5],
            'model__n_estimators': [100, 200],
        },
    ),
}

In [24]:
# Grid-search every candidate and keep the pipeline with the highest
# cross-validated R^2.
# NOTE(review): X_test / y_test are not scored anywhere in the cells visible
# here, so best_score is a cross-validation score, not a held-out test score.
best_model = None
best_name = None
# R^2 can be negative, so start below any possible score; starting at 0 would
# leave best_model as None whenever every candidate scores <= 0.
best_score = float('-inf')

for name, (model, params) in models.items():
    # Preprocessing sits inside the pipeline so it is fitted per CV fold.
    pipe = Pipeline([
        ('prep', preprocessor),
        ('model', model)
    ])

    grid = GridSearchCV(pipe, param_grid=params, cv=5, scoring='r2')
    grid.fit(X_train, y_train)
    if grid.best_score_ > best_score:
        best_score = grid.best_score_
        best_model = grid.best_estimator_
        best_name = name

In [25]:
# Highest cross-validated R^2 found across all grid searches.
best_score

0.7588848836422505

In [26]:
# Display the pipeline (preprocessing + model) selected by the search.
best_model

In [27]:
# Record the feature names, in training order, for use at prediction time.
feature_columns = list(X.columns)

In [28]:
# Persist the selected pipeline to disk.
joblib.dump(value=best_model, filename='World Happiness Report 2019.joblib')

['World Happiness Report 2019.joblib']

In [29]:
# Persist the ordered feature names alongside the model.
joblib.dump(value=feature_columns, filename='features_World Happiness Report 2019.joblib')

['features_World Happiness Report 2019.joblib']

In [None]:
# Streamlit front end for the saved regressor.
# NOTE(review): presumably meant to be saved as a standalone script and
# launched with `streamlit run` rather than executed in the notebook kernel —
# confirm.
import streamlit as st
import pandas as pd
import joblib

# Load the artifacts under the exact file names the training cells wrote.
# (The previous names, 'best_model.joblib' / 'features.joblib', are never
# created by this notebook.)
model = joblib.load('World Happiness Report 2019.joblib')
features = joblib.load('features_World Happiness Report 2019.joblib')
st.title('World Happiness Score prediction')

# One numeric input per training feature, keyed by feature name so the
# resulting frame has the columns the pipeline was fitted on.
user_input = {}
for i in features:
    user_input[i] = st.number_input(i, value=0.0)

if st.button('Predict'):
    # Single-row frame; dict insertion order preserves the feature order.
    input_df = pd.DataFrame([user_input])
    pred = model.predict(input_df)[0]
    # Was `Pred`, an undefined name that raised NameError on every prediction.
    st.success(f'Prediction : {pred}')