# Final Model

## Loading Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
pd.options.display.max_rows = 20
pd.options.display.max_columns = 81

In [2]:
file = "../raw_data/final.csv"

prepared_df = pd.read_csv(file, decimal=",")

prepared_df.head()

Unnamed: 0,digital_transformation,employee_engagement,employee_satisfaction,innovation,internationalization,market_competitiveness,people_management,people_structure,recruitment,training_and_development,work_processes,cluster,sector,region,company_size
0,0.1944444444444444,0.4242424242424243,0.4166666666666667,0.0,0.0,0.5833333333333334,0.4019607843137254,0.5555555555555556,0.4722222222222222,0.2592592592592592,0.3888888888888889,4,"manufacturing, processing & construction",western europe,small
1,0.2777777777777778,0.6515151515151515,0.7333333333333334,0.0,0.3333333333333333,0.6388888888888888,0.5588235294117646,0.5555555555555556,0.5000000000000001,0.6296296296296297,0.0555555555555555,3,"manufacturing, processing & construction",western europe,small
2,0.25,0.3030303030303031,0.4166666666666666,0.1875,0.3333333333333333,0.5,0.3431372549019608,0.6666666666666667,0.8333333333333335,0.6481481481481481,0.1666666666666666,3,"manufacturing, processing & construction",western europe,small
3,0.6666666666666666,0.787878787878788,0.9333333333333332,0.125,0.0,0.4166666666666667,0.7058823529411766,0.9222222222222222,0.8333333333333334,0.8888888888888888,0.3333333333333333,0,"manufacturing, processing & construction",western europe,small
4,0.4722222222222222,0.5606060606060607,0.6166666666666667,0.25,0.0,0.7777777777777778,0.5,0.5555555555555556,0.5,0.4444444444444444,0.4444444444444445,3,"manufacturing, processing & construction",western europe,small


In [3]:
prepared_df.shape

(21869, 15)

## Preparing X and y

In [4]:
X = prepared_df.drop(columns=['cluster', 'sector', 'region', 'company_size'])

y = prepared_df['cluster']

In [5]:
X.shape

(21869, 11)

In [6]:
X.dtypes

digital_transformation      object
employee_engagement         object
employee_satisfaction       object
innovation                  object
internationalization        object
market_competitiveness      object
people_management           object
people_structure            object
recruitment                 object
training_and_development    object
work_processes              object
dtype: object

### Converting String to Float

In [7]:
X = X.astype(float)
X.dtypes

digital_transformation      float64
employee_engagement         float64
employee_satisfaction       float64
innovation                  float64
internationalization        float64
market_competitiveness      float64
people_management           float64
people_structure            float64
recruitment                 float64
training_and_development    float64
work_processes              float64
dtype: object

### Train/Test Split

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## Model: Stacking XGBoost, Gradient Boost and Gaussian

### Import necessary for model

In [10]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_validate

In [11]:
# instanciating different models

#Gradient Boosting
gradient_boost = GradientBoostingClassifier(
    n_estimators=100, 
    learning_rate=0.1
)

#XGBoost 
xgbc = XGBClassifier()

#Naive Bayes Gaussian
gaussian = GaussianNB()

In [12]:
#ensemble method with voting classified- stacking
ensemble = VotingClassifier(
    estimators = [("gradient_boost", gradient_boost),("xgbc", xgbc), ("gaussian", gaussian)],
    voting = 'soft', # to use predict_proba of each classifier before voting
    weights = [2,1,2] # to equally weight forest and logreg in the vote
)

In [13]:
ensemble.fit(X_train, y_train)
ensemble.score(X_train,y_train)





0.7414118319519862

In [14]:
ensemble_results = cross_validate(ensemble, X_train, y_train, cv=3)













In [15]:
print("CV mean results: ", ensemble_results['test_score'].mean())

CV mean results:  0.6804227471901948


In [16]:
    X = pd.DataFrame(dict(
        digital_transformation=[float(1)],
        employee_engagement=[float(1)],
        employee_satisfaction=[float(1)],
        innovation=[float(1)],
        internationalization=[float(1)],
        market_competitiveness=[float(1)],
        people_management=[float(1)],
        people_structure=[float(1)],
        recruitment=[float(1)],
        training_and_development=[float(1)],
        work_processes=[float(1)],

    ))

In [17]:
X

Unnamed: 0,digital_transformation,employee_engagement,employee_satisfaction,innovation,internationalization,market_competitiveness,people_management,people_structure,recruitment,training_and_development,work_processes
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [18]:
ensemble

VotingClassifier(estimators=[('gradient_boost',
                              GradientBoostingClassifier(ccp_alpha=0.0,
                                                         criterion='friedman_mse',
                                                         init=None,
                                                         learning_rate=0.1,
                                                         loss='deviance',
                                                         max_depth=3,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                  

In [19]:
ensemble.predict(X)

array([0])

In [20]:
import joblib

In [27]:
# pipeline = get_model_from_gcp()
pipeline = joblib.load('../model_le.joblib')

# make prediction
results = pipeline.predict(X)

# convert response from numpy to python type
pred = float(results[0])

dict(prediction=pred)


{'prediction': 0.0}

In [29]:
pipeline.predict(X_test)

array([2., 2., 2., ..., 0., 3., 3.])

In [30]:
X_test.head()

Unnamed: 0,digital_transformation,employee_engagement,employee_satisfaction,innovation,internationalization,market_competitiveness,people_management,people_structure,recruitment,training_and_development,work_processes
17067,0.5,0.363636,0.75,0.125,0.0,0.638889,0.431373,0.666667,0.652778,0.685185,0.222222
11723,0.5,0.727273,0.733333,0.0,0.666667,0.583333,0.529412,0.622222,0.625,0.611111,0.111111
13764,0.333333,0.560606,0.683333,0.0,0.333333,0.416667,0.470588,0.655556,0.75,0.796296,0.388889
12048,0.083333,0.818182,0.75,0.0,0.0,0.527778,0.392157,0.544444,0.638889,0.481481,0.333333
21040,0.194444,0.606061,0.616667,0.0,0.0,0.5,0.539216,0.788889,0.291667,0.592593,0.444444


In [31]:
X

Unnamed: 0,digital_transformation,employee_engagement,employee_satisfaction,innovation,internationalization,market_competitiveness,people_management,people_structure,recruitment,training_and_development,work_processes
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
