In [3]:
import pandas as pd
import sys
sys.path.append("../src")
from RequestMapper import RequestMapper

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score



In [4]:
# Load the pickled collection of model pipelines; later cells pick entries by position
# via pipelines.values[i].
# NOTE(review): unpickling can execute arbitrary code - only load this file from a trusted source.
# NOTE(review): presumably the RequestMapper import in the first cell is required so the
# pickled objects can be reconstructed - confirm before removing it as "unused".
pipelines = pd.read_pickle("./Pipelines/ModelPipelines.pkl")


## Evaluation of the Age-Regressor 

In [23]:
# Entry 0 of the stored pipelines is evaluated as the age regressor in this section.
pipeline_age = pipelines.values[0]

# Test-split objects stored on the pipeline's transformation attribute. Judging by the
# names: untransformed features, numerically transformed features, text-transformed
# features and the targets - TODO confirm against the transformation class.
# NOTE: these notebook-level names are reassigned in every later evaluation section.
X_test = pipeline_age.transformation.X_test
X_test_numerical_transformed = pipeline_age.transformation.X_test_numerical_transformed
X_test_text_transformed = pipeline_age.transformation.X_test_text_transformed
y_test = pipeline_age.transformation.y_test

In [28]:
# Predictions of the numerical model on the numerically transformed test features.
y_pred_num = pipeline_age.modelling.numerical_model.predict(X_test_numerical_transformed)
# Predictions of the text model on the text-transformed test features.
y_pred_text = pipeline_age.modelling.text_model.predict(X_test_text_transformed)
# Combined prediction, computed from the untransformed X_test. "regression" presumably
# selects the task mode, and the meaning of the None argument is not visible in this
# notebook - TODO confirm both against weighted_prediction's signature.
y_pred_stacking = pipeline_age.modelling.weighted_prediction(X_test, None, "regression")

In [45]:
def create_metric(y_pred_type, y_pred, y_test):
    """Print a regression evaluation report for one set of predictions.

    The report contains the correlation matrix between predictions and true
    targets, the mean absolute error and the R2 score.

    Args:
        y_pred_type: Label used in the report header and as the name of the
            prediction column (e.g. "Numerical", "Text", "stacking").
        y_pred: 1-D array-like of predicted values.
        y_test: 1-D array-like of true target values, same length as y_pred.

    Raises:
        ValueError: If y_pred and y_test differ in length.
    """
    # Pair predictions and targets by position, exactly as the sklearn metrics
    # below do. Without dropping the index, a y_pred Series whose index differs
    # from y_test's would be aligned by label, and the correlation would be
    # computed on mismatched (or all-NaN) pairs.
    predicted = pd.Series(y_pred).reset_index(drop=True)
    actual = pd.Series(y_test).reset_index(drop=True)
    if len(predicted) != len(actual):
        raise ValueError(
            f"y_pred has {len(predicted)} values but y_test has {len(actual)}"
        )

    print(f"Report for {y_pred_type} Dataset \n\n")
    print("Correlation matrix:")
    df_eval = pd.DataFrame({f"{y_pred_type} predictions": predicted, "Actual y": actual})
    print(df_eval.corr())
    print(f"\n\nMean absolute error: {mean_absolute_error(y_test, y_pred)}")
    print(f"R2-Score: {r2_score(y_test, y_pred)}")

In [46]:
# Regression report for the numerical model's age predictions.
create_metric("Numerical", y_pred_num, y_test)

Report for Numerical Dataset 


Correlation matrix:
                       Numerical predictions  Actual y
Numerical predictions               1.000000  0.644813
Actual y                            0.644813  1.000000


Mean absolute error: 4.48505021273046
R2-Score: 0.3944441505103057


In [47]:
# Regression report for the text model's age predictions.
create_metric("Text", y_pred_text, y_test)

Report for Text Dataset 


Correlation matrix:
                  Text predictions  Actual y
Text predictions          1.000000  0.616172
Actual y                  0.616172  1.000000


Mean absolute error: 4.64777559344527
R2-Score: 0.37890069088790757


In [48]:
# Regression report for the combined (weighted_prediction) age predictions.
create_metric("stacking", y_pred_stacking, y_test)

Report for stacking Dataset 


Correlation matrix:
                      stacking predictions  Actual y
stacking predictions              1.000000  0.727285
Actual y                          0.727285  1.000000


Mean absolute error: 4.046404894183239
R2-Score: 0.5184081231923705


## Evaluation of the Gender-Classifier

In [53]:
# Entry 1 of the stored pipelines is evaluated as the gender classifier in this section.
pipeline_gender = pipelines.values[1]

# NOTE: this overwrites the X_test / y_test names set in the age section; cells of the
# age section must not be re-run after this point without re-running its setup cell.
X_test = pipeline_gender.transformation.X_test
X_test_numerical_transformed = pipeline_gender.transformation.X_test_numerical_transformed
X_test_text_transformed = pipeline_gender.transformation.X_test_text_transformed
y_test = pipeline_gender.transformation.y_test

In [57]:
# Per-model predictions on the matching transformed test features (these replace the
# age-section values held under the same names).
y_pred_num = pipeline_gender.modelling.numerical_model.predict(X_test_numerical_transformed)
y_pred_text = pipeline_gender.modelling.text_model.predict(X_test_text_transformed)
# Combined prediction from the untransformed X_test. Unlike the age section, no third
# argument is passed - presumably the method's default covers classification; the
# meaning of None is not visible here. TODO confirm against weighted_prediction.
y_pred_stacking = pipeline_gender.modelling.weighted_prediction(X_test, None)

In [76]:
def create_metric(y_pred_type, y_pred, y_test):
    """Print a classification report: confusion matrix, accuracy and weighted F1.

    NOTE: this redefines ``create_metric``; once this cell has run, the
    regression variant defined earlier in the notebook is no longer reachable
    under this name.

    Args:
        y_pred_type: Label shown in the report header.
        y_pred: Predicted class labels.
        y_test: True class labels, same length as y_pred.
    """
    report_title = f"Report for {y_pred_type} Dataset \n\n"
    print(report_title)

    # sklearn convention: rows are true classes, columns are predicted classes.
    print("Confusion matrix:")
    matrix = confusion_matrix(y_test, y_pred)
    print(matrix)

    accuracy = accuracy_score(y_test, y_pred)
    print(f"\n\nAccuracy: {accuracy}")

    # 'weighted' averages the per-class F1 scores weighted by class support.
    weighted_f1 = f1_score(y_test, y_pred, average='weighted')
    print(f"F1-Score: {weighted_f1}")

In [77]:
 # Classification report for the numerical model's gender predictions.
 # NOTE(review): this cell's code is indented by one stray space; IPython tolerates it,
 # but a plain-Python export of the notebook would raise IndentationError.
 create_metric("Numerical", y_pred_num, y_test)

Report for Numerical Dataset 


Confusion matrix:
[[47019 12481]
 [14954 46036]]


Accuracy: 0.772304755581376
F1-Score: 0.7722666263111982


In [78]:
# Classification report for the text model's gender predictions.
create_metric("Text", y_pred_text, y_test)

Report for Text Dataset 


Confusion matrix:
[[40764 18736]
 [17134 43856]]


Accuracy: 0.7022989459706199
F1-Score: 0.702197305550473


In [79]:
# Classification report for the combined (weighted_prediction) gender predictions.
# NOTE: labelled "Weighted" here, whereas the other sections label the same kind of
# prediction "stacking" / "Stacking".
create_metric("Weighted", y_pred_stacking, y_test)

Report for Weighted Dataset 


Confusion matrix:
[[48385 11115]
 [12611 48379]]


Accuracy: 0.8030873931446593
F1-Score: 0.8030872713987556


## Evaluation of the Sign-Classifier 

In [81]:
# Entry 2 of the stored pipelines is evaluated as the sign classifier in this section.
pipeline_sign = pipelines.values[2]

# NOTE: this overwrites the X_test / y_test names set in the gender section.
X_test = pipeline_sign.transformation.X_test
X_test_numerical_transformed = pipeline_sign.transformation.X_test_numerical_transformed
X_test_text_transformed = pipeline_sign.transformation.X_test_text_transformed
y_test = pipeline_sign.transformation.y_test

In [82]:
# Per-model predictions on the matching transformed test features (these replace the
# gender-section values held under the same names).
y_pred_num = pipeline_sign.modelling.numerical_model.predict(X_test_numerical_transformed)
y_pred_text = pipeline_sign.modelling.text_model.predict(X_test_text_transformed)
# Combined prediction from the untransformed X_test; the meaning of the None argument
# is not visible in this notebook - TODO confirm against weighted_prediction.
y_pred_stacking = pipeline_sign.modelling.weighted_prediction(X_test, None)

In [111]:
def create_metric(y_pred_type, y_pred, y_test):
    """Print a multi-class report: class shares, accuracy and weighted F1.

    The relative share of each class in ``y_test`` is printed first, so the
    scores can be compared with always predicting the most frequent class.

    NOTE: this redefines ``create_metric``; once this cell has run, the
    variants defined earlier in the notebook are no longer reachable under
    this name.

    Args:
        y_pred_type: Label shown in the report header.
        y_pred: Predicted class labels.
        y_test: True class labels (pandas Series, NumPy array or list), same
            length as y_pred.
    """
    print(f"Report for {y_pred_type} Dataset \n\n")

    # Wrap in a Series so plain lists / NumPy arrays work as well; a Series
    # passes through with its values and name unchanged, so the printed output
    # for Series input is the same as before.
    labels = pd.Series(y_test)

    print("Relative Share of of each classes:")
    print(labels.value_counts()/labels.size)
    print("The values above can be used to compare the model performance against with the most expected class")

    print(f"\n\nAccuracy: {accuracy_score(y_test, y_pred)}")
    # 'weighted' averages the per-class F1 scores weighted by class support.
    print(f"F1-Score: {f1_score(y_test, y_pred, average='weighted')}")

In [112]:
# Multi-class report for the numerical model's sign predictions.
create_metric("Numerical", y_pred_num, y_test)

Report for Numerical Dataset 


Relative Share of of each classes:
Cancer         0.096730
Aries          0.093742
Libra          0.092041
Taurus         0.090315
Virgo          0.088613
Scorpio        0.083468
Pisces         0.080712
Leo            0.078637
Gemini         0.075749
Aquarius       0.073749
Sagittarius    0.073691
Capricorn      0.072554
Name: sign, dtype: float64
The values above can be used to compare the model performance against with the most expected class


Accuracy: 0.39932774504108226
F1-Score: 0.40019804921862945


In [113]:
# Multi-class report for the text model's sign predictions.
create_metric("Text", y_pred_text, y_test)

Report for Text Dataset 


Relative Share of of each classes:
Cancer         0.096730
Aries          0.093742
Libra          0.092041
Taurus         0.090315
Virgo          0.088613
Scorpio        0.083468
Pisces         0.080712
Leo            0.078637
Gemini         0.075749
Aquarius       0.073749
Sagittarius    0.073691
Capricorn      0.072554
Name: sign, dtype: float64
The values above can be used to compare the model performance against with the most expected class


Accuracy: 0.20647356627105984
F1-Score: 0.19900904971747324


In [114]:
# Multi-class report for the combined (weighted_prediction) sign predictions.
create_metric("Stacking", y_pred_stacking, y_test)

Report for Stacking Dataset 


Relative Share of of each classes:
Cancer         0.096730
Aries          0.093742
Libra          0.092041
Taurus         0.090315
Virgo          0.088613
Scorpio        0.083468
Pisces         0.080712
Leo            0.078637
Gemini         0.075749
Aquarius       0.073749
Sagittarius    0.073691
Capricorn      0.072554
Name: sign, dtype: float64
The values above can be used to compare the model performance against with the most expected class


Accuracy: 0.4213544692505602
F1-Score: 0.4223261341341247


## Evaluation of the Topic-Classifier 

In [116]:
# Entry 3 of the stored pipelines is evaluated as the topic classifier in this section.
pipeline_topic = pipelines.values[3]

# NOTE: this overwrites the X_test / y_test names set in the sign section.
X_test = pipeline_topic.transformation.X_test
X_test_numerical_transformed = pipeline_topic.transformation.X_test_numerical_transformed
X_test_text_transformed = pipeline_topic.transformation.X_test_text_transformed
y_test = pipeline_topic.transformation.y_test

In [117]:
# Per-model predictions on the matching transformed test features (these replace the
# sign-section values held under the same names).
y_pred_num = pipeline_topic.modelling.numerical_model.predict(X_test_numerical_transformed)
y_pred_text = pipeline_topic.modelling.text_model.predict(X_test_text_transformed)
# Combined prediction from the untransformed X_test; the meaning of the None argument
# is not visible in this notebook - TODO confirm against weighted_prediction.
y_pred_stacking = pipeline_topic.modelling.weighted_prediction(X_test, None)

In [118]:
# Multi-class report for the numerical model's topic predictions.
# NOTE: no topic-specific create_metric exists; this relies on the definition from the
# Sign-Classifier section still being the active one in the kernel.
create_metric("Numerical", y_pred_num, y_test)

Report for Numerical Dataset 


Relative Share of of each classes:
indUnk                     0.369881
Student                    0.224865
Technology                 0.061059
Arts                       0.047788
Education                  0.044020
Communications-Media       0.030218
Internet                   0.022450
Non-Profit                 0.021413
Engineering                0.017064
Law                        0.013860
Publishing                 0.011594
Science                    0.011312
Government                 0.010250
Religion                   0.008382
Consulting                 0.008333
Fashion                    0.007013
Marketing                  0.006897
Advertising                0.006855
BusinessServices           0.006449
Banking                    0.005735
Accounting                 0.005735
Chemicals                  0.005212
Telecommunications         0.004905
Military                   0.004805
Museums-Libraries          0.004681
Sports-Recreation          0.0045

In [119]:
# Multi-class report for the text model's topic predictions.
# NOTE: relies on the create_metric definition from the Sign-Classifier section.
create_metric("Text", y_pred_text, y_test)

Report for Text Dataset 


Relative Share of of each classes:
indUnk                     0.369881
Student                    0.224865
Technology                 0.061059
Arts                       0.047788
Education                  0.044020
Communications-Media       0.030218
Internet                   0.022450
Non-Profit                 0.021413
Engineering                0.017064
Law                        0.013860
Publishing                 0.011594
Science                    0.011312
Government                 0.010250
Religion                   0.008382
Consulting                 0.008333
Fashion                    0.007013
Marketing                  0.006897
Advertising                0.006855
BusinessServices           0.006449
Banking                    0.005735
Accounting                 0.005735
Chemicals                  0.005212
Telecommunications         0.004905
Military                   0.004805
Museums-Libraries          0.004681
Sports-Recreation          0.004590
Re

In [120]:
# Multi-class report for the combined (weighted_prediction) topic predictions.
# NOTE: relies on the create_metric definition from the Sign-Classifier section.
create_metric("Stacking", y_pred_stacking, y_test)

Report for Stacking Dataset 


Relative Share of of each classes:
indUnk                     0.369881
Student                    0.224865
Technology                 0.061059
Arts                       0.047788
Education                  0.044020
Communications-Media       0.030218
Internet                   0.022450
Non-Profit                 0.021413
Engineering                0.017064
Law                        0.013860
Publishing                 0.011594
Science                    0.011312
Government                 0.010250
Religion                   0.008382
Consulting                 0.008333
Fashion                    0.007013
Marketing                  0.006897
Advertising                0.006855
BusinessServices           0.006449
Banking                    0.005735
Accounting                 0.005735
Chemicals                  0.005212
Telecommunications         0.004905
Military                   0.004805
Museums-Libraries          0.004681
Sports-Recreation          0.00459