In [1]:
# universally modules
import sys
sys.path.append("../src")
import numpy as np
import pandas as pd

# preprocessing and transformation modules
import fasttext
import Preprocessing
from Features import buildFeatures
from Modelling import StackingModelling
from ModelSelection import ModelSelection, process_case
from Transformation import StackedTransformation, transformation

# model algorithm
from sklearn.svm import LinearSVC, LinearSVR
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import SGDClassifier, SGDRegressor




In [2]:
# read datasets:
df_full_preprocessed = pd.read_pickle("./df_full_preprocessed.pkl")
df_full_preprocessed = df_full_preprocessed.sample(1_000)
df_eval_results_age =  pd.read_pickle("./Model Selection/pd_df_cv_age_2020-12-27 09:47:05.199850.pkl")
df_eval_results_sign =  pd.read_pickle("./Model Selection/pd_df_cv_sign_2020-12-27 07:12:32.465884.pkl")
df_eval_results_topic =  pd.read_pickle("./Model Selection/pd_df_cv_topic_2021-01-04 19:33:32.751170.pkl")
df_eval_results_gender =  pd.read_pickle("./Model Selection/pd_df_cv_gender_2020-12-25 00:31:53.679084.pkl")

## Pipeline erstellen

In [3]:
# Bestimmen der besten Lernalgorithmen und Optimierungsparameter für die jeweilige Zielvariable
# und bauen der daraus resultierenden Pipeline

class BuildPipeline:
    
    def __init__(self, df_eval_results_variable, df_preprocessed, algo_type):
        self.df_eval_results_variable = df_eval_results_variable
        self.target_variable = df_eval_results_age["target_variable"].values[0]
        self.categorial_variables = df_eval_results_age["categorial_variables"].values[0]
        self.X = df_preprocessed.drop(self.target_variable, axis=1)
        self.y = df_preprocessed[self.target_variable]

        self.algo_type = algo_type
        
    def best_model_and_params(self):
        # get best textual_model
        self.best_text_model = self.df_eval_results_variable[self.df_eval_results_variable["CV_text_best_score"] ==\
                                              self.df_eval_results_variable["CV_text_best_score"].max()]

        self.best_text_algo = self.best_text_model["ml_algorithms_params"].values[0][0]
        self.best_text_params = self.best_text_model["CV_text_best_params"].values[0]

        # get best numerical model
        self.best_numerical_model = self.df_eval_results_variable[self.df_eval_results_variable["CV_numerical_best_score"]\
                                                  == self.df_eval_results_variable["CV_numerical_best_score"].max()]

        self.best_numerical_algo = self.best_numerical_model["ml_algorithms_params"].values[0][0]
        self.best_numerical_params = self.best_numerical_model["CV_text_best_params"].values[0]
    
    def build_transformation(self):
        min_df_exponent = self.best_text_model["min_df_exponents"].values[0]
        ngram_range = self.best_text_model["n_gram_range"].values[0]
        text_features = self.best_text_model["text_features"].values[0]
        use_idf = self.best_text_model["use_tfidf"].values[0]
        self.transformation = transformation(self.X, self.y, self.target_variable, self.categorial_variables,\
                                             min_df_exponent, ngram_range, text_features, use_idf)
    
    def build_model(self):
        self.modelling = StackingModelling(self.best_numerical_algo, self.best_numerical_params,\
                                     self.best_text_algo, self.best_text_params, self.transformation)
        
        self.modelling.fit()
        print(self.modelling.create_report(self.transformation.X_test, self.transformation.y_test, self.algo_type))
    
    
    def fit(self):
        self.best_model_and_params()
        self.build_transformation()
        self.build_model()
        
    def predict(self, X):
        return self.modelling.weighted_prediction(X, algo_type=self.algo_type)
# Erstellen der Transformation für text und numerischen Datensatz

In [13]:
age_model = BuildPipeline(df_eval_results_age, df_full_preprocessed, "regression")
age_model.fit()

gender_model = BuildPipeline(df_eval_results_gender, df_full_preprocessed, "classification")
gender_model.fit()

sign_model = BuildPipeline(df_eval_results_sign, df_full_preprocessed, "classification")
sign_model.fit()

topic_model = BuildPipeline(df_eval_results_topic, df_full_preprocessed, "classification")
topic_model.fit()

Numerical model finished!
Text model finished!
Weights have been optimized:
                Textual model weight: 0.5637359655357802
                Numerical model weight: 0.43626403446421974
Absolute loss textual model              1202.463787
Absolute loss numerical model            1553.811524
Absolute loss equally weighted model     1218.612798
Absolute loss optimized weights model    1200.910874
dtype: float64
Numerical model finished!
Text model finished!
Weights have been optimized:
                Textual model weight: 0.4858801731307771
                Numerical model weight: 0.5141198268692229
Absolute loss textual model              182.683116
Absolute loss numerical model            172.648670
Absolute loss equally weighted model     177.665893
Absolute loss optimized weights model    177.524204
dtype: float64
Numerical model finished!
Text model finished!
Weights have been optimized:
                Textual model weight: 0.48500633194736814
                Numerical model

In [16]:
# save models to series
model_pipelines = pd.Series([age_model, gender_model, sign_model, topic_model])

In [18]:
model_pipelines.to_pickle("./Pipelines/ModelPipelines.pkl")