In [None]:
# general-purpose modules
import sys
from pathlib import Path
sys.path.append("../src")
import numpy as np
import pandas as pd
from tqdm import tqdm

# preprocessing and transformation modules
import fasttext
import Preprocessing
from Features import buildFeatures
from Modelling import StackingModelling
from ModelSelection import ModelSelection, process_case
from Transformation import StackedTransformation, transformation

# Scikit-Learn
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.feature_extraction.text import TfidfVectorizer

# model algorithm
from sklearn.svm import LinearSVC, LinearSVR
from xgboost import XGBRegressor, XGBClassifier
from sklearn.linear_model import SGDClassifier, SGDRegressor

# evaluation modules
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Individual Modelling

This part depends on the case and needs to be done individually for each case.

### Model Selection

In [None]:
# Model-selection configuration: one dict per prediction target.
# Each dict is later handed to process_case; how the individual keys are
# interpreted is defined there (not visible in this notebook).
# The helpers below only remove the copy-paste between the four cases; every
# call returns freshly built lists/dicts, so no case shares mutable state.


def _xgb_grid():
    """Return a new XGBoost hyper-parameter grid (same values for classifier and regressor)."""
    return {'learning_rate': [0.1, 1, 1.5],
            'max_depth': [3, 6, 9],
            'n_estimators': [200, 600, 1200]}


def _classifier_grids(sgd_alphas):
    """Return the (estimator class, parameter grid) pairs for a classification case.

    Parameters
    ----------
    sgd_alphas : iterable of float
        Candidate values for the ``alpha`` parameter of ``SGDClassifier``;
        this is the only grid value that differs between classification cases.

    Returns
    -------
    list of (type, dict)
        One tuple per algorithm: XGBClassifier, SGDClassifier, LinearSVC.
    """
    return [
        (XGBClassifier, _xgb_grid()),

        # NOTE(review): recent scikit-learn releases renamed loss "log" to
        # "log_loss" -- confirm against the installed version before changing.
        (SGDClassifier,
         {"loss": ["hinge", "log", "modified_huber", "squared_hinge"],
          "penalty": ["l2", "l1", "elasticnet"],
          "alpha": list(sgd_alphas)}),

        # NOTE(review): LinearSVC rejects penalty="l1" together with
        # loss="hinge" (and "l1" needs dual=False); those candidates will
        # presumably fail to fit during the search -- verify in process_case.
        (LinearSVC,
         {"penalty": ["l1", "l2"],
          "loss": ["hinge", "squared_hinge"],
          "C": [0.8, 1, 1.2]}),
    ]


def _regressor_grids():
    """Return the (estimator class, parameter grid) pairs for the regression case.

    Returns
    -------
    list of (type, dict)
        One tuple per algorithm: XGBRegressor, SGDRegressor, LinearSVR.
    """
    return [
        (XGBRegressor, _xgb_grid()),

        # NOTE(review): recent scikit-learn releases renamed loss
        # "squared_loss" to "squared_error" -- confirm against the installed
        # version before changing.
        (SGDRegressor,
         {"loss": ["squared_loss", "huber"],
          "penalty": ["l2", "l1", "elasticnet"],
          "alpha": [0.01, 0.1, 1]}),

        (LinearSVR,
         {"loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
          "C": [0.8, 1, 1.2]}),
    ]


def _make_case(target_type, target_variable, grid_search_metric,
               categorial_variables, ml_algorithms_params):
    """Assemble one case dict.

    The text column, the min_df exponents, the n-gram ranges and the tf-idf
    switch are identical for every case and are filled in here; only the
    arguments vary between cases. Key order matches the original literals.

    Parameters
    ----------
    target_type : str
        "classification" or "regression".
    target_variable : str
        Name of the column to predict.
    grid_search_metric : str
        Scoring name stored under "grid_search_metric".
    categorial_variables : list of str
        Column names stored under "categorial_variables".
    ml_algorithms_params : list of (type, dict)
        Estimator classes paired with their parameter grids.

    Returns
    -------
    dict
        The case configuration.
    """
    return {"target_type": target_type,
            "target_variable": target_variable,
            "grid_search_metric": grid_search_metric,
            "text_features": "text_preprocessed",
            "categorial_variables": categorial_variables,
            "min_df_exponents": [(1/4), (1/3), (1/2)],
            "n_gram_range": [(1, 1), (1, 2), (2, 2)],
            "use_tfidf": [True, False],
            "ml_algorithms_params": ml_algorithms_params}


# Gender uses a smaller SGD alpha range than the other classification cases.
case_gender = _make_case("classification", "gender", "accuracy",
                         ["topic", "sign"],
                         _classifier_grids([0.00001, 0.001]))

case_topic = _make_case("classification", "topic", "f1_weighted",
                        ["gender", "sign"],
                        _classifier_grids([0.01, 0.1, 1]))

# Age is the only regression target.
case_age = _make_case("regression", "age", "neg_mean_squared_error",
                      ["topic", "gender", "sign"],
                      _regressor_grids())

case_sign = _make_case("classification", "sign", "f1_weighted",
                       ["gender", "topic"],
                       _classifier_grids([0.01, 0.1, 1]))


In [None]:
# Load the preprocessed data frame from disk.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df_filtered = pd.read_pickle("./df_full_preprocessed.pkl")

# Create the output folder before any work starts, so a missing directory
# cannot make to_pickle fail after a case has already been processed.
results_dir = Path("./Model Selection")
results_dir.mkdir(parents=True, exist_ok=True)

# Process every case and store each result frame in its own timestamped file.
for case in [case_gender, case_sign, case_age, case_topic]:
    df_results = process_case(transformation, case, df_filtered)
    # str(pd.Timestamp.now()) contains a space and colons, which are not valid
    # in file names on every OS; use a compact, filesystem-safe stamp instead.
    run_stamp = pd.Timestamp.now().strftime("%Y%m%d-%H%M%S")
    df_results.to_pickle(results_dir / f'pd_df_cv_{case["target_variable"]}_{run_stamp}.pkl')