In [1]:
# universally modules
import re
import sys
import regex
import nltk.data
import numpy as np
import pandas as pd
from tqdm import tqdm
sys.path.append("../src")

# preprocessing and transformation modules
import fasttext
import Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# model algorithm
from xgboost import XGBRegressor, XGBClassifier

# evaluation modules
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import classification_report



### Parameters

In [2]:
# to speed up the process choose a sample size to randomly draw a sample of the whole daataset
sample_size = 1000
# remove all text that contain less than n chars
min_chars_per_text = 50

# which features will be used for the TF-IDF transformation
text_features = "text_preprocessed"

#### define the target variable and categorial variables used in later transformations ###
#### Case 1: gender 
target_variable = "gender"
categorial_variables =  ["topic", "sign"]

# Case 2: topic
#target_variable = "topic"
#categorial_variables =  ["gender", "sign"]

# Case 3: age
#target_variable = "age"
#categorial_variables =  ["topic", "gender", "sign"]

# Case 4: sign
#target_variable = "sign"
#categorial_variables =  ["gender", "topic"]
############################################################################################

# use only words that occur at least sqrt_3(X) times 
min_df_exponent = (1/3)

### Initialization 

In [3]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
df = pd.read_csv("../resource/data/blogtext.csv")

In [4]:
# draw random sample for faster processing:
df = df.sample(sample_size)
df

Unnamed: 0,id,gender,age,topic,sign,date,text
141352,2440626,male,16,Student,Aries,"17,July,2004",So Britt S. and I are starting a tr...
95697,751202,female,33,Internet,Gemini,"09,August,2003",AS THE SONG GOES... urlLi...
298286,1596986,male,23,Biotech,Pisces,"17,May,2004",Aero replaces Luna in Longhorn. Doe...
447498,1409523,male,17,Student,Scorpio,"02,August,2004",wow..... tonight was interesting.... ye...
618858,4263464,male,27,Technology,Pisces,"21,August,2004",urlLink My daughter crawled f...
...,...,...,...,...,...,...,...
597067,2994939,male,37,Internet,Libra,"27,May,2004",HIGHLIGHTS INDIA – ECONOMY India g...
270572,3451304,female,25,Law,Libra,"11,June,2004",My Babies: Belle is on the left and Leo is ...
589506,3486663,female,13,Student,Taurus,"10,July,2004",I laugh Wierd If you notic...
423994,1955799,male,33,Education,Libra,"30,June,2004",Tonight and tomorrow I will work ...


### Filtering

In [5]:
# filter for a mininmal number of letters in a tweet:
df = df[df["text"].str.count(r"[a-zA-Z]") >= min_chars_per_text]
df = df.reset_index(drop=True)

### Feature Engineering

In [6]:
#def findDates(text):
#    try:
#        return len([date for date in\
#                    datefinder.find_dates(text)])
#    except:
#        return 0

In [7]:
def buildFeatures(text):
    text_split = text.split()
    len_text = len(text)
    sentence_split = tokenizer.tokenize(text)
    
    # find the number of urls in the text
    keywords = ["urlLink","http","www"]
    nb_urls = sum((any(keyword in pattern for keyword in keywords))\
               for pattern in text.split())
    # find the number of mails in the text
    nb_mails = len(re.findall(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+|\bmail\b)"\
                      ,text))
    
    # find the number of dates in the text
#    nb_dates = findDates(text)
     
    # find characteristics about the usage of letters, numbers and symbols
    uppercase_ratio = len(re.findall(r'[A-Z]', text))/len_text
    lowercase_ratio = len(re.findall(r'[a-z]', text))/len_text
    number_ratio = len(re.findall(r'[0-9]', text))/len_text
    symbol_ratio = len(re.findall(r'[$-/:-?{-~!"^_`\[\]]', text))/len_text

    # find characteristics about the letters per word
    sentence_len_word = [len(word) for word in text_split]
    avg_letters_per_word = np.mean([len(word) for word in text_split])
    var_letters_per_word = np.var([len(word) for word in text_split])
    unique_words_ratio = len(set(text_split))/len(text_split)

    # find characteristics about the letters per sentence
    sentence_len_list = [len(sentence) for sentence in sentence_split]
    avg_letters_per_sentence = np.mean(sentence_len_list)
    var_letters_per_sentence = np.var(sentence_len_list)
    
    # find characteristics about the words per sentence
    words_per_sentence_len_list = [len(sentence.split()) for sentence in sentence_split]
    avg_words_per_sentence = np.mean(words_per_sentence_len_list)
    var_words_per_sentence = np.var(words_per_sentence_len_list)
    
    # find the trumps
    uppercase_per_sentence_ratio = [len(re.findall(r'[A-Z]', sentence))/len(sentence)\
                                    for sentence in sentence_split]
    max_sentence_uppercase_ratio = max(uppercase_per_sentence_ratio)
    max_sentence_uppercase_len = len(sentence_split[uppercase_per_sentence_ratio.index(max_sentence_uppercase_ratio)])
    
    return len_text, nb_urls, nb_mails,\
           uppercase_ratio, lowercase_ratio, number_ratio, symbol_ratio,\
           avg_letters_per_word, var_letters_per_word, unique_words_ratio,\
           avg_letters_per_sentence, var_letters_per_sentence,\
           avg_words_per_sentence, var_words_per_sentence,\
           max_sentence_uppercase_ratio, max_sentence_uppercase_len        
           

In [8]:
# append the data
features = [buildFeatures(text) for text  in tqdm(df["text"])]

# append the data
columns = ["Text length", "Number URLs", "Number mails",\
          "Uppercase ratio", "Lowercase ratio", "Number ratio", "Symbol ratio",\
          "Average letters per word", "Variance of letters per word", "Unique words ratio",\
          "Average letters per sentence", "Variance of letters per sentence",\
          "Average words per sentence", "Variance of words per sentence",\
          "Maximal uppercase ratio per sentence", "Length of the maximal uppercase ratio sentence"]

# merge the features with the original dataset
df_preprocessed = df.merge(pd.DataFrame(features, columns=columns), left_index=True, right_index=True)

100%|██████████| 898/898 [00:01<00:00, 691.31it/s]


### Text Preprocessing

In [9]:
# use the preprocessing  module
preprocessor = Preprocessing.Preprocessing()
df_preprocessed["text_preprocessed"] = preprocessor.ProcessMany(df_preprocessed["text"])

# predict the main language
model = fasttext.load_model('../src/data/lid.176.ftz')
df_preprocessed["main_language"] = [model.predict(text)[0][0].split("__")[-1] for text in tqdm(df_preprocessed["text_preprocessed"])]

100%|██████████| 898/898 [00:27<00:00, 32.07it/s]
100%|██████████| 898/898 [00:00<00:00, 7072.18it/s]


In [10]:
def remove_non_ascii_words(text):
    return "".join([f"{word} " for word in text.split() if len(re.findall(r'[^\x00-\x7f]', word)) == 0])


In [11]:
# drop unnecassary features
df_filtered = df_preprocessed[(df_preprocessed["main_language"] == "en")]\
                .drop(["id","text","date","main_language"], axis= 1)

# remove all words with non-ascii chars
df_filtered["text_preprocessed"] = df_filtered["text_preprocessed"].apply(remove_non_ascii_words)

### Transformation

In [12]:
class StackedTransformation:
    
    def  __init__(self, X=None, y=None, numerical_transformer=None, text_transformer=None, text_features=None):
        self.X = X
        self.y = y
        self.numerical_transformer = numerical_transformer
        self.text_transformer = text_transformer
        self.text_features = text_features
        
    def data_split(self, X, y, test_size= 0.2):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        return self.X_train, self.X_test, self.y_train, self.y_test
        
    def build_transformer(self, X=None, y=None,\
                          numerical_transformer=None, text_transformer=None, text_features=None):
        # check if variables are present
        if X == None:
            X = self.X
        if y == None:
            y = self.y
        if numerical_transformer == None:
            numerical_transformer = self.numerical_transformer
        if text_transformer == None:
            text_transformer = self.text_transformer
        if text_features == None:
            text_features = self.text_features

        # split the data
        
        self.data_split(X, y)
        
        # Create datasets for each classifier
        self.X_train_text = self.X_train[text_features]
        self.X_test_text =  self.X_test[text_features]
        
        self.X_train_numerical = self.X_train.drop(text_features, axis=1)
        self.X_test_numerical = self.X_test.drop(text_features, axis=1)

        # create transformers 
        self.numerical_transformer = numerical_transformer
        self.text_transformer = text_transformer
        
        #create transformed training batches
        self.X_train_numerical_transformed = self.numerical_transformer.fit_transform(self.X_train_numerical)
        self.X_test_numerical_transformed = self.numerical_transformer.transform(self.X_test_numerical)

        self.X_train_text_transformed = self.text_transformer.fit_transform(self.X_train_text)
        self.X_test_text_transformed = self.text_transformer.transform(self.X_test_text)

    def transform_one(self, x):
        entry = pd.DataFrame(columns=x.keys())
        entry = entry.append(x, ignore_index=True)
        
        # create datasets for each transformer
        entry_text = entry[text_features]
        entry_numerical = entry.drop(text_features, axis=1)
        
        entry_numerical_transformed = self.numerical_transformer.transform(entry_numerical)
        entry_text_transformed = self.text_transformer.transform(entry_text)
        
        return {"transformed_text": entry_text_transformed,\
               "transformed_numerical": entry_numerical_transformed}

    def transform_one(self, x):
        entry = pd.DataFrame(columns=x.keys())
        entry = entry.append(x, ignore_index=True)
        
        # create datasets for each transformer
        entry_text = entry[self.text_features]
        entry_numerical = entry.drop(self.text_features, axis=1)
        
        entry_numerical_transformed = self.numerical_transformer.transform(entry_numerical)
        entry_text_transformed = self.text_transformer.transform(entry_text)
        
        return {"transformed_text": entry_text_transformed,\
               "transformed_numerical": entry_numerical_transformed}


    def transform_many(self, X):
        # create datasets for each transformer
        X_text = X[self.text_features]
        X_numerical = X.drop(self.text_features, axis=1)
        
        X_numerical_transformed = self.numerical_transformer.transform(X_numerical)
        X_text_transformed = self.text_transformer.transform(X_text)
        
        return {"transformed_text": X_text_transformed,\
               "transformed_numerical": X_numerical_transformed}

### Data Split

In [13]:
X,y = df_filtered.drop(target_variable, axis=1),df_filtered[target_variable]

In [14]:
# use the  text transformer class to create two transformers for the textual and the numerical model
text_transformer = TfidfVectorizer(ngram_range=(1,1), min_df=int(len(X)**(min_df_exponent)))
numerical_transformer = make_column_transformer((OneHotEncoder(handle_unknown="ignore"), categorial_variables)\
                                                       , remainder=StandardScaler())

stacking = StackedTransformation(X, y, numerical_transformer, text_transformer, text_features)
stacking.build_transformer()

### Clustering

In [15]:
text_data_features = stacking.text_transformer.get_feature_names()
text_data = stacking.X_train_text_transformed.toarray()

df_text_cluster = pd.DataFrame(text_data, columns=text_data_features)
#df_text_cluster

# Die Features beschreiben die Worte im Text
# Die Werte sind die TF*IDF-transformierten Textdaten

In [16]:
try:
    numerical_data = stacking.X_train_numerical_transformed.toarray()
except:
    numerical_data = stacking.X_train_numerical_transformed

numerical_data_features = np.append(stacking.numerical_transformer.transformers_[0][1].get_feature_names(),\
                     stacking.X_train_numerical.columns.drop(categorial_variables))

df_numerical_cluster = pd.DataFrame(numerical_data, columns=numerical_data_features)
#df_numerical_cluster


# Die Features mit den x0 - xi Werten beschreiben die Ausprägungen die kategorialen Variablen
# das jeweilige i beschreibt das i-te Element der im Punkt "Target Variable" definierten liste categorial_variables
# Die verbleibenden Features (ohne xi) sind Standardskaliert, (x - \mu)/\sigma

### Training 

In [17]:
class BaggingModelling:
    
    
    def __init__(self,  numerical_model, numerical_model_params,\
                text_model, text_model_params, stacked_transformation_instance,\
                weights=(0.5, 0.5)):
        
        # Initialize numerical model
        self.numerical_model = numerical_model(**numerical_model_params)
        
        # Initialize text model
        self.text_model = text_model(**text_model_params)
        
        # internalize transformation class
        self.stacked_transformation = stacked_transformation_instance
        
        # model weights
        self.weights = weights
        
        
    def fit(self):
        # train numerical model
        self.numerical_model.fit(self.stacked_transformation.X_train_numerical_transformed,\
                             self.stacked_transformation.y_train)
                             
        print("Numerical model finished!")
        
        # train textual model
        self.text_model.fit(self.stacked_transformation.X_train_text_transformed,\
                             self.stacked_transformation.y_train)
                             
        print("Text model finished!")

        
    def predict_text(self, X):
        return self.text_model.predict(X)
    
    
    def predict_numerical(self, X):
        return self.numerical_model.predict(X)

    
    def optimize_weights(self, X, y, algo_type="classification"):
        X_transformed = self.stacked_transformation.transform_many(X)

        if algo_type == "classification":
            class_label_dict = dict([(k,v) for v,k in enumerate(self.numerical_model.classes_)])
            y_class_index = [class_label_dict[value] for value in y]
            
            # predict each row of X for each model
            y_pred_num = self.numerical_model.predict_proba(X_transformed["transformed_numerical"])
            y_pred_text = self.text_model.predict_proba(X_transformed["transformed_text"])
            
            # absolute loss for each model
            self.loss_numerical = sum([sum(np.delete(y_p, y_t)) for y_p, y_t in zip(y_pred_num,y_class_index)])
            self.loss_text = sum([sum(np.delete(y_p, y_t)) for y_p, y_t in zip(y_pred_text,y_class_index)])
            
            # optimize the weights based on their contribution to the loss
            # bewusst text und numerical vertauscht, damit die Gegenwahrscheinlichkeit verwendet wird.
            self.weights = np.array((self.loss_numerical, self.loss_text))/(self.loss_numerical +self.loss_text)
            

        if algo_type == "regression":
            # absolute loss for each model
            self.loss_numerical = np.absolute(self.numerical_model.predict(X_transformed["transformed_numerical"]) - y).sum()
            self.loss_text = np.absolute(self.text_model.predict(X_transformed["transformed_text"]) - y).sum()
            
            # optimize the weights based on their contribution to the loss
            # bewusst text und numerical vertauscht, damit die Gegenwahrscheinlichkeit verwendet wird.
            self.weights = np.array((self.loss_numerical, self.loss_text))/(self.loss_numerical + self.loss_text)
            
            
        print(f"""Weights have been optimized:
                Textual model weight: {self.weights[0]}
                Numerical model weight: {self.weights[1]}""")

    def weighted_prediction(self, X, weights=None, algo_type="classification"):
        if weights == None:
            weights = self.weights
        
        # check if one or more transactions become processed
        try:
            if type(X) == dict:
                X_transformed = self.stacked_transformation.transform_one(X)

            else:
                X_transformed = self.stacked_transformation.transform_many(X)
        except TypeError:
            print("Check the data type of the input data")

        if algo_type == "classification":
            predictions = (self.text_model.predict_proba(X_transformed["transformed_text"]),\
                            self.numerical_model.predict_proba(X_transformed["transformed_numerical"]))
            
            index = lambda x: np.argmax(x)
            
            self.weighted_predictions = (predictions[0]*weights[0] + predictions[1]*weights[1])
            classes_name = np.array([self.text_model.classes_[index(prediction)] for prediction in self.weighted_predictions])
            
            # get name of the classes
            return classes_name
        
        if algo_type == "regression":
            predictions = (self.text_model.predict(X_transformed["transformed_text"]),\
                            self.numerical_model.predict(X_transformed["transformed_numerical"]))
                        
            self.weighted_predictions = (predictions[0]*weights[0] + predictions[1]*weights[1])
            
            return self.weighted_predictions
        
        else:
            print("target variable type not supported")
            
    def create_report(self, X, y, algo_type="classification"):
        # fit weight optimization
        self.optimize_weights(X,y, algo_type)
        
        #get loss for bagged models:
        if algo_type == "classification":
            class_label_dict = dict([(k,v) for v,k in enumerate(self.numerical_model.classes_)])
            y_class_index = [class_label_dict[value] for value in y]
            
            # get loss for unoptimized weighting
            self.weighted_prediction(X, (0.5,0.5), algo_type="classification")
            y_pred = self.weighted_predictions
            loss_weights = sum([sum(np.delete(y_p, y_t)) for y_p, y_t in zip(y_pred,y_class_index)])
            
            # get loss for optimized weights
            self.weighted_prediction(X, algo_type="classification")
            y_pred = self.weighted_predictions
            loss_weights_optimized = sum([sum(np.delete(y_p, y_t)) for y_p, y_t in zip(y_pred,y_class_index)])
                        
        self.report = pd.Series()
        self.report["Absolute loss textual model"] = self.loss_text
        self.report["Absolute loss numerical model"] = self.loss_numerical
        self.report["Absolute loss equally weighted model"] = loss_weights
        self.report["Absolute loss optimized weights model"] = loss_weights_optimized

        return self.report

In [18]:
algo_type = "classification"
stacked_modelling = BaggingModelling(XGBClassifier, {}, XGBClassifier, {}, stacking, (0.5, 0.5))
X_test = stacked_modelling.stacked_transformation.X_test
y_test = stacked_modelling.stacked_transformation.y_test


stacked_modelling.fit()
stacked_modelling.optimize_weights(X_test, y_test, algo_type=algo_type)
stacked_modelling.create_report(X_test,y_test, algo_type)

Numerical model finished!
Text model finished!
Weights have been optimized:
                Textual model weight: 0.49170281196854165
                Numerical model weight: 0.5082971880314584
Weights have been optimized:
                Textual model weight: 0.49170281196854165
                Numerical model weight: 0.5082971880314584


  self.report = pd.Series()


Absolute loss textual model              84.142973
Absolute loss numerical model            81.395958
Absolute loss equally weighted model     82.769465
Absolute loss optimized weights model    82.746675
dtype: float64

### Parameter Tuning 

### Model Evaluation 