# Libraries used in the Experimentation

1. PyTorch
2. Pandas
3. Scikit-Learn
4. NLTK
5. NumPy
6. Seaborn
7. BERTopic
8. SHAP
9. spaCy
10. Unidecode

**Text Cleaning**

In [None]:
sw = stopwords.words('english')
# Set copy of the stopword list so membership tests inside clean_text are O(1).
_STOPWORD_SET = frozenset(sw)


def clean_text(text):
    """Strip markup, links, punctuation, digits, accents and stopwords from text.

    Steps, in order: drop ``[...]`` spans, URLs and ``<...>`` tags, delete
    ASCII punctuation, turn line breaks into spaces, drop every word that
    contains a digit, transliterate to ASCII with ``unidecode``, tokenize with
    NLTK and filter out English stopwords.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces. Token casing is kept.
    """
    # remove square-bracketed spans
    text = re.sub(r'\[.*?\]', '', text)
    # remove links
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    # remove tags
    text = re.sub(r'<.*?>+', '', text)
    # remove punctuation
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # replace line breaks with a space so the words on either side are not fused
    text = re.sub(r'\n', ' ', text)
    # remove words that contain a digit
    text = re.sub(r'\w*\d\w*', '', text)

    # remove accents
    text = unidecode.unidecode(text)

    # transform text into tokens
    text_token = nltk.word_tokenize(text)

    # remove stopwords; the list is lower-case, so compare case-insensitively
    words = [w for w in text_token if w.lower() not in _STOPWORD_SET]

    return ' '.join(words)

**Lemmatization**

In [None]:
### Text Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_sentence(text):
    """Lemmatize every token of ``text`` with the WordNet lemmatizer.

    Args:
        text: Input string; it is split with ``nltk.word_tokenize``.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    # transform text into tokens
    text_token = nltk.word_tokenize(text)
    # The previous version appended the raw token, so nothing was lemmatized.
    # lemmatize() is called without a POS tag, so NLTK's default tag applies.
    return " ".join(lemmatizer.lemmatize(word) for word in text_token)

**BERTopic**

In [None]:
import nltk

# Fetch the NLTK resources used for stopword removal and WordNet lemmatization
for _resource in ('stopwords', 'omw-1.4', 'wordnet'):
    nltk.download(_resource)
wn = nltk.WordNetLemmatizer()

# Topic model
from bertopic import BERTopic
# Dimension reduction
from umap import UMAP

In [None]:
# UMAP instance handed to BERTopic as its dimensionality-reduction step
umap_model = UMAP(
    n_neighbors=6,
    n_components=3,
    min_dist=0.2,
    metric='cosine',
    random_state=42,
)
# Topic model configured with the UMAP instance above
topic_model = BERTopic(
    umap_model=umap_model,
    language="multilingual",
    calculate_probabilities=True,
)
# Fit the topic model on the documents in `data`
topics, probabilities = topic_model.fit_transform(data)
# Bar charts of the top keywords, limited to 8 topics
topic_model.visualize_barchart(top_n_topics=8)

In [None]:
# Visualize how the term scores of the fitted topics decrease as term rank increases
topic_model.visualize_term_rank()

In [None]:
# Visualize connections between topics using hierarchical clustering (limited to 12 topics)
topic_model.visualize_hierarchy(top_n_topics=12)

**SHAP**

In [None]:
# Initialise SHAP's JavaScript support (presumably required for interactive plots in the notebook)
shap.initjs()

In [None]:
# Kernel SHAP explainer over the model's class probabilities, with the full
# training set as background data
explainer = shap.KernelExplainer(model.predict_proba, X_train)
# SHAP values for the test samples
shap_values = explainer.shap_values(X_test)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; it recomputes the
# same explainer and SHAP values and can probably be removed.
explainer = shap.KernelExplainer(model.predict_proba, X_train)
shap_values = explainer.shap_values(X_test)

In [None]:
# Use 50 sampled training rows as background data instead of the full set.
# shap.sample returns the sampled data itself, not an explainer, so it has to
# be passed to KernelExplainer before shap_values can be called.
background = shap.sample(X_train, 50)
explainer = shap.KernelExplainer(model.predict_proba, background)
shap_values = explainer.shap_values(X_test)

**Principal Component Analysis (PCA)**

In [None]:
# Reduce the input features with PCA (n_components=0.99), fitted on the training set only
transformer = PCA(n_components=0.99)
transformer.fit(X_train)
# One column label per retained component: pc0, pc1, ...
pca_cols = [f"pc{idx}" for idx in range(transformer.n_components_)]
# Apply the same fitted projection to both splits
X_train = pd.DataFrame(transformer.transform(X_train), columns=pca_cols)
X_test = pd.DataFrame(transformer.transform(X_test), columns=pca_cols)

**SMOTE Implementation**

In [None]:
from imblearn.over_sampling import SMOTE
# Oversample the training split with SMOTE, using 2 nearest neighbours per sample.
# random_state is fixed so the synthetic samples are reproducible, like the
# other seeded steps in this notebook.
oversample = SMOTE(k_neighbors=2, random_state=42)
X_train, y_train = oversample.fit_resample(X_train, y_train)

**Mutual Information**

In [None]:
# Feature selection: score every feature by its mutual information with the target
mutual_info = mutual_info_classif(X_train, y_train, random_state=42)
mutual_info = dict(zip(X_train.columns, mutual_info))
# Order the scores from highest to lowest
bar_ind = pd.Series(mutual_info).sort_values(ascending=False)
# Keep the names of the features whose score exceeds 0.1
info_gain_list = [feature for feature, score in bar_ind.items() if score > 0.1]

In [None]:
# Bar plot of the mutual-information score per feature, highest first.
# bar_ind itself is rebuilt unsorted here; only the plotted copy is sorted.
bar_ind = pd.Series(mutual_info)
bar_ind.sort_values(ascending=False).plot.bar(figsize=(20, 8))

**Model Parameters**

**LSTM**

In [None]:
max_words = 500
max_len = 50

def tokenize_pad_sequences(text, num_words=None, maxlen=None):
    '''
    Tokenize the input texts into sequences of integers and pad each
    sequence to the same length.

    Args:
        text: Collection of strings. The tokenizer is fitted on it and then
            applied to it.
        num_words: Value passed to the Tokenizer as num_words. Defaults to the
            module-level max_words.
        maxlen: Value passed to pad_sequences as maxlen. Defaults to the
            module-level max_len.

    Returns:
        (X, tokenizer): the padded sequences and the fitted tokenizer.
    '''
    # Resolve the defaults at call time so the module-level settings still
    # apply when the arguments are omitted.
    if num_words is None:
        num_words = max_words
    if maxlen is None:
        maxlen = max_len
    # Text tokenization
    tokenizer = Tokenizer(num_words=num_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)
    # Transform text to sequences of integers
    X = tokenizer.texts_to_sequences(text)
    # Pad sequences (at the end) to the same length
    X = pad_sequences(X, padding='post', maxlen=maxlen)
    # return sequences together with the fitted tokenizer
    return X, tokenizer

In [None]:
# Tokenize and pad the documents, keeping the fitted tokenizer
X, tokenizer = tokenize_pad_sequences(data)
# Stratified 80/20 train/test split
X_trn, X_tst, y_trn, y_tst = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Stratified 70/30 train/validation split of the training part (56/24/20 overall)
X_trn, X_vld, y_trn, y_vld = train_test_split(X_trn, y_trn, test_size=0.3, random_state=42, stratify=y_trn)


In [None]:
vocab_size = 5000
embedding_size = 32
# NOTE(review): the fit cell passes its own epochs value, so this setting is not used there
epochs=8
# Sequence length the model is built for. It has to equal the padding length
# used when the inputs were tokenized (max_len = 50 in the tokenization cell).
# The former value of 512 did not match the padded inputs, and model.build()
# used a third, unrelated length (100).
max_len = 50

# Embedding -> Conv1D -> MaxPooling1D -> bidirectional LSTM -> Dropout -> Dense
model= Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_len))
model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.1))
# softmax yields one probability distribution over the 4 classes, matching the
# categorical_crossentropy loss (independent sigmoids do not sum to 1)
model.add(Dense(4, activation='softmax'))
model.build((None, max_len))

plot_model(model, show_shapes = True)

In [None]:
# Compile with categorical cross-entropy and track accuracy, precision and recall
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy',tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

In [None]:
# Early stopping on the validation loss with a patience of 5 epochs
# NOTE(review): with epochs=6 below, patience=5 can hardly end training early — confirm the intended values
es = EarlyStopping(monitor = 'val_loss', patience=5)
batch_size = 64

# Train on the training split, validating on the validation split; the
# returned History object is kept in `history`
history = model.fit(X_trn, y_trn,
                    validation_data=(X_vld, y_vld),
                    batch_size=batch_size, epochs=6, verbose=1,
                    callbacks = [es])

**XLNET**

In [None]:
# Create sentence and label lists
sentences = data.text_lm.values

# XLNet places its special tokens at the END of the sequence and spells them
# <sep> and <cls>. The BERT-style "[SEP] [CLS]" strings are not special tokens
# for the XLNet tokenizer and would be split into ordinary word pieces.
sentences = [sentence + " <sep> <cls>" for sentence in sentences]
labels = data.Labels.values

In [None]:
# NOTE(review): do_lower_case=True is combined with a *cased* checkpoint — confirm this is intended
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

# The id conversion and padding below used to live in a later cell, although
# the attention masks and the train/validation split need input_ids. They are
# moved up so the cells run top to bottom without a NameError.

# Set the maximum sequence length (sequences are padded / truncated to it at the end)
MAX_LEN = 128
# Use the XLNet tokenizer to convert the tokens to their index numbers in the XLNet vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks: 1.0 for every id greater than 0, 0.0 for the zero padding
attention_masks = [[float(token_id > 0) for token_id in seq] for seq in input_ids]

# Use train_test_split to split our data into train and validation sets for training.
# Both calls share random_state and test_size so that inputs and masks are split identically.
train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels,
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [None]:
# Turn every split (inputs, labels, masks) into a torch tensor for the model
train_inputs, validation_inputs = torch.tensor(train_inputs), torch.tensor(validation_inputs)
train_labels, validation_labels = torch.tensor(train_labels), torch.tensor(validation_labels)
train_masks, validation_masks = torch.tensor(train_masks), torch.tensor(validation_masks)

In [None]:
# Batch size used for both the training and the validation loader
batch_size = 32


def _dataset_sampler_loader(inputs, masks, labels, sampler_cls):
    """Bundle the tensors into a TensorDataset and wrap it in a DataLoader.

    Returns the dataset, the sampler built from ``sampler_cls`` and the
    DataLoader that iterates the dataset in ``batch_size`` batches.
    """
    dataset = TensorDataset(inputs, masks, labels)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training batches are drawn with a RandomSampler
train_data, train_sampler, train_dataloader = _dataset_sampler_loader(
    train_inputs, train_masks, train_labels, RandomSampler)

# Validation batches are drawn with a SequentialSampler
validation_data, validation_sampler, validation_dataloader = _dataset_sampler_loader(
    validation_inputs, validation_masks, validation_labels, SequentialSampler)

In [None]:
import torch

from transformers import XLNetForSequenceClassification

# NOTE(review): num_labels=2 assumes a binary task — confirm against the label set
model = XLNetForSequenceClassification.from_pretrained('xlnet-base-cased', num_labels=2)

# Use the GPU when one is available and fall back to the CPU otherwise. The
# training loop moves each batch to `device`, so it is defined here.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
# 'layer_norm.weight' is added for the layer-norm weights; 'gamma' and 'beta'
# are kept from the original list.
no_decay = ['bias','gamma','beta','layer_norm.weight']
# torch optimizers read the per-group key 'weight_decay'. The former
# 'weight_decay_rate' key was ignored, so both groups fell back to the
# optimizer's default decay.
optimizer_grouped_parameters = [
    {'params':[p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
      'weight_decay':0.01},
    {'params':[p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
      'weight_decay':0.0}
]

# The training loop calls optimizer.zero_grad() / optimizer.step(), so the
# optimizer is created here from the parameter groups.
# NOTE(review): lr=2e-5 is a common fine-tuning choice, not a value taken from this notebook — tune as needed
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=2e-5)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose arg-max prediction equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: Array of true class indices; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == true_classes)
    return n_correct / len(true_classes)

In [None]:
# Per-batch training losses, collected across all epochs (e.g. for plotting)
train_loss_set = []

# Number of training epochs
epochs = 4

# Fine-tuning loop. `model`, `optimizer`, `device` and `train_dataloader` are
# expected to be defined by earlier cells.
# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):


  # Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables, reset at the start of every epoch
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Move every tensor of the batch to the target device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader (ids, attention mask, labels)
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass; labels are passed in, and the first two outputs are used
    # as loss and logits below
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    loss = outputs[0]
    # NOTE(review): logits is assigned but not used further in this loop
    logits = outputs[1]
    train_loss_set.append(loss.item())
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()


    # Update tracking variables
    # (nb_tr_examples is accumulated but not read within this block)
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  # Mean training loss over the batches of this epoch
  print("Train loss: {}".format(tr_loss/nb_tr_steps))


**CatBoost**

In [None]:
from catboost import Pool

# Wrap the training and evaluation splits in CatBoost Pool objects
train_pool = Pool(X_train, y_train)
validate_pool = Pool(X_test, y_test)

# Settings passed to CatBoostClassifier
params = dict(
    leaf_estimation_method='Gradient',
    learning_rate=1,
    max_depth=3,
    bootstrap_type='Bernoulli',
    objective='MultiClass',
    subsample=0.8,
    random_state=42,
    verbose=0,
    eval_metric='TotalF1',
    early_stopping_rounds=100,
)

# Fit on the training pool, using the second pool as evaluation set
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

In [None]:
# Grid search over CatBoost depth, learning rate and iteration count
parameters = {
    'depth': [4, 5, 6, 7],
    'learning_rate': [0.01, 0.04],
    'iterations': [10, 20, 30],
    'verbose': [False],
}
CBC = CatBoostClassifier()
Grid_CBC = GridSearchCV(estimator=CBC, param_grid=parameters)
Grid_CBC.fit(X_train, y_train)
# Best combination found; the bare expression shows it as the cell output
cbc_best_params = Grid_CBC.best_params_
cbc_best_params

**SVM**

In [None]:
# Support vector classifier with probability estimates enabled
SVM = SVC(probability=True)
# NOTE(review): 'gamma' is not used by the linear kernel and 'degree' only by
# the poly kernel, so part of this grid evaluates equivalent models
param={'kernel':('linear', 'poly','rbf'),
      'C':np.arange(1,5),
      'gamma': [0.1, 1],
      'degree':[2,4]}

SVM_grid = GridSearchCV(estimator = SVM,
                       param_grid = param)

# The search was defined but never fitted, and svm_best_params (used when the
# final classifiers are built) was never assigned. Fit it and expose the best
# parameters the same way the other model sections do.
SVM_grid.fit(X_train, y_train)
svm_best_params = SVM_grid.best_params_
svm_best_params

**Random Forest with GridsearchCV**

In [None]:
# Candidate values for the random-forest grid search
n_estimators = [20, 60, 100, 120]   # number of trees in the forest
max_features = [0.2, 0.6, 1.0]      # features to consider at every split
max_depth = [2, 8, None]            # maximum number of levels in a tree
max_samples = [0.5, 0.75, 1.0]      # number of samples

# Grid handed to GridSearchCV, keyed by estimator parameter name
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    max_samples=max_samples,
)

In [None]:
# Grid search over param_grid for a default random forest; n_jobs=-1 is
# passed so the search can run in parallel
RF = RandomForestClassifier()
rf_grid = GridSearchCV(estimator = RF,
                       param_grid = param_grid,
                       n_jobs = -1)

In [None]:
# Run the grid search on the training data and keep the best parameter combination
rf_grid.fit(X_train,y_train)
rf_best_params = rf_grid.best_params_
# Bare expression: shows the selected parameters as the cell output
rf_best_params

**Multi Layer Perceptron (MLP) with GridsearchCV**

In [None]:
# Grid for the multi-layer perceptron search
params_mlp = {
    'hidden_layer_sizes': [(10,20,30)],
    # 'logistic' is scikit-learn's name for the sigmoid activation;
    # 'sigmoid' is not an accepted value for MLPClassifier
    'activation': ['tanh','relu','logistic'],
    'solver': ['adam'],
    'alpha': [0.0001],
    'learning_rate': ['constant', 'adaptive'],
    'random_state': [42]
}
# Fit the search (max_iter raised to 1000) and keep the best parameters
mlp = GridSearchCV(MLPClassifier(max_iter=1000), params_mlp, n_jobs=-1).fit(X_train, y_train)
mlp_best_params = mlp.best_params_
mlp_best_params

**Pipeline for Code**

In [None]:
# SMOTE only implements fit_resample, so the pipeline has to be
# imbalanced-learn's Pipeline, which resamples during fit only.
# scikit-learn's own Pipeline rejects samplers as intermediate steps.
from imblearn.pipeline import Pipeline

# Final models, each built from the best parameters found by its grid search
classifiers = [RandomForestClassifier(**rf_best_params),
               SVC(**svm_best_params),
               CatBoostClassifier(**cbc_best_params),
               # keep the iteration budget that was used during the MLP grid search
               MLPClassifier(**{'max_iter': 1000, **mlp_best_params})]
# Macro-averaged metrics. pos_label is dropped because it is ignored (with a
# warning) whenever average is not 'binary'.
scoring = {'accuracy' : make_scorer(accuracy_score),
           'precision' : make_scorer(precision_score, average='macro'),
           'recall' : make_scorer(recall_score, average='macro'),
           'f1_score' : make_scorer(f1_score, average='macro')}
all_scores = []
f1_scores = []
accuracy_scores = []

precision_scores = []
recall_scores = []
for clf in classifiers:
    # scale -> oversample -> PCA -> classifier, refitted inside every fold
    # NOTE(review): SMOTE here uses its default k_neighbors, unlike the
    # standalone SMOTE cell (k_neighbors=2) — confirm that every class has
    # enough samples per training fold
    clf_pipe = Pipeline([('scale',MinMaxScaler()),('smt', SMOTE(random_state=43)), ('pca', PCA(n_components=0.99)),
                              ('cls', clf)] )
    # NOTE(review): X and Y must be the feature matrix and labels prepared
    # for the classical models — verify, since other cells use lower-case y
    scores = cross_validate(clf_pipe, X, Y, cv=StratifiedKFold(n_splits=10),scoring=scoring)
    all_scores.append(scores)
    # Average every array returned by cross_validate over the 10 folds
    mean_scores = {name: values.mean() for name, values in scores.items()}
    f1_scores.append(mean_scores['test_f1_score'])
    accuracy_scores.append(mean_scores['test_accuracy'])
    precision_scores.append(mean_scores['test_precision'])
    recall_scores.append(mean_scores['test_recall'])
    print(type(clf).__name__,mean_scores)