# Grant Classification
## Modelling

In [15]:
%matplotlib inline

In [31]:
import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import spacy
import nltk

from sklearn.preprocessing import LabelEncoder 
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import naive_bayes, svm, linear_model
from sklearn.metrics import accuracy_score

import tensorflow as tf
import xgboost
from gensim.models import Word2Vec, KeyedVectors
from keras import Sequential, layers
from keras.backend import clear_session

plt.style.use('ggplot')
np.random.seed(359)  # reproducibility

#### Getting the data ready:
See data analysis notebook

In [3]:
de = spacy.load('de_core_news_sm')
stopwords = de.Defaults.stop_words

# Read data
data = pd.read_excel(io="../data/data2020.xlsx",
                     usecols=['Politikbereich', 'Zweck'])

# Clean data
umlauts = {'ä': 'ae', 'ö': 'oe', 'ü': 'ue'}
data['Zweck_clean'] = data['Zweck'].str.replace('[^\w\s]+', ' ', regex=True) \
                             .str.replace('\d+', ' ', regex=True) \
                             .str.replace(' +', ' ', regex=True) \
                             .replace(umlauts, regex=True) \
                             .str.lower() \
                             .apply(lambda x: [word.lemma_ for word in de(str(x))]) \
                             .apply(lambda x: [item for item in x if item not in stopwords]) \
                             .str.join(' ')

# Train/test split
data = data.groupby('Politikbereich').filter(lambda x: len(x) > 1)
train_X, test_X, train_y, test_y = train_test_split(data['Zweck_clean'], 
                                                    data['Politikbereich'], 
                                                    test_size=0.2,
                                                    stratify=data['Politikbereich'])

# Encoding y
encoder = LabelEncoder()
encoder = encoder.fit(data['Politikbereich'])
train_y_enc = encoder.transform(train_y)
test_y_enc = encoder.transform(test_y)

# Vectorising x
Tfidf_vect = TfidfVectorizer(max_features=500)  # TOD: play with that
Tfidf_vect.fit(data['Zweck_clean'])
train_X_tfidf = Tfidf_vect.transform(train_X)
test_X_tfidf = Tfidf_vect.transform(test_X)

### Modelling approaches

We are first going to test several simpler classification approaches - Generalized Linear Model, Naive Bayes, Support Vector Machines and XGBoost.

We are going to fit all of them without any tuning and then tune them as to maximize performance.

We are going to compare them on accuracy, f1 micro, area under ROC curve.

#### Logistic Regression, Naive Bayes and SVMs

We are going to use these models as a baseline for the more complex tuned models we are going to try to implement.

In [7]:
LG = linear_model.LogisticRegression(max_iter=200)
LG.fit(train_X_tfidf, train_y_enc)

predictions_LG = LG.predict(test_X_tfidf)

print(accuracy_score(predictions_LG, test_y_enc))

0.6308784383318545


In [8]:
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_X_tfidf, train_y_enc)

predictions_NB = Naive.predict(test_X_tfidf)

print(accuracy_score(predictions_NB, test_y_enc))

0.6069210292812778


In [9]:
SVM = svm.SVC(C=1, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_X_tfidf, train_y_enc)

predictions_SVM = SVM.predict(test_X_tfidf)

print(accuracy_score(predictions_SVM, test_y_enc))

0.6339840283939663


#### XGBoost

In [12]:
xg = xgboost.XGBClassifier(use_label_encoder=False, eval_metric="rmse")

xg.fit(train_X_tfidf, train_y_enc)
predictions_xg = xg.predict(test_X_tfidf)
print("No tuning:", accuracy_score(predictions_xg, test_y_enc))

No tuning: 0.6353149955634427


#### Tuning XGBoost using Grid Search
We choose the parameters we want to test. The size and number of trees are always a parameter of interest. We also want to add a few more to try to reduce overfitting - by not allowing splits if there are too many samples or by not using all data in a given boosting iteration for example.

We will furthermore try using the mean absolute error as an evaluation metric. A prediction is that there may be a lot of outliers (for example single words that aren't actually connected to the topic and are there because of other/random factors - such as names). The mean absolute error penalizes those outliers more (because it doesn't squre their error). As done in different variations of Robust Linear Regression, this might give a boost to the model and should be examined.

In [13]:
params = {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'min_child_weight': [1, 10],
        'gamma': [0.5, 1, 2],
        'subsample': [0.7, 1.0],
        'eval_metric': ["mae", 'rmse'],
        }

In [14]:
xg = xgboost.XGBClassifier(use_label_encoder=False)

skf = StratifiedKFold(n_splits=3, shuffle = True, random_state = 359)
grid = GridSearchCV(estimator=xg, param_grid=params, scoring='f1_micro', n_jobs=8, cv=skf.split(train_X_tfidf,train_y_enc))

grid.fit(train_X_tfidf, train_y_enc)
predictions_xg_tuned = grid.best_estimator_.predict(test_X_tfidf)
print("Tuned model accuracy:", accuracy_score(predictions_xg_tuned, test_y_enc))
print("Parameters used:", grid.best_params_)



Tuned model accuracy: 0.6353149955634427
Parameters used: {'eval_metric': 'mae', 'gamma': 0.5, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}


Parameters used: {'eval_metric': 'mae', 'gamma': 0.5, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}

#### Evaluation of Configuration

Selecting a baseline model

We are going to use the following baselines:
1. Always predicting the most populous class (ca 25% accuracy)
2. Using a generalized linear model (ca 60% accuracy)

#### UNFINISHED: CNN

We are going to train a convolutional neural network to try to beat the other approaches.

In [174]:
clear_session()

train_y_k = tf.keras.utils.to_categorical(train_y_enc)
test_y_k = tf.keras.utils.to_categorical(test_y_enc)

model = Sequential()
input_dim = train_X_tfidf.shape[1]

#model.add(layers.Embedding(input_dim=input_dim, 
#                           output_dim=200, 
#                           input_length=500))
#model.add(layers.Conv1D(128, 5, activation='relu'))
#model.add(layers.GlobalMaxPooling1D())
model.add(layers.Dense(200, input_dim=input_dim, activation='relu'))
model.add(layers.Dense(29, activation='softmax'))
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 200)               100200    
                                                                 
 dense_1 (Dense)             (None, 29)                5829      
                                                                 
Total params: 106,029
Trainable params: 106,029
Non-trainable params: 0
_________________________________________________________________


In [175]:
clear_session()

model.fit(train_X_tfidf.toarray(), train_y_k,
          epochs=10,
          verbose=False,
          validation_data=(test_X_tfidf.toarray(), test_y_k),
          batch_size=100)
loss, accuracy = model.evaluate(train_X_tfidf.toarray(), train_y_k, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(test_X_tfidf.toarray(), test_y_k, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

Training Accuracy: 0.6989
Testing Accuracy:  0.6362
