In [1]:
import numpy as np
import pandas as pd
import gensim
import nltk
import re
from sklearn.model_selection import KFold,cross_val_score,train_test_split,cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

df = pd.read_excel('C:/Users/alexa/OneDrive/Documenten/Thesis Lab/dataset_final.xlsx')

In [2]:
df.head(5)

Unnamed: 0,Index,Filename,Countrycode,Label,Sentence_translated,Sentence_original
0,0,011_DEU_2020-03-27c_2022-05-20_10:59:17.4.txt,DEU,NONE,Law on the Establishment of an Economic Stabil...,'Gesetz zur Errichtung eines Wirtschaftsstabil...
1,1,011_DEU_2020-03-27c_2022-05-20_10:59:17.4.txt,DEU,NONE,I p. 543 (no.,"'I S. 543 (Nr.',"
2,2,011_DEU_2020-03-27c_2022-05-20_10:59:17.4.txt,DEU,NONE,"14); Valid from March 28th, 2020 4 changes | P...",'14); Geltung ab 28.03.2020 4 Änderungen | Dr...
3,3,011_DEU_2020-03-27c_2022-05-20_10:59:17.4.txt,DEU,NONE,28th,"'28.',"
4,4,011_DEU_2020-03-27c_2022-05-20_10:59:17.4.txt,DEU,NONE,March 2020 StFG § 6- § 8- § 10- § 14e- § 15- §...,'März 2020 StFG § 6- § 8- § 10- § 14e- § 15- ...


In [3]:
# We'll write a function which will clean the text and prepare it.
def cleanText(text):
    cleaned = re.sub("[^a-zA-Z0-9']"," ",text)
    lowered = cleaned.lower()
    return lowered.strip()

cleanText("Let's test our function, by writing this string!")

"let's test our function  by writing this string"

In [4]:
x,y = np.asarray(df["Sentence_original"]),np.asarray(df["Label"])

x_cleaned = [cleanText(t) for t in x]
x_cleaned[:4]

["'gesetz zur errichtung eines wirtschaftsstabilisierungsfonds  wirtschaftsstabilisierungsfondsgesetz   wstfg  g  v  27 03 2020 bgbl '",
 "'i s  543  nr '",
 "'14   geltung ab 28 03 2020 4  nderungen   drucksachen   entwurf   begr ndung   wird in 8 vorschriften zitiert eingangsformel artikel 1  nderung des finanzmarktstabilisierungsfondsgesetzes artikel 2  nderung des finanzmarktstabilisierungsbeschleunigungsgesetzes artikel 3  nderung des kreditwesengesetzes artikel 4  nderung des wertpapierhandelsgesetzes artikel 5 inkrafttreten schlussformel  eingangsformel   der bundestag hat mit zustimmung des bundesrates das folgende gesetz beschlossen   inhaltsverzeichnis   ausdrucken pdf   nach oben  artikel 1  nderung des finanzmarktstabilisierungsfondsgesetzes  artikel 1 wird in 3 vorschriften zitiert und  ndert mwv '",
 "'28 '"]

In [5]:
# Also we should convert our categories to the integer labels
label_map = {cat:index for index,cat in enumerate(np.unique(y))}
y_prep = np.asarray([label_map[l] for l in y])

label_map

{'CONSUMER PROTECTION': 0,
 'EMPLOYMENT POLICY': 1,
 'FINANCIAL POLICY: DIRECT PAYMENTS': 2,
 'FINANCIAL POLICY: GUARANTEES': 3,
 'FINANCIAL POLICY: RESTRUCTURING OF LOAN TERMS': 4,
 'FISCAL POLICY': 5,
 'INDUSTRIAL POLICY': 6,
 'NONE': 7,
 'PRICE CONTROL': 8,
 'SOCIAL WELFARE POLICY': 9}

In [6]:
x_tokenized = [[w for w in sentence.split(" ") if w != ""] for sentence in x_cleaned]
x_tokenized[0]

["'gesetz",
 'zur',
 'errichtung',
 'eines',
 'wirtschaftsstabilisierungsfonds',
 'wirtschaftsstabilisierungsfondsgesetz',
 'wstfg',
 'g',
 'v',
 '27',
 '03',
 '2020',
 'bgbl',
 "'"]

In [7]:
for i in x_tokenized[0]:
    print(i)
    

'gesetz
zur
errichtung
eines
wirtschaftsstabilisierungsfonds
wirtschaftsstabilisierungsfondsgesetz
wstfg
g
v
27
03
2020
bgbl
'


In [8]:
# Now we'll create our model 
import time

start = time.time()

model = gensim.models.FastText(x_tokenized,
                 vector_size=600, sg=0, ns_exponent=1
                 # Size is the length of our vector.
                )

end = round(time.time()-start,2)
print("This process took",end,"seconds.")

This process took 57.88 seconds.


In [9]:
model.wv.most_similar("free")

[('ser', 0.9837781190872192),
 ('iscrivere', 0.982687771320343),
 ('svolgere', 0.9824087023735046),
 ('subi', 0.9770069122314453),
 ('percevoir', 0.9752346277236938),
 ('sul', 0.9749611616134644),
 ('sud', 0.9737043976783752),
 ('sua', 0.973050057888031),
 ('dazu', 0.9716665148735046),
 ('tercer', 0.9714767932891846)]

In [10]:
class Sequencer():
    
    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):
        
        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        """
        temp_vocab = Vocab which has all the unique words
        self.vocab = Our last vocab which has only most used N words.
    
        """
        temp_vocab = list(set(all_words))
        self.vocab = []
        self.word_cnts = {}
        """
        Now we'll create a hash map (dict) which includes words and their occurencies
        """
        for word in temp_vocab:
            # 0 does not have a meaning, you can add the word to the list
            # or something different.
            count = len([0 for w in all_words if w == word])
            self.word_cnts[word] = count
            counts = list(self.word_cnts.values())
            indexes = list(range(len(counts)))
        
        # Now we'll sort counts and while sorting them also will sort indexes.
        # We'll use those indexes to find most used N word.
        cnt = 0
        while cnt + 1 != len(counts):
            cnt = 0
            for i in range(len(counts)-1):
                if counts[i] < counts[i+1]:
                    counts[i+1],counts[i] = counts[i],counts[i+1]
                    indexes[i],indexes[i+1] = indexes[i+1],indexes[i]
                else:
                    cnt += 1
        
        for ind in indexes[:max_words]:
            self.vocab.append(temp_vocab[ind])
                    
    def textToVector(self,text):
        # First we need to split the text into its tokens and learn the length
        # If length is shorter than the max len we'll add some spaces (100D vectors which has only zero values)
        # If it's longer than the max len we'll trim from the end.
        tokens = text.split()
        len_v = len(tokens)-1 if len(tokens) < self.seq_len else self.seq_len-1
        vec = []
        for tok in tokens[:len_v]:
            try:
                vec.append(self.embed_matrix[tok])
            except Exception as E:
                pass
        
        last_pieces = self.seq_len - len(vec)
        for i in range(last_pieces):
            vec.append(np.zeros(600,))
        
        return np.asarray(vec).flatten()
                
                
            

In [11]:
sequencer = Sequencer(all_words = [token for seq in x_tokenized for token in seq],
              max_words = 1500,
              seq_len = 15,
              embedding_matrix = model.wv
             )

In [12]:
test_vec = sequencer.textToVector("i am in love with you")
test_vec

array([-0.00224058,  0.02579685, -0.13051185, ...,  0.        ,
        0.        ,  0.        ])

In [13]:
type(test_vec)

numpy.ndarray

In [14]:
test_vec.shape

(9000,)

In [15]:
# But before creating a PCA model using scikit-learn let's create
# vectors for our each vector
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in x_tokenized])
print(x_vecs.shape)

(5450, 9000)


In [16]:
from sklearn.decomposition import PCA
pca_model = PCA(n_components=1090)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

Sum of variance ratios:  0.9999986645912589


In [17]:
x_comps = pca_model.transform(x_vecs)
x_comps.shape

(5450, 1090)

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x_comps,y_prep,test_size=0.2,random_state=42)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(4360, 1090)
(1090, 1090)
(4360,)
(1090,)


In [19]:
start = time.time() 

svm_classifier = LinearSVC(dual=False)
svm_classifier.fit(x_train,y_train)

end = time.time()
process = round(end-start,2)
print("Support Vector Machine Classifier has fitted, this process took {} seconds".format(process))

Support Vector Machine Classifier has fitted, this process took 85.33 seconds


In [20]:
prediction = svm_classifier.predict(x_test)

In [21]:
svm_classifier.score(x_test,y_test)

0.7412844036697248

In [22]:
print("PREDICTION ON TEST")
print(classification_report(y_test, prediction))

PREDICTION ON TEST
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         9
           1       0.40      0.11      0.17        57
           2       0.45      0.17      0.25        81
           3       0.25      0.02      0.04        48
           4       0.00      0.00      0.00        20
           5       0.33      0.03      0.06        31
           6       0.00      0.00      0.00         4
           7       0.77      0.97      0.85       813
           8       0.00      0.00      0.00         6
           9       0.20      0.05      0.08        21

    accuracy                           0.74      1090
   macro avg       0.24      0.13      0.14      1090
weighted avg       0.65      0.74      0.67      1090



Note: precision is higher than recall, meaning 

In [32]:
from sklearn.model_selection import KFold,cross_val_score,train_test_split,cross_val_predict

from sklearn import metrics
scores = cross_val_score(svm_classifier, x_train, y_train, cv=10, scoring='accuracy')

In [33]:
print(scores)

[0.76146789 0.75       0.73623853 0.7293578  0.74311927 0.74541284
 0.75688073 0.75917431 0.76605505 0.75229358]


In [34]:
import statistics
average = statistics.mean(scores)
print(average)

0.75
