## Morphological Classification using an SVM

**Loading files**

In [2]:
def _read_rows(path):
    """Return the whitespace-split tokens of every non-blank line in ``path``.

    The file is read as UTF-8 inside a ``with`` block so the handle is closed
    as soon as reading finishes (the bare ``open(...)`` in a comprehension
    left it open until garbage collection).

    Blank lines are skipped: they would otherwise become empty rows, and the
    classifier in this notebook indexes ``row[0]`` and ``row[1]`` on every row.
    """
    with open(path, encoding='utf-8') as handle:
        # str.split() with no argument already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        rows = (line.split() for line in handle)
        return [tokens for tokens in rows if tokens]


# One list of rows per data set. The classifier below reads row[0] as the
# class label and row[1] as the word.
es_v = _read_rows('es_v.txt')
fi_na = _read_rows('fi_na.txt')
fi_v = _read_rows('fi_v.txt')
de_v = _read_rows('de_v.txt')
de_n = _read_rows('de_n.txt')

### **Importing modules**

In [8]:
from sklearn.feature_extraction import DictVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from hyphen import Hyphenator

**Conjugation-Declension classifier function**

In [17]:
def conj_decl_classifier(inp,name):
    """Train and evaluate a linear SVM that predicts a word's inflection class.

    The rows of ``inp`` are split 90/10 into a training and a test portion
    with a fixed seed. Every word is turned into a dictionary of binary
    features (its proper prefixes and suffixes and, for Spanish and German,
    the vowels of its penultimate syllable), vectorized with
    ``DictVectorizer`` and used to fit ``svm.LinearSVC``. Accuracy and
    weighted F1 on the test portion are printed.

    Parameters
    ----------
    inp : sequence of sequences
        One row per word; ``row[0]`` is the class label, ``row[1]`` the word.
    name : str
        Data-set name used in the printed report. Its first
        whitespace-separated token selects the hyphenation dictionary
        ('Spanish' -> 'es_ES', 'German' -> 'de_DE'); any other value,
        including an empty string, disables the syllable feature.

    Returns
    -------
    None
        The scores are printed, not returned.
    """
    # Both portions still contain (label, word, ...) rows, not just features.
    train_rows, test_rows = train_test_split(inp, test_size=0.1, random_state=1)

    hyphenation_locales = {'Spanish': 'es_ES', 'German': 'de_DE'}
    name_tokens = name.split()
    # A blank name means "no hyphenation" rather than an IndexError.
    language = hyphenation_locales.get(name_tokens[0], '') if name_tokens else ''
    # Hyphenator splits words into syllables for the penultimate-syllable feature.
    h = Hyphenator(language) if language != '' else None

    # NOTE(review): only unaccented lower-case vowels are matched, so letters
    # such as 'á' or 'ü' are ignored -- confirm this is intended.
    vowels = ('a', 'e', 'i', 'o', 'u')

    def featurizer(words):
        """Build one {feature_name: 1} dictionary per word."""
        features = []
        for word in words:
            feature_dictionary = {}
            # Every split point strictly inside the word yields one prefix and
            # one suffix feature. range() also terminates for an empty word,
            # where the former `while i != len(l)` loop never ended.
            for cut in range(1, len(word)):
                feature_dictionary['p#' + word[:cut]] = 1
                feature_dictionary['s#' + word[cut:]] = 1
            if h is not None:
                syllables = h.syllables(word)  # computed once per word
                # NOTE(review): the feature is only added for words of three
                # or more syllables although a penultimate syllable exists
                # from two -- threshold kept as in the original; confirm.
                if len(syllables) > 2:
                    penultimate_vowels = ''.join(
                        ch for ch in syllables[-2] if ch in vowels)
                    feature_dictionary['m#' + penultimate_vowels] = 1
            features.append(feature_dictionary)
        return features

    vectorizer = DictVectorizer(sparse=True)
    x = vectorizer.fit_transform(featurizer([row[1] for row in train_rows]))
    y = [row[0] for row in train_rows]

    # Seeded like the split so that re-running the notebook reproduces the
    # same scores.
    classifier = svm.LinearSVC(random_state=1)
    classifier.fit(x, y)

    classes_test = [row[0] for row in test_rows]
    # transform() (not fit_transform) so unseen test features are dropped and
    # the columns line up with the training matrix.
    to_predict = vectorizer.transform(featurizer([row[1] for row in test_rows]))
    predicted_classes = classifier.predict(to_predict)

    acc = accuracy_score(classes_test, predicted_classes) * 100
    f1 = f1_score(classes_test, predicted_classes, average='weighted')
    print('The accuracy of this classifier on ' + name + ' is ' + "%.2f" % acc + '%.')
    print('The f1_score of this classifier on ' + name + ' is ' + "%.2f" % f1)

In [18]:
# Spanish verbs: the 'Spanish' prefix of the name switches on es_ES hyphenation.
conj_decl_classifier(inp=es_v, name='Spanish verbs')

The accuracy of this classifier on Spanish verbs is 91.71%.
The f1_score of this classifier on Spanish verbs is 0.90


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [19]:
# Finnish nouns: no hyphenation dictionary is selected, so only prefix/suffix features are used.
conj_decl_classifier(inp=fi_na, name='Finnish nouns')

The accuracy of this classifier on Finnish nouns is 83.50%.
The f1_score of this classifier on Finnish nouns is 0.83


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [20]:
# Finnish verbs: no hyphenation dictionary is selected, so only prefix/suffix features are used.
conj_decl_classifier(inp=fi_v, name='Finnish verbs')

The accuracy of this classifier on Finnish verbs is 94.47%.
The f1_score of this classifier on Finnish verbs is 0.94


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [21]:
# German verbs: the 'German' prefix of the name switches on de_DE hyphenation.
conj_decl_classifier(inp=de_v, name='German verbs')

The accuracy of this classifier on German verbs is 83.06%.
The f1_score of this classifier on German verbs is 0.79


  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


In [22]:
# German nouns: the 'German' prefix of the name switches on de_DE hyphenation.
conj_decl_classifier(inp=de_n, name='German nouns')

The accuracy of this classifier on German nouns is 73.93%.
The f1_score of this classifier on German nouns is 0.71


  'precision', 'predicted', average, warn_for)
