In [2]:
import nltk
nltk.download('brown')
nltk.download('universal_tagset')
nltk.download('tagsets')

[nltk_data] Downloading package brown to /home/ankita/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package universal_tagset to
[nltk_data]     /home/ankita/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!
[nltk_data] Downloading package tagsets to /home/ankita/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


True

In [3]:
import pandas as pd
import numpy as np
from sklearn.utils import resample #downsample the dataset
from sklearn.model_selection import train_test_split,GridSearchCV 
from sklearn.preprocessing import scale #scale and center data
from sklearn.svm import SVC
from sklearn.preprocessing import MultiLabelBinarizer
from collections import defaultdict
from nltk.corpus import brown #Brown Corpus 
from sklearn import svm, datasets
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,LabelBinarizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from gensim.models import Word2Vec
import gensim
from nltk.tag import AffixTagger 
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from nltk import ConfusionMatrix
from itertools import chain 
from sklearn.ensemble import BaggingClassifier

In [4]:
class SVM_metrics():
    
    def __init__(self, test_actual_tags, test_predicted_tags):
        '''
        The test_actual_tags contains actual tags for the above setences.
        The test_predicted_tags contains predicted tags for the test data.
        '''
        self.counter_dict = defaultdict(lambda: defaultdict(lambda:0))
        self.test_actual_tags = test_actual_tags
        self.test_predicted_tags = test_predicted_tags
        self.tag_metrics = defaultdict(lambda: defaultdict(lambda:0))
            
    def calc_tag_metrics(self):
        '''
        Calculate the per-POS accuracy for all the tags in the tag-set
        '''
        counter_dict = defaultdict(lambda: defaultdict(lambda:0))
        
        for i in range(len(self.test_actual_tags)):
            
            if(self.test_actual_tags[i] == self.test_predicted_tags[i]):
                counter_dict[self.test_actual_tags[i]]['TP'] += 1
            else:
                counter_dict[self.test_actual_tags[i]]['FN']    += 1
                counter_dict[self.test_predicted_tags[i]]['FP'] += 1
        
        for tag in counter_dict.keys():
            counter_dict[tag]['TN'] = TOTAL_TAGGED_WORDS - counter_dict[tag]['TP']- counter_dict[tag]['FN'] - counter_dict[tag]['FP']
        
        for tag in counter_dict.keys():
            try:
                self.tag_metrics[tag]['Precision'] = counter_dict[tag]['TP']/(counter_dict[tag]['TP']+counter_dict[tag]['FP'])
                self.tag_metrics[tag]['Recall'] = counter_dict[tag]['TP']/(counter_dict[tag]['TP']+counter_dict[tag]['FN'])
                self.tag_metrics[tag]['F1_score'] = 2*(self.tag_metrics[tag]['Precision']*self.tag_metrics[tag]['Recall'])/(self.tag_metrics[tag]['Precision']+self.tag_metrics[tag]['Recall'])
                self.tag_metrics[tag]['Accuracy'] = (counter_dict[tag]['TP']+ counter_dict[tag]['TN']) / TOTAL_TAGGED_WORDS
            except ZeroDivisionError:
                continue               
        self.counter_dict =  counter_dict
        
    def generate_confusion_matrix(self):
        '''
        Generate confusion matrix for the particular fold
        '''
        CM = ConfusionMatrix(self.test_actual_tags ,self.test_predicted_tags)
        print(CM)
        
    def accuracy(self):
        '''
        Calculate average accuracy score
        '''
        TP =0
        FP =0
        for tag in self.counter_dict.keys():
            TP += self.counter_dict[tag]['TP']
            FP += self.counter_dict[tag]['FP']
        
        return TP/(TP+FP)
        
        
    def print_sample(self):
        '''
        Prints a sample of n = 5 actual and predicted tagged sentences for reference
        '''
        for i in range(5):
            print("Actual :",self.test_actual_tags[i])
            print("Predicted :",self.test_predicted_tags[i])
        
    def get_tag_metrics(self):
        '''
        Prints the per POS precision,recall and F1 score of predicted tags
        '''
        
        print ("{:<10} {:<10} {:<10} {:<10}".format('TAG', 'PRECISION', 'RECALL','F1_SCORE'))
        
        for key in self.tag_metrics.keys():
            precision = str(round(self.tag_metrics[key]['Precision'], 2))
            recall    = str(round(self.tag_metrics[key]['Recall'], 2))
            F1_score  = str(round(self.tag_metrics[key]['F1_score'], 2))
            accuracy  = str(round(self.tag_metrics[key]['Accuracy'], 2))
            print ("{:<10} {:<10} {:<10} {:<10} ".format(key, precision,recall,F1_score)) 

In [5]:
'''Constant declarations'''

UNIVERSAL_TAGSET =['NOUN', 'DET', 'ADJ', 'ADP', '.', 'VERB', 'CONJ', 'NUM', 'ADV', 'PRT', 'PRON', 'X']
TOTAL_TAGGED_WORDS = len(brown.words())
FOLDS = 5

# Suffixes

NOUN_SUFFIX =['eer','er','ion','ity','ment','ness','or','tion','ship','th']
ADJ_SUFFIX = ['able','ible,','al','ant','ary','ful','ic','ious','ous','ive','less','y']
VERB_SUFFIX = ['ed','en','er','ing','ise','ize']
ADVERB_SUFFIX = ['ly','ward','wise']



' Not using prefixes '

In [6]:
'''
Training Word2Vec model for word vectorisations using Brown corpus
'''

sentences = brown.sents()
model = Word2Vec(sentences, min_count=1,size=50)
wordvectors = model.wv

In [7]:
def check_suffix(word,suffix):
    '''
    Utility function to check if a string in the array passed is a suffix or not
    '''
    val = False
    for s in suffix:
        val = val or word.endswith(s)
    return val

In [8]:
def check_prefix(word,prefix):
    '''
    Utility function to check if a string in the array passed is a prefix or not
    '''
    val = False
    for s in prefix:
        val = val or word.startswith(s)
    return val

In [9]:
def output_pos_suffix(word):
    '''
    Output predicted part of speech based on suffixes
    '''
    if check_suffix(word,NOUN_SUFFIX):
        return "Noun"
    elif check_suffix(word,ADJ_SUFFIX):
        return "Adj"
    elif check_suffix(word,VERB_SUFFIX):
        return "Verb"
    elif check_suffix(word,ADVERB_SUFFIX):
        return "Adverb"
    else: 
        return "None"
    

In [10]:
def morphological_features(sentence_words, index):
    """ 
    Extract morphological features of the word occuring in the sentences
    """
    word = sentence_words[index]
    return {
        'total_words': len(sentence_words),
        'word': word,
        'is_first': index == 0,
        'is_last': index == len(sentence_words) - 1,
        'len_word': len(word),
        'is_capitalized': word[0].upper() == word[0],
        'all_capital': word.upper() == word,
        'all_lower': word.lower() == word,
        'suffix': output_pos_suffix(word.lower()),
        'next_word_suffix': "None" if index == len(sentence_words) - 1 else output_pos_suffix(sentence_words[index + 1].lower()),
        'prev_word_suffix': "None" if index == 0 else output_pos_suffix(sentence_words[index - 1].lower()),
        'prev_word_len' : 0 if index == 0 else len(sentence_words[index - 1]),
        'next_word_len' :0  if index == len(sentence_words) - 1 else len(sentence_words[index + 1]),
        'prev_word': '' if index == 0 else sentence_words[index - 1],
        'next_word': '' if index == len(sentence_words) - 1 else sentence_words[index + 1],
        'similarity_prev': 0 if index == 0 else model.similarity(word,sentence_words[index - 1]),
        'similarity_next': 0 if index == len(sentence_words) - 1 else model.similarity(word,sentence_words[index + 1]),
    }

In [11]:
'''
Getting brown corpus tagged sentences 
'''
sent_tag = brown.tagged_sents(tagset='universal')
mod_sent_tag=[]



In [12]:
'''
Dividing the test data into test words and test tags
'''
words=[]
tags=[]
for s in sent_tag:
  temp_word=[]
  temp_tag=[]
  for (w,t) in s:
    #temp_word.append(w.lower())
    temp_word.append(w)
    temp_tag.append(t)
  words.append(temp_word)
  tags.append(temp_tag)

In [13]:
'''
Getting the morphological features of the word and appending to one list 
'''
full_data_list =[]
for i in range(len(words)):
    for j in range(len(words[i])):
        dd = morphological_features(words[i],j)
        l =list( dd.values())
        l.extend([tags[i][j]])
        full_data_list.append(l)
            



In [36]:
'''
Converting the list to dataframe for application to SVM
'''
df = pd.DataFrame(full_data_list, columns =['total_words', 'word', 'is_first', 'is_last','len_word', 'is_capitalized', 'is_all_caps', 'is_all_lower', 'suffix','next_word_suffix','prev_word_suffix','prev_word_len','next_word_len','prev_word', 'next_word','sim_prev','sim_next','pred'])
#df = df.head(200000)


In [37]:
'''
Converting True/False values to binary format
'''

df['is_first']=  df["is_first"].astype(int)
df['is_last']=  df["is_last"].astype(int)
df['is_capitalized']=  df["is_capitalized"].astype(int)
df['is_all_caps']=  df["is_all_caps"].astype(int)
df['is_all_lower']=  df["is_all_lower"].astype(int)

In [38]:
df.head()

Unnamed: 0,total_words,word,is_first,is_last,len_word,is_capitalized,is_all_caps,is_all_lower,suffix,next_word_suffix,prev_word_suffix,prev_word_len,next_word_len,prev_word,next_word,sim_prev,sim_next,pred
0,25,The,1,0,3,1,0,0,,,,0,6,,Fulton,0.0,0.347621,DET
1,25,Fulton,0,0,6,1,0,0,,Adj,,3,6,The,County,0.347621,0.821048,NOUN
2,25,County,0,0,6,1,0,0,Adj,,,6,5,Fulton,Grand,0.821048,0.950832,NOUN
3,25,Grand,0,0,5,1,0,0,,Adj,Adj,6,4,County,Jury,0.950832,0.934176,ADJ
4,25,Jury,0,0,4,1,0,0,Adj,,,5,4,Grand,said,0.934176,0.40843,NOUN


In [39]:
'''
Seggregating the dataset to features and output (X and Y)
'''

X = df.drop('pred',axis =1).copy()
Y = df['pred'].copy()

In [40]:
'''
Applying word2vec embeddings to all the words that occur in the frame
'''

X['word'] = X['word'].apply(lambda x : wordvectors.word_vec(x))
X['prev_word'] = X['prev_word'].apply(lambda x : wordvectors.word_vec(x) if (x != '') else [''])
X['next_word']= X['next_word'].apply(lambda x : wordvectors.word_vec(x) if (x != '') else [''])

In [41]:
'''
Breaking down the word vector list values to individual columns
'''

T = pd.DataFrame(X.word.values.tolist()).add_prefix('word_') # Current Word vector
P = pd.DataFrame(X.prev_word.values.tolist()).add_prefix('prev_') # previous word vector
N = pd.DataFrame(X.next_word.values.tolist()).add_prefix('next_') # next word vector

In [42]:
'''
Concatenate word vectorisations 
Also drop string columns which are now vectorised
'''

X = pd.concat([X, T], axis=1) 
X = pd.concat([X, P], axis=1)
X = pd.concat([X, N], axis=1)
X = X.drop(['word', 'prev_word','next_word'], axis=1)
 

In [43]:
'''
Do one hot encoding of required columns
'''
X_encoded = pd.get_dummies(X, columns = ['suffix','next_word_suffix','prev_word_suffix'])

In [44]:
X_encoded.head()

Unnamed: 0,total_words,is_first,is_last,len_word,is_capitalized,is_all_caps,is_all_lower,prev_word_len,next_word_len,sim_prev,...,next_word_suffix_Adj,next_word_suffix_Adverb,next_word_suffix_None,next_word_suffix_Noun,next_word_suffix_Verb,prev_word_suffix_Adj,prev_word_suffix_Adverb,prev_word_suffix_None,prev_word_suffix_Noun,prev_word_suffix_Verb
0,25,1,0,3,1,0,0,0,6,0.0,...,0,0,1,0,0,0,0,1,0,0
1,25,0,0,6,1,0,0,3,6,0.347621,...,1,0,0,0,0,0,0,1,0,0
2,25,0,0,6,1,0,0,6,5,0.821048,...,0,0,1,0,0,0,0,1,0,0
3,25,0,0,5,1,0,0,6,4,0.950832,...,1,0,0,0,0,1,0,0,0,0
4,25,0,0,4,1,0,0,5,4,0.934176,...,0,0,1,0,0,0,0,1,0,0


In [45]:
'''
Encode POS tags to be predicted
'''

encoder = preprocessing.LabelEncoder()
encoder.fit(Y)
Y_encoded = encoder.transform(Y)


In [46]:
Y_encoded = pd.DataFrame(Y_encoded)
Y_encoded.head()

Unnamed: 0,0
0,5
1,6
2,6
3,1
4,6


In [47]:
''' 
Fill NaN with 0 
'''
X_encoded = X_encoded.apply(pd.to_numeric)
X_encoded = X_encoded.fillna(-2)


In [26]:
'''
Dumping data to pickle file in case the kernel restarts
'''
X_encoded.to_pickle("x.pkl")
Y_encoded.to_pickle("y.pkl")

In [27]:
'''
Reading back the encoded data.
Start from here and load packages if kernel restarts
'''
X_encoded = pd.read_pickle("x.pkl")
Y_encoded = pd.read_pickle("y.pkl")


In [48]:
'''
Seggregation for test sample so that we do not consider rows already used for train
'''
# X_encoded =X_encoded.tail(900000)
# Y_encoded= Y_encoded.tail(900000)

In [49]:
'''
Scaling X_encoded values
'''
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_encoded)

In [29]:
'''
Seggregating into train and test sets (Apply 5 fold later)
'''
X_train, X_test, y_train, y_test = train_test_split(X_train_scaled, Y_encoded, train_size=0.8, random_state = 0)

In [30]:
'''
Getting 1000 rows for each tag and storing them in a dataframe
Commented because we have not used this for final results
'''
'''
df_train = pd.concat([pd.DataFrame(X_train), pd.DataFrame(y_train)],axis=1)
df_equal_tag = pd.DataFrame() 
df =  df.sample(frac = 1)  #Shuffle dataframe
df_tag_divided = [pd.DataFrame(y).head(10000) for x, y in df_train.groupby(y_train[0], as_index=False)]
for i in range(len(df_tag_divided)):
    df_equal_tag = df_equal_tag.append(df_tag_divided[i])
X_train = X_train.head(20000)
y_train = y_train.head(20000)
X_test = X_test.head(1000)
y_test = y_test.head(1000)
df_equal_tag=df_equal_tag.dropna()
X_train = df_equal_tag.iloc[:, :-1]
y_train = df_equal_tag.iloc[:,-1]
X_train = scaler.fit_transform(X_train)
y_train = pd.DataFrame(y_train)
X_train
'''

In [31]:
'''
Training the SVM classifier 
'''

# params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
#                      'C': [1, 10, 100, 1000]},
#                     {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
            
params_grid = [{'kernel': ['rbf'], 'gamma': [1e-3], 'C': [1000]}]

svm_model = GridSearchCV(SVC(), params_grid, cv=5)
# from sklearn.linear_model import SGDClassifier
# model = SGDClassifier(loss="hinge", penalty="l2",max_iter=1000)
# svm_model = BaggingClassifier(SVC(C=1000,
#         cache_size=200,
#         class_weight=None,
#         coef0=0.0,
#         degree=3,
#         gamma=1e-3,
#         kernel='rbf',
#         max_iter=-1,
#         probability=False,
#         random_state=None,
#         shrinking=True,
#         tol=0.001,
#         verbose=False,
#         ))

svm_model.fit(X_train,y_train)
#model.fit(X_train,y_train)

  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)
  return f(**kwargs)


GridSearchCV(cv=5, estimator=SVC(),
             param_grid=[{'C': [1000], 'gamma': [0.001], 'kernel': ['rbf']}])

In [50]:
'''
View the accuracy score
''' 
print('Best score for training data:', svm_model.best_score_,"\n") 

'''
View the best parameters for the model found using grid search
'''

print('Best C:',svm_model.best_estimator_.C,"\n") 
print('Best Kernel:',svm_model.best_estimator_.kernel,"\n")
print('Best Gamma:',svm_model.best_estimator_.gamma,"\n")

#final_model = svm_model.best_estimator_.fit(X_train,y_train)

#Y_pred = final_model.predict(X_test)

# print("MODEL SCORE: ",model.score(X_test,y_test))
#Y_pred = svm_model.predict(X_test)
Y_pred = svm_model.predict(X_train_scaled)
# '''
# Decoding the predicted labels
# '''

Y_pred_label = list(encoder.inverse_transform(Y_pred))

Best score for training data: 0.9422437499999999 

Best C: 1000 

Best Kernel: rbf 

Best Gamma: 0.001 



In [51]:
'''
Recalculating actual labels via decoding
'''

#Y_actual_label = list(encoder.inverse_transform(y_test))
Y_actual_label = list(encoder.inverse_transform(Y_encoded))

  return f(**kwargs)


In [52]:
'''
Generating confusion matrix and printing accuracies
'''
metrics = SVM_metrics(Y_actual_label,Y_pred_label)
metrics.calc_tag_metrics()
metrics.get_tag_metrics()
metrics.generate_confusion_matrix()
metrics.accuracy()*100

TAG        PRECISION  RECALL     F1_SCORE  
NOUN       0.9        0.94       0.92       
.          1.0        1.0        1.0        
CONJ       0.99       0.99       0.99       
PRON       0.99       0.96       0.97       
VERB       0.96       0.94       0.95       
DET        0.99       0.99       0.99       
ADV        0.88       0.8        0.84       
NUM        0.92       0.91       0.91       
ADP        0.97       0.97       0.97       
ADJ        0.76       0.77       0.77       
PRT        0.92       0.87       0.89       
X          0.43       0.09       0.14       
     |                                  C             N             P             V        |
     |             A      A      A      O      D      O      N      R      P      E        |
     |             D      D      D      N      E      U      U      O      R      R        |
     |      .      J      P      V      J      T      N      M      N      T      B      X |
-----+--------------------------------------

93.708