# IR Mini Project 2
Ali Ghanbari - 970216657

---

# Dataset

1. Download dataset if necessery:

In [312]:
!wget https: // cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label -O train.txt -nc
!wget https: // cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label -O test.txt -nc


File 'train.txt' already there; not retrieving.
File 'test.txt' already there; not retrieving.


2. Define text preprocessor:

In [313]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aligator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aligator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aligator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [314]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

english_stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

english_stopwords.remove('who')
english_stopwords.remove('where')
english_stopwords.remove('what')
english_stopwords.remove('how')


def preprocess_text(txt: str) -> str:
    tokens = word_tokenize(txt)
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w.lower() for w in tokens]
    tokens = [w for w in tokens if w not in english_stopwords]
    tokens = [stemmer.stem(w) for w in tokens]
    tokens = [lemma.lemmatize(w, pos = "v") for w in tokens]
    tokens = [lemma.lemmatize(w, pos = "n") for w in tokens]
    return ' '.join(tokens)

3. Define data parser:

In [315]:
# data(parent class, child class, query, vector)
from typing import Iterable
class QueryRow:
    def __init__(self, parent_class : str, child_class : str, query: str, vector) -> None:
        self.parent_class = parent_class
        self.child_class = child_class
        self.query = query
        self.vector = vector
    
    def __repr__(self) -> str:
        return f'{self.parent_class}:{self.child_class} {self.query} - {self.vector}'
        

def parse_line(line) -> QueryRow:
    spline = line.split()
    labels = spline[0]
    text = spline[1:-1]
    splbl = labels.split(':')
    parent_class = splbl[0]
    child_class = splbl[1]
    query = preprocess_text(' '.join(text))
    return QueryRow(parent_class, child_class, query, [])


4. Load and preprocess data:

In [316]:
train_data = []
test_data = []
with open('train.txt') as f:
    train_data = [parse_line(line) for line in f.readlines() if line]
with open('test.txt') as f:
    test_data = [parse_line(line) for line in f.readlines() if line]

# print empty queries
empty_queries = list(filter(lambda q: not q.query,train_data))
empty_queries

[]

5. Vectorieze queries:

In [317]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

tfidf_vec = TfidfVectorizer(sublinear_tf=False, use_idf=True, norm='l2')
train_queries = tfidf_vec.fit([q.query for q in train_data])

all_data = train_data + test_data
for q in all_data:
    q.vector = tfidf_vec.transform([q.query])[0]

np.shape(all_data)

(5952,)

---
# Single Level Classification

### Train-Test Split

In [318]:
from sklearn.model_selection import train_test_split
import numpy as np

def make_2d(arr):
    nsamples, nx, ny = arr.shape
    return arr.reshape((nsamples,nx*ny))

train_x = make_2d(np.array([q.vector.toarray() for q in train_data]))
train_y = [q.parent_class for q in train_data]
test_x = make_2d(np.array([q.vector.toarray() for q in test_data]))
test_y = [q.parent_class for q in test_data]

test_x.shape

(500, 6334)

### Benchmark

In [376]:
import time
from sklearn.metrics import precision_score, recall_score

def class_accuracy_score(true, pred, average=None):
    class_preds_tp = {}
    class_preds_fn = {}
    for i in range(len(true)):
        c = true[i]
        p = pred[i]
        if c == p:
            class_preds_tp[c] = class_preds_tp.get(c, 0) + 1
        else:
            class_preds_fn[c] = class_preds_fn.get(c, 0) + 1
    all_classes = set(class_preds_fn.keys()).union(class_preds_tp.keys())
    class_accuracy = {}
    for cls in all_classes:
        tp = class_preds_tp.get(cls, 0)
        fn = class_preds_fn.get(cls, 0)
        class_accuracy[cls] = tp / (tp + fn)
    if average == "micro":
        return np.average(list(class_accuracy.values()))
    elif average == "macro":
        return (np.sum(list(class_preds_tp.values()))) / (np.sum(list(class_preds_tp.values())) + np.sum(list(class_preds_fn.values())))
    return class_accuracy


benchmark_results = {}
# benchmark(name, accuracy macro, accuracy micro, precision macro, precision micro, recall macro, recall micro, time train, time test)
def benchmark_single_class(classifier, name: str):
    train_start = time.process_time()
    classifier.fit(train_x, train_y)
    train_end = time.process_time()
    train_time = train_end - train_start
    test_start = time.process_time()
    test_pred = classifier.predict(test_x)
    test_end = time.process_time()
    test_time = test_end - test_start
    acc_macro = class_accuracy_score(test_y, test_pred, average="macro")
    acc_micro = class_accuracy_score(test_y, test_pred, average="micro")
    pre_macro = precision_score(test_y, test_pred, average='macro', zero_division=1)
    pre_micro = precision_score(test_y, test_pred, average='micro', zero_division=1)
    rec_macro = recall_score(test_y, test_pred, average='macro', zero_division=1)
    rec_micro = recall_score(test_y, test_pred, average='micro', zero_division=1)
    benchmark_results[name] = (name, acc_macro, acc_micro, pre_macro, pre_micro, rec_macro, rec_micro, train_time, test_time)


1. Naïve Bayes(Bernoulli)

In [377]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
benchmark_single_class(bnb, 'Naïve Bayes(Bernoulli)')

2. Naïve Bayes(Multinomial):

In [378]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
benchmark_single_class(mnb, 'Naïve Bayes(Multinomial)')

3. KNN(k=3):

In [379]:
from sklearn.neighbors import KNeighborsClassifier

knn3 = KNeighborsClassifier(3)
benchmark_single_class(knn3, 'KNN(k=3)')

4. KNN(k=4):

In [380]:
from sklearn.neighbors import KNeighborsClassifier

knn4 = KNeighborsClassifier(4)
benchmark_single_class(knn4, 'KNN(k=4)')

5. KNN(k=5):

In [381]:
from sklearn.neighbors import KNeighborsClassifier

knn5 = KNeighborsClassifier(5)
benchmark_single_class(knn5, 'KNN(k=5)')

6. SVM(Gaussian kernel):

In [382]:
from sklearn.svm import SVC

svmg = SVC(kernel='rbf')
# benchmark_single_class(svmg, 'SVM(Gaussian kernel)')

7. SVM(Linear kernel):

In [383]:
from sklearn.svm import SVC

svml = SVC(kernel='linear')
# benchmark_single_class(svml, 'SVM(Linear kernel)[libsvm]')

In [384]:
from sklearn.svm import LinearSVC

svml2 = LinearSVC()
benchmark_single_class(svml2, 'SVM(Linear kernel)[liblinear]')

### Evaluation Table

In [386]:
import pandas as pd

sc_df = pd.DataFrame(data=list(benchmark_results.values()),
                     columns=['name', 'accuracy macro', 'accuracy micro', 'precision macro', 'precision micro', 'recall macro', 'recall micro', 'time train', 'time test'])

sc_df

Unnamed: 0,name,accuracy macro,accuracy micro,precision macro,precision micro,recall macro,recall micro,time train,time test
0,Naïve Bayes(Bernoulli),0.652,0.553538,0.764714,0.652,0.553538,0.652,0.34375,0.046875
1,Naïve Bayes(Multinomial),0.608,0.562244,0.716064,0.608,0.562244,0.608,0.15625,0.0
2,KNN(k=3),0.622,0.636994,0.634042,0.622,0.636994,0.622,0.015625,4.921875
3,KNN(k=4),0.65,0.662911,0.669021,0.65,0.662911,0.65,0.0,5.25
4,KNN(k=5),0.688,0.694293,0.717931,0.688,0.694293,0.688,0.234375,4.921875
5,SVM(Linear kernel)[liblinear],0.818,0.818435,0.846587,0.818,0.818435,0.818,0.109375,0.015625


---

# 2 Level Classification

In [329]:
from sklearn.svm import LinearSVC

lsvm = LinearSVC()
lsvm.fit(train_x, train_y)

. Group data by parent class

In [330]:
sub_data = {
    'ABBR' : [],
    'DESC' : [],
    'ENTY' : [],
    'HUM' : [],
    'LOC' : [],
    'NUM' : [],
}

for row in train_data:
    sub_data[row.parent_class].append(row)

In [331]:
from sklearn.svm import LinearSVC

sub_classifiers = {
    'ABBR' : LinearSVC(),
    'DESC' : LinearSVC(),
    'ENTY' : LinearSVC(),
    'HUM' : LinearSVC(),
    'LOC' : LinearSVC(),
    'NUM' : LinearSVC(),
}

for parent_class, classifier in sub_classifiers.items():
    rows = sub_data[parent_class]
    data = make_2d(np.array([q.vector.toarray() for q in rows]))
    labels = [q.child_class for q in rows]
    classifier.fit(data, labels)

. Test:

In [390]:
predicted_parent_classes = lsvm.predict(test_x)

from sklearn.metrics import classification_report
print(classification_report(test_y, predicted_parent_classes))
print(f'accuracy: ' + str(class_accuracy_score(test_y, predicted_parent_classes)))

              precision    recall  f1-score   support

        ABBR       1.00      0.78      0.88         9
        DESC       0.80      0.86      0.83       138
        ENTY       0.71      0.67      0.69        94
         HUM       0.76      0.95      0.84        65
         LOC       0.87      0.88      0.87        81
         NUM       0.95      0.77      0.85       113

    accuracy                           0.82       500
   macro avg       0.85      0.82      0.83       500
weighted avg       0.83      0.82      0.82       500

accuracy: {'NUM': 0.7699115044247787, 'ABBR': 0.7777777777777778, 'HUM': 0.9538461538461539, 'LOC': 0.8765432098765432, 'ENTY': 0.6702127659574468, 'DESC': 0.8623188405797102}


In [333]:
sub_test_x = {
    'ABBR' : [],
    'DESC' : [],
    'ENTY' : [],
    'HUM' : [],
    'LOC' : [],
    'NUM' : [],
}

sub_test_y = {
    'ABBR' : [],
    'DESC' : [],
    'ENTY' : [],
    'HUM' : [],
    'LOC' : [],
    'NUM' : [],
}


for i in range(len(test_x)):
    vector = test_x[i]
    sub_class = test_data[i].child_class
    predicted_parent_class = predicted_parent_classes[i]
    sub_test_x[predicted_parent_class].append(vector)
    sub_test_y[predicted_parent_class].append(sub_class)

In [389]:
sub_pred_test_y = {}

for parent_class, queries in sub_test_x.items():
    classifier = sub_classifiers[parent_class]
    sub_pred_test_y[parent_class] = classifier.predict(queries)

for parent_class, pred_y in sub_pred_test_y.items():
    print(classification_report(sub_test_y[parent_class], pred_y, zero_division=1))
    print(f'accuracy: ' + str(class_accuracy_score(sub_test_y[parent_class], pred_y)))

              precision    recall  f1-score   support

         abb       1.00      0.00      0.00         1
         exp       0.86      1.00      0.92         6

    accuracy                           0.86         7
   macro avg       0.93      0.50      0.46         7
weighted avg       0.88      0.86      0.79         7

accuracy: {'abb': 0.0, 'exp': 1.0}
              precision    recall  f1-score   support

      animal       1.00      0.00      0.00         2
        date       1.00      0.00      0.00         3
         def       0.87      0.99      0.92       110
        desc       0.29      0.40      0.33         5
        dist       1.00      0.00      0.00         1
         exp       1.00      0.00      0.00         2
        food       1.00      0.00      0.00         1
         ind       1.00      0.00      0.00         1
      manner       0.40      1.00      0.57         2
       other       1.00      0.00      0.00         6
     product       1.00      0.00      0.00

---

# Clustering using K-means

In [397]:
location_train_rows = sub_data['LOC']
location_train_x = make_2d(np.array([q.vector.toarray() for q in location_train_rows]))
location_train_y = [q.child_class for q in location_train_rows]

location_test_x = sub_test_x['LOC']
location_test_y = sub_test_y['LOC']

In [396]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, recall_score, precision_score, rand_score

for n_clusters in range(3, 8):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=100)
    kmeans.fit(location_train_x, location_train_y)
    location_pred_y = kmeans.predict(location_test_x)
    recall = 0.0#recall_score(location_test_y, location_pred_y)
    precision = 0.0#precision_score(location_test_y, location_pred_y)
    rand_index = rand_score(location_test_y, location_pred_y)
    silhouette = silhouette_score(location_test_x, location_pred_y)
    print(f'k={n_clusters}: recall={recall}, precision={precision}, rand index={rand_index}, silhouette={silhouette}')

k=3: recall=0.0, precision=0.0, rand index=0.5338753387533876, silhouette=0.05132416569241707
k=4: recall=0.0, precision=0.0, rand index=0.618187292984041, silhouette=0.062125107274616474
k=5: recall=0.0, precision=0.0, rand index=0.695874736525143, silhouette=0.07470047495519952
k=6: recall=0.0, precision=0.0, rand index=0.6847335140018067, silhouette=0.0760485363012754
k=7: recall=0.0, precision=0.0, rand index=0.7305028605841614, silhouette=0.1129662942919057
