# IR Mini Project 2
Ali Ghanbari - 40110524

---

# Dataset

1. Download dataset if necessery:

In [1]:
!wget https: // cogcomp.seas.upenn.edu/Data/QA/QC/train_5500.label -O train.txt -nc
!wget https: // cogcomp.seas.upenn.edu/Data/QA/QC/TREC_10.label -O test.txt -nc


File 'train.txt' already there; not retrieving.
File 'test.txt' already there; not retrieving.


2. Define text preprocessor:

In [2]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Aligator\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Aligator\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Aligator\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import WordNetLemmatizer

english_stopwords = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemma = WordNetLemmatizer()

english_stopwords.remove('who')
english_stopwords.remove('where')
english_stopwords.remove('what')
english_stopwords.remove('how')


def preprocess_text(txt: str) -> str:
    tokens = word_tokenize(txt)
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w.lower() for w in tokens]
    tokens = [w for w in tokens if w not in english_stopwords]
    tokens = [stemmer.stem(w) for w in tokens]
    tokens = [lemma.lemmatize(w, pos = "v") for w in tokens]
    tokens = [lemma.lemmatize(w, pos = "n") for w in tokens]
    return ' '.join(tokens)

3. Define data parser:

In [4]:
# data(parent class, child class, query, vector)
from typing import Iterable
class QueryRow:
    def __init__(self, parent_class : str, child_class : str, query: str, vector) -> None:
        self.parent_class = parent_class
        self.child_class = child_class
        self.query = query
        self.vector = vector
    
    def __repr__(self) -> str:
        return f'{self.parent_class}:{self.child_class} {self.query} - {self.vector}'
        

def parse_line(line) -> QueryRow:
    spline = line.split()
    labels = spline[0]
    text = spline[1:-1]
    splbl = labels.split(':')
    parent_class = splbl[0]
    child_class = splbl[1]
    query = preprocess_text(' '.join(text))
    return QueryRow(parent_class, child_class, query, [])


4. Load and preprocess data:

In [5]:
train_data = []
test_data = []
with open('train.txt') as f:
    train_data = [parse_line(line) for line in f.readlines() if line]
with open('test.txt') as f:
    test_data = [parse_line(line) for line in f.readlines() if line]

# print empty queries
empty_queries = list(filter(lambda q: not q.query,train_data))
empty_queries

[]

5. Vectorieze queries:

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

tfidf_vec = TfidfVectorizer(sublinear_tf=False, use_idf=True, norm='l2')
train_queries = tfidf_vec.fit([q.query for q in train_data])

all_data = train_data + test_data
for q in all_data:
    q.vector = tfidf_vec.transform([q.query])[0]

np.shape(all_data)

(5952,)

---
# Single Level Classification

### Train-Test Split

In [7]:
from sklearn.model_selection import train_test_split
import numpy as np

def make_2d(arr):
    nsamples, nx, ny = arr.shape
    return arr.reshape((nsamples,nx*ny))

train_x = make_2d(np.array([q.vector.toarray() for q in train_data]))
train_y = [q.parent_class for q in train_data]
test_x = make_2d(np.array([q.vector.toarray() for q in test_data]))
test_y = [q.parent_class for q in test_data]

test_x.shape

(500, 6334)

### Benchmark

In [8]:
import time
from sklearn.metrics import precision_score, recall_score

def class_accuracy_score(true, pred, average=None):
    class_preds_tp = {}
    class_preds_fn = {}
    for i in range(len(true)):
        c = true[i]
        p = pred[i]
        if c == p:
            class_preds_tp[c] = class_preds_tp.get(c, 0) + 1
        else:
            class_preds_fn[c] = class_preds_fn.get(c, 0) + 1
    all_classes = set(class_preds_fn.keys()).union(class_preds_tp.keys())
    class_accuracy = {}
    for cls in all_classes:
        tp = class_preds_tp.get(cls, 0)
        fn = class_preds_fn.get(cls, 0)
        class_accuracy[cls] = tp / (tp + fn)
    if average == "macro":
        return np.average(list(class_accuracy.values()))
    elif average == "micro":
        return (np.sum(list(class_preds_tp.values()))) / (np.sum(list(class_preds_tp.values())) + np.sum(list(class_preds_fn.values())))
    return class_accuracy


benchmark_results = {}
# benchmark(name, accuracy macro, accuracy micro, precision macro, precision micro, recall macro, recall micro, time train, time test)
def benchmark_single_class(classifier, name: str):
    train_start = time.process_time()
    classifier.fit(train_x, train_y)
    train_end = time.process_time()
    train_time = train_end - train_start
    test_start = time.process_time()
    test_pred = classifier.predict(test_x)
    test_end = time.process_time()
    test_time = test_end - test_start
    acc_macro = class_accuracy_score(test_y, test_pred, average="macro")
    acc_micro = class_accuracy_score(test_y, test_pred, average="micro")
    pre_macro = precision_score(test_y, test_pred, average='macro', zero_division=0)
    pre_micro = precision_score(test_y, test_pred, average='micro', zero_division=0)
    rec_macro = recall_score(test_y, test_pred, average='macro', zero_division=0)
    rec_micro = recall_score(test_y, test_pred, average='micro', zero_division=0)
    benchmark_results[name] = (name, acc_macro, acc_micro, pre_macro, pre_micro, rec_macro, rec_micro, train_time, test_time)


1. Naïve Bayes(Bernoulli)

In [9]:
from sklearn.naive_bayes import BernoulliNB

bnb = BernoulliNB()
benchmark_single_class(bnb, 'Naïve Bayes(Bernoulli)')

2. Naïve Bayes(Multinomial):

In [10]:
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
benchmark_single_class(mnb, 'Naïve Bayes(Multinomial)')

3. KNN(k=3):

In [11]:
from sklearn.neighbors import KNeighborsClassifier

knn3 = KNeighborsClassifier(3)
benchmark_single_class(knn3, 'KNN(k=3)')

4. KNN(k=4):

In [12]:
from sklearn.neighbors import KNeighborsClassifier

knn4 = KNeighborsClassifier(4)
benchmark_single_class(knn4, 'KNN(k=4)')

5. KNN(k=5):

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn5 = KNeighborsClassifier(5)
benchmark_single_class(knn5, 'KNN(k=5)')

6. SVM(Gaussian kernel):

In [14]:
from sklearn.svm import SVC

svmg = SVC(kernel='rbf')
# benchmark_single_class(svmg, 'SVM(Gaussian kernel)')

7. SVM(Linear kernel):

In [15]:
from sklearn.svm import SVC

svml = SVC(kernel='linear')
# benchmark_single_class(svml, 'SVM(Linear kernel)[libsvm]')

In [26]:
from sklearn.svm import LinearSVC

svml2 = LinearSVC(max_iter=5000)
benchmark_single_class(svml2, 'SVM(Linear kernel)[liblinear]')

### Evaluation Table

In [27]:
import pandas as pd

sc_df = pd.DataFrame(data=list(benchmark_results.values()),
                     columns=['name', 'accuracy macro', 'accuracy micro', 'precision macro', 'precision micro', 'recall macro', 'recall micro', 'time train', 'time test'])

sc_df

Unnamed: 0,name,accuracy macro,accuracy micro,precision macro,precision micro,recall macro,recall micro,time train,time test
0,Naïve Bayes(Bernoulli),0.553538,0.652,0.598048,0.652,0.553538,0.652,0.28125,0.0
1,Naïve Bayes(Multinomial),0.562244,0.608,0.549397,0.608,0.562244,0.608,0.09375,0.0
2,KNN(k=3),0.636994,0.622,0.634042,0.622,0.636994,0.622,0.0,5.34375
3,KNN(k=4),0.662911,0.65,0.669021,0.65,0.662911,0.65,0.125,4.84375
4,KNN(k=5),0.694293,0.688,0.717931,0.688,0.694293,0.688,0.015625,4.40625
5,SVM(Linear kernel)[liblinear],0.818435,0.818,0.846587,0.818,0.818435,0.818,0.09375,0.0


---

# 2 Level Classification

In [25]:
from sklearn.svm import LinearSVC

lsvm = LinearSVC(max_iter=5000)
lsvm.fit(train_x, train_y)

. Group data by parent class

In [19]:
sub_data = {
    'ABBR' : [],
    'DESC' : [],
    'ENTY' : [],
    'HUM' : [],
    'LOC' : [],
    'NUM' : [],
}

for row in train_data:
    sub_data[row.parent_class].append(row)

In [20]:
from sklearn.svm import LinearSVC

sub_classifiers = {
    'ABBR' : LinearSVC(max_iter=5000),
    'DESC' : LinearSVC(max_iter=5000),
    'ENTY' : LinearSVC(max_iter=5000),
    'HUM' : LinearSVC(max_iter=5000),
    'LOC' : LinearSVC(max_iter=5000),
    'NUM' : LinearSVC(max_iter=5000),
}

for parent_class, classifier in sub_classifiers.items():
    rows = sub_data[parent_class]
    data = make_2d(np.array([q.vector.toarray() for q in rows]))
    labels = [q.child_class for q in rows]
    classifier.fit(data, labels)

. Test:

In [21]:
from sklearn.metrics import classification_report
import pandas as pd

predicted_parent_classes = lsvm.predict(test_x)

parent_results = classification_report(test_y, predicted_parent_classes, zero_division=0, output_dict=True)
p_acc = class_accuracy_score(test_y, predicted_parent_classes)
for cls, acc in p_acc.items():
    parent_results[cls]['accuracy'] = acc

parants_df = pd.DataFrame.from_dict(parent_results)
parants_df

Unnamed: 0,ABBR,DESC,ENTY,HUM,LOC,NUM,accuracy,macro avg,weighted avg
precision,1.0,0.804054,0.707865,0.756098,0.865854,0.945652,0.818,0.846587,0.825276
recall,0.777778,0.862319,0.670213,0.953846,0.876543,0.769912,0.818,0.818435,0.818
f1-score,0.875,0.832168,0.688525,0.843537,0.871166,0.84878,0.818,0.826529,0.817484
support,9.0,138.0,94.0,65.0,81.0,113.0,0.818,500.0,500.0
accuracy,0.777778,0.862319,0.670213,0.953846,0.876543,0.769912,0.818,,


In [22]:
sub_test_x = {
    'ABBR' : [],
    'DESC' : [],
    'ENTY' : [],
    'HUM' : [],
    'LOC' : [],
    'NUM' : [],
}

sub_test_y = {
    'ABBR' : [],
    'DESC' : [],
    'ENTY' : [],
    'HUM' : [],
    'LOC' : [],
    'NUM' : [],
}


for i in range(len(test_x)):
    vector = test_x[i]
    sub_class = test_data[i].child_class
    predicted_parent_class = predicted_parent_classes[i]
    sub_test_x[predicted_parent_class].append(vector)
    sub_test_y[predicted_parent_class].append(sub_class)

In [29]:
from sklearn.metrics import precision_recall_fscore_support, precision_score, classification_report
from sklearn.utils.multiclass import unique_labels
import pandas as pd

sub_pred_test_y = {}
sub_rows = []

for parent_class, queries in sub_test_x.items():
    classifier = sub_classifiers[parent_class]
    sub_pred_test_y[parent_class] = classifier.predict(queries)

for parent_class, pred_y in sub_pred_test_y.items():
    accuracy = class_accuracy_score(sub_test_y[parent_class], pred_y)
    labels = unique_labels(sub_test_y[parent_class], pred_y)
    precision, recall, fscore, support = precision_recall_fscore_support(sub_test_y[parent_class], pred_y, zero_division=0,)
    rows = list(zip([parent_class for p in range(len(labels))], labels, [accuracy.get(lbl, -1) for lbl in labels], precision, recall, support))
    sub_rows = sub_rows + rows

sub_df = pd.DataFrame(data=sub_rows, columns=['Parent Class', 'Child Class', 'Accuracy', 'Precision', 'Recall', 'Support'])
sub_df.to_csv('sub_classes.csv')
sub_df

Unnamed: 0,Parent Class,Child Class,Accuracy,Precision,Recall,Support
0,ABBR,abb,0.000000,0.000000,0.000000,1
1,ABBR,exp,1.000000,0.857143,1.000000,6
2,DESC,animal,0.000000,0.000000,0.000000,2
3,DESC,date,0.000000,0.000000,0.000000,3
4,DESC,def,0.990909,0.865079,0.990909,110
...,...,...,...,...,...,...
73,NUM,reason,0.000000,0.000000,0.000000,1
74,NUM,speed,0.800000,1.000000,0.800000,5
75,NUM,state,0.000000,0.000000,0.000000,1
76,NUM,temp,1.000000,1.000000,1.000000,4


---

# Clustering using K-means

In [None]:
location_train_rows = sub_data['LOC']
location_train_x = make_2d(np.array([q.vector.toarray() for q in location_train_rows]))
location_train_y = [q.child_class for q in location_train_rows]

location_test_x = sub_test_x['LOC']
location_test_y = sub_test_y['LOC']

In [None]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
import numpy as np

def cluster_precision(labels_x, cluster_x, cluster_y):
    """returns lebels_y"""
    classifier = LinearSVC(max_iter=3000)
    transformer = {}
    i = 0
    for label in set(labels_x):
        transformer[label] = i
        i = i + 1
    reverse_transform = {transformer[key]: key for key in transformer.keys()}
    labels_x_vec = list(map(lambda l: transformer[l], labels_x))
    classifier.fit(np.array(cluster_x).reshape(-1, 1), labels_x_vec)
    labels_y_vec = classifier.predict(np.array(cluster_y).reshape(-1, 1))
    labels_y = [reverse_transform[v] for v in labels_y_vec]
    return labels_y

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, recall_score, precision_score, rand_score
import pandas as pd 

kmeans_results = {}
for n_clusters in range(3, 8):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0, n_init=100, max_iter=3000)
    cluster_x = kmeans.fit_predict(location_train_x)
    location_pred_y = kmeans.predict(location_test_x)
    labels_y = cluster_precision(location_train_y, cluster_x, location_pred_y)
    kmeans_results[f'k={n_clusters}'] = {
        'recall' : recall_score(location_test_y, labels_y, average="macro", zero_division=0),
        'precision' : precision_score(location_test_y, labels_y, average="macro", zero_division=0),
        'rand_index' : rand_score(location_test_y, location_pred_y),
        'silhouette' : silhouette_score(location_test_x, location_pred_y),
    }
kmeans_df = pd.DataFrame(kmeans_results)
kmeans_df

Unnamed: 0,k=3,k=4,k=5,k=6,k=7
recall,0.222222,0.111111,0.219697,0.222222,0.219697
precision,0.172996,0.059621,0.093532,0.172996,0.096264
rand_index,0.533875,0.618187,0.695875,0.684734,0.730503
silhouette,0.051324,0.062125,0.0747,0.076049,0.112966
