In [54]:
import numpy as np
import pandas as pd
import string
import pickle

import sklearn
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import sent_tokenize

## Data Loading and Splits

In [55]:
cols = ['question', 'review0', 'review1', 'review2', 'review3', 'review4', 'Is_Answerable']
df = pd.read_csv('classification_data/train.csv')[cols]

def filter_data(df):
    df = df[df.Is_Answerable.notnull()]
    df = df[df.review0.notnull()]
    return df

print(df.Is_Answerable.value_counts())
print('Total', len(df))

N    187
S     69
Y     63
A     10
Name: Is_Answerable, dtype: int64
Total 329


In [56]:
df = pd.read_csv('classification_data/final_annotated_data.csv')

def print_stats(df, name='df'):
    print(name.upper())
    print('Length = %d' % len(df))
    print('IsAnswerableOrig Counts')
    print(df.is_answerable_orig.value_counts())
    print('IsAnswerableNew Counts')
    print(df.is_answerable_new.value_counts())
    print()

In [57]:
train_df = df[~df.expert]
test_df = df[df.expert]
test_df = test_df[test_df.is_answerable_new.notnull()]

print_stats(train_df, 'train')
print_stats(test_df, 'test')

TRAIN
Length = 3348
IsAnswerableOrig Counts
3    1436
2     958
1     954
Name: is_answerable_orig, dtype: int64
IsAnswerableNew Counts
Series([], Name: is_answerable_new, dtype: int64)

TEST
Length = 281
IsAnswerableOrig Counts
3    73
2    56
N    33
Y    24
1    17
S    12
Name: is_answerable_orig, dtype: int64
IsAnswerableNew Counts
3.0    134
2.0     77
1.0     70
Name: is_answerable_new, dtype: int64



In [58]:
# Column Renames

train_df = train_df.rename(columns={'is_answerable_orig': 'Is_Answerable'})[cols]
test_df = test_df.rename(columns={'is_answerable_new': 'Is_Answerable', 'is_answerable_orig': 'Is_Answerable_Human'})[cols + ['Is_Answerable_Human']]
test_df_human = test_df[test_df.Is_Answerable_Human.notnull()]

train_df = filter_data(train_df)
test_df = filter_data(test_df)
print(len(train_df))
print(len(test_df))

3303
127


In [59]:
train_df, validate_df = np.split(train_df.sample(frac=1), [int(.8*len(train_df))])

In [60]:
len(train_df), len(validate_df), len(test_df)

(2642, 661, 127)

In [61]:
def labels(df):
    df['label'] = df.Is_Answerable.apply(lambda x: label_map[x])
    return df['label']

def get_reviews(row):
    all_reviews = ''
    for key in ['review0', 'review1', 'review2', 'review3', 'review4']:
        if not isinstance(row[key], float):
            all_reviews += row[key].strip(' ').strip('-')
            all_reviews += ' '
    return all_reviews.strip()

def add_reviews(df):
    df['reviews'] = df.apply(lambda x: get_reviews(x), axis = 1)
    return df

label_map = {'N': 0, 'S': 1, 'Y': 1, 'A': 1}
for i, j in zip([1, 2, 3], [1, 1, 0]):
    label_map[i] = j
    label_map[str(i)] = j

train_df, test_df, validate_df = list(map(add_reviews, [train_df, test_df, validate_df]))
train_labels, test_labels, validate_labels = list(map(labels, [train_df, test_df, validate_df]))

## TF IDF Vectorizer

In [62]:
with open('classification_data/q_classification_vectorizer.pkl', 'rb') as fp:
    vectorizer = pickle.load(fp)

In [63]:
len(vectorizer.vocabulary_)

150976

In [64]:
# df = pd.read_csv('samples/train-qar_sample_100000.csv')
# df = df[df.question.notnull()]
# df = df[df.review0.notnull()]
# df = add_reviews(df)
# print(len(df))

# vectorizer = TfidfVectorizer(stop_words='english')
# vectorizer.fit(list(df.question.values) + list(df.reviews.values))

In [65]:
# with open('classification_data/q_classification_vectorizer.pkl', 'wb') as fp:
#     pickle.dump(vectorizer, fp)

## Features

In [66]:
df = train_df

In [67]:
train_df_raw, test_df_raw, validate_df_raw = list(map(lambda x: x.copy(), [train_df, test_df, validate_df]))

In [48]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer(stop_words='english')
# vectorizer.fit(\
#     list(train_df.question.values) + list(train_df.reviews.values) \
#     + list(validate_df.question.values) + list(validate_df.reviews.values))

In [68]:
def tokenize(text):
    punctuations = string.punctuation.replace("\'", '')

    for ch in punctuations:
        text = text.replace(ch, " " + ch + " ")

    tokens = text.split()
    for i, token in enumerate(tokens):
        if not token.isupper():
            tokens[i] = token.lower()
    return tokens

In [70]:
def n_intersection(q, r):
    return len(set(q).intersection(set(r)))

def tf_idf_sim(q, r):
    # dot product of q and r as tfidf vectors
    q_vec = vectorizer.transform([q])
    r_vec = vectorizer.transform([r])
    return q_vec.dot(r_vec.transpose()).toarray()[0][0]

def tf_idf_sim_sentence(q, r):
    # max of dot products of q and each sentence in r as tfidf vectors
    q_vec = vectorizer.transform([q])
    rs = sent_tokenize(r)
    if len(rs) == 0:
        return 0
    return max([q_vec.dot(vectorizer.transform([r]).transpose()).toarray()[0][0] for r in rs])

def add_features(df):
    df['q_tokens'] = df.question.apply(lambda x: tokenize(x))
    df['r_tokens'] = df.reviews.apply(lambda x: tokenize(x))
    df['n_q'] = df.q_tokens.apply(lambda x: len(x))
    df['n_r'] = df.r_tokens.apply(lambda x: len(x))
    df['n_intersection'] = df.apply(lambda x: len(set(x.q_tokens).intersection(set(x.r_tokens))), axis=1)
    df['intr_frac'] = df.n_intersection / df.n_q
    df['tfidf'] = df.apply(lambda x: tf_idf_sim(x.question, x.reviews), axis=1)
    df['tfidf_sent'] = df.apply(lambda x: tf_idf_sim_sentence(x.question, x.reviews), axis=1)
    return df

In [71]:
train_df, test_df, validate_df = list(map(add_features, [train_df_raw, test_df_raw, validate_df_raw]))

## Analysis

In [72]:
df = train_df

q = df.question.iloc[1]
r = df.reviews.iloc[1]
q_tokens = df.q_tokens.iloc[1]
r_tokens = df.r_tokens.iloc[1]

# q, r, set(q_tokens).intersection(set(r_tokens))

In [73]:
df[['label', 'n_q', 'n_r', 'n_intersection', 'intr_frac', 'tfidf', 'tfidf_sent']].groupby('label').mean()

Unnamed: 0_level_0,n_q,n_r,n_intersection,intr_frac,tfidf,tfidf_sent
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15.956211,324.182306,7.151028,0.463667,0.122266,0.18546
1,17.475378,377.079448,9.532502,0.581627,0.22708,0.325191


## Training

In [74]:
X_cols = ['n_q', 'n_r', 'n_intersection', 'intr_frac', 'tfidf']
Y_cols = 'label'

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

model = LogisticRegression(C=100)
# model = DecisionTreeClassifier()
# model = RandomForestClassifier(n_estimators=5)
model.fit(train_df[X_cols].values, train_df[Y_cols].values)



LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [75]:
(model.predict(validate_df[X_cols].values) == validate_df[Y_cols].values).mean()

0.6671709531013615

In [76]:
predictions = model.predict(validate_df[X_cols].values)
print(pd.Series(predictions).value_counts())
print(classification_report(validate_df[Y_cols].values, predictions))

1    447
0    214
dtype: int64
              precision    recall  f1-score   support

           0       0.64      0.49      0.56       282
           1       0.68      0.80      0.73       379

   micro avg       0.67      0.67      0.67       661
   macro avg       0.66      0.64      0.65       661
weighted avg       0.66      0.67      0.66       661



## Testing

In [77]:
from sklearn.metrics import classification_report

predictions = model.predict(test_df[X_cols].values)
print(pd.Series(predictions).value_counts())
print(classification_report(test_df[Y_cols].values, predictions))

1    93
0    34
dtype: int64
              precision    recall  f1-score   support

           0       0.65      0.42      0.51        53
           1       0.67      0.84      0.74        74

   micro avg       0.66      0.66      0.66       127
   macro avg       0.66      0.63      0.62       127
weighted avg       0.66      0.66      0.64       127



## Human Performance

In [78]:
predictions = test_df_human.Is_Answerable_Human.apply(lambda x: label_map[x]).values
labels = test_df_human.Is_Answerable.apply(lambda x: label_map[x]).values
print(classification_report(labels, predictions))

              precision    recall  f1-score   support

           0       0.76      0.77      0.77       105
           1       0.78      0.77      0.78       110

   micro avg       0.77      0.77      0.77       215
   macro avg       0.77      0.77      0.77       215
weighted avg       0.77      0.77      0.77       215



## Saving

In [25]:
# with open('q_classification_model_2.pkl', 'wb') as fp:
#     pickle.dump(model, fp)

In [26]:
# with open('classification_data/q_classification_model.pkl', 'rb') as fp:
#     model = pickle.load(fp)