In [1]:
import numpy as np
import pandas as pd
import string
import pickle

import sklearn
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import sent_tokenize

In [2]:
sklearn.__version__

'0.20.1'

## Data Loading and Splits

In [32]:
filename = '../../data/final_annotated_data.csv'
df = pd.read_csv(filename)

print(df.is_answerable_orig.value_counts())
print('Total', len(df))

N    1754
Y    1094
S    1074
Name: is_answerable_orig, dtype: int64
Total 3922


In [33]:
# Column Renames
cols = ['question', 'review0', 'review1', 'review2', 'review3', 'review4', 'is_answerable', 'is_answerable_human', 'expert']
df = df.rename(columns={'is_answerable_orig': 'is_answerable', 'is_answerable_new': 'is_answerable_human'})[cols]

def filter_data(df):
    df = df[df.is_answerable.notnull()]
    df = df[df.review0.notnull()]
    return df

L = len(df)
df = filter_data(df)
assert L == len(df)


label_map = {'N': 0, 'S': 1, 'Y': 1}

def label_func(df):
    df['label'] = df.is_answerable.apply(lambda x: label_map[x])

label_func(df)

In [34]:
def print_stats(df, name='df'):
    print(name.upper())
    print('Length = %d' % len(df))
    print('IsAnswerable Counts')
    print(df.is_answerable.value_counts() / len(df))
    print('IsAnswerableHuman Counts')
    print(df.is_answerable_human.value_counts()  / len(df))
    print()

In [35]:
test_df = df[df.expert]
train_df_part1 = df[~df.expert]

test_df_part1 = test_df[test_df.is_answerable_human.notnull()]
expert_rest_df = test_df[test_df.is_answerable_human.isnull()]

validate_df, test_df_part2, train_df_part2 = np.split(expert_rest_df.sample(frac=1), [200, 270])

train_df = pd.concat([train_df_part1, train_df_part2]).sample(frac=1)
test_df = pd.concat([test_df_part1, test_df_part2]).sample(frac=1)

print_stats(train_df, 'train')
print_stats(validate_df, 'val')
print_stats(test_df, 'test')

TRAIN
Length = 3517
IsAnswerable Counts
N    0.432187
Y    0.284049
S    0.283765
Name: is_answerable, dtype: float64
IsAnswerableHuman Counts
Series([], Name: is_answerable_human, dtype: float64)

VAL
Length = 200
IsAnswerable Counts
N    0.625
S    0.200
Y    0.175
Name: is_answerable, dtype: float64
IsAnswerableHuman Counts
Series([], Name: is_answerable_human, dtype: float64)

TEST
Length = 205
IsAnswerable Counts
N    0.531707
Y    0.292683
S    0.175610
Name: is_answerable, dtype: float64
IsAnswerableHuman Counts
N    0.278049
Y    0.258537
S    0.121951
Name: is_answerable_human, dtype: float64



In [36]:
def get_reviews(row):
    all_reviews = ''
    for key in ['review0', 'review1', 'review2', 'review3', 'review4']:
        if not isinstance(row[key], float):
            all_reviews += row[key].strip(' ').strip('-')
            all_reviews += ' '
    return all_reviews.strip()


def add_reviews(df):
    df['reviews'] = df.apply(lambda x: get_reviews(x), axis = 1)
    return df


train_df, test_df, validate_df = list(map(add_reviews, [train_df, test_df, validate_df]))

In [37]:
train_df.head()

Unnamed: 0,question,review0,review1,review2,review3,review4,is_answerable,is_answerable_human,expert,label,reviews
970,do you need an amp,The speakers are attractive; they look at leas...,"TRASH! They look nice, but I've received two p...","These speakers are not ""Rockers"", you would ne...",Very nice speakers that look cool and have a g...,This is the main reason I will be returning th...,S,,False,1,The speakers are attractive; they look at leas...
3222,can you hook two together?,"while i know this is meant for floor vents, i ...",i am using this in a different way.i bought a ...,We have a large chest of drawers sitting over ...,I didn't need to get this as it turned out but...,-,N,,False,0,"while i know this is meant for floor vents, i ..."
2487,What kind of receiver are you using with these...,I owned a pair of these speakers which worked ...,I got these as a birthday present and once ins...,...if you have thousands of dollars invested i...,The 151's were already at a moderate listening...,"You could 'feel' the sound everywhere.However,...",N,,False,0,I owned a pair of these speakers which worked ...
1552,can it works in the sony vaio t series,Corsair has a good name for computer accessori...,My laptop has an i3-2310M @ 2.10GHz with (2x2G...,-,-,-,N,,False,0,Corsair has a good name for computer accessori...
358,I want to make a cradle for a water barrel and...,I now have 110 gallons of long term stored eme...,I purchased this barrel for long term water st...,I live in a townhouse in Earthquake and Hurric...,I've recently been through two disasters and l...,-,N,,False,0,I now have 110 gallons of long term stored eme...


## TF IDF Vectorizer

In [9]:
"""
df = pd.read_csv('../../data/train-qar_sample_100000.csv')
df = add_reviews(df)
print(len(df))

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer.fit(list(df.question.values) + list(df.reviews.values))

with open('../../data/tfidf_vectorizer.pkl', 'wb') as fp:
    pickle.dump(tfidf_vectorizer, fp)
"""

"\ndf = pd.read_csv('../../data/train-qar_sample_100000.csv')\ndf = add_reviews(df)\nprint(len(df))\n\ntfidf_vectorizer = TfidfVectorizer(stop_words='english')\ntfidf_vectorizer.fit(list(df.question.values) + list(df.reviews.values))\n\nwith open('../../data/tfidf_vectorizer.pkl', 'wb') as fp:\n    pickle.dump(tfidf_vectorizer, fp)\n"

In [10]:
with open('../../data/tfidf_vectorizer.pkl', 'rb') as fp:
    tfidf_vectorizer = pickle.load(fp)

In [11]:
len(tfidf_vectorizer.vocabulary_)

158752

## w2v Vectorizer

In [12]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = 300

    def fit(self, X):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [13]:
with open('../../data/w2v_vectorizer.pkl', 'rb') as fp:
    w2v_vectorizer = pickle.load(fp)

In [14]:
"""
import numpy as np

with open("../../data/glove.6B.300d.txt", "rb") as lines:
    w2v = {str(line.split()[0].decode('UTF-8')): np.array(list(map(float, line.split()[1:])))
           for line in lines}

w2v_vectorizer = MeanEmbeddingVectorizer(w2v)
with open('../../data/w2v_vectorizer.pkl', 'wb') as fp:
    pickle.dump(w2v_vectorizer, fp)
"""

'\nimport numpy as np\n\nwith open("../../data/glove.6B.300d.txt", "rb") as lines:\n    w2v = {str(line.split()[0].decode(\'UTF-8\')): np.array(list(map(float, line.split()[1:])))\n           for line in lines}\n\nw2v_vectorizer = MeanEmbeddingVectorizer(w2v)\nwith open(\'../../data/w2v_vectorizer.pkl\', \'wb\') as fp:\n    pickle.dump(w2v_vectorizer, fp)\n'

## Features

In [38]:
train_df_raw, test_df_raw, validate_df_raw = list(map(lambda x: x.copy(), [train_df, test_df, validate_df]))

In [39]:
def tokenize(text):
    punctuations = string.punctuation.replace("\'", '')

    for ch in punctuations:
        text = text.replace(ch, " " + ch + " ")

    tokens = text.split()
    for i, token in enumerate(tokens):
        if not token.isupper():
            tokens[i] = token.lower()
    return tokens

In [40]:
"""
x = vectorizer.transform(['It is a total idiot'])
y = vectorizer.transform(['It is not a total idiocy'])

print(type(x), '|\n', x.shape, '|\n', x, '|\n', len(x.toarray()[0]))
print("-"*50)
print(type(y), '|\n', y.shape, '|\n', y, '|\n', len(y.toarray()[0]))
print("-"*50)
print(x.toarray().dot(y.toarray().transpose())[0][0])
"""

'\nx = vectorizer.transform([\'It is a total idiot\'])\ny = vectorizer.transform([\'It is not a total idiocy\'])\n\nprint(type(x), \'|\n\', x.shape, \'|\n\', x, \'|\n\', len(x.toarray()[0]))\nprint("-"*50)\nprint(type(y), \'|\n\', y.shape, \'|\n\', y, \'|\n\', len(y.toarray()[0]))\nprint("-"*50)\nprint(x.toarray().dot(y.toarray().transpose())[0][0])\n'

In [41]:
def n_intersection(q, r):
    return len(set(q).intersection(set(r)))

def w2v_sim(q, r):
    # dot product of q and r as w2v vectors
    q_vec = w2v_vectorizer.transform([q])
    r_vec = w2v_vectorizer.transform([r])
    return q_vec.dot(r_vec.transpose())[0][0]

def tf_idf_sim(q, r):
    # dot product of q and r as tfidf vectors
    q_vec = tfidf_vectorizer.transform([q])
    r_vec = tfidf_vectorizer.transform([r])
    return q_vec.dot(r_vec.transpose()).toarray()[0][0]

def tf_idf_sim_sentence(q, rs):
    # max of dot products of q and each sentence in r as tfidf vectors
    q_vec = tfidf_vectorizer.transform([q])
    if len(rs) == 0:
        return 0
    return max([q_vec.dot(tfidf_vectorizer.transform([r]).transpose()).toarray()[0][0] for r in rs])

def w2v_sim_sentence(q, rs):
    # max of dot products of q and each sentence in r as tfidf vectors
    q_vec = w2v_vectorizer.transform([q])
    if len(rs) == 0:
        return 0
    return max([q_vec.dot(w2v_vectorizer.transform([r]).transpose())[0][0] for r in rs])

def tf_idf_sim_sentence_mean(q, rs):
    # max of dot products of q and each sentence in r as tfidf vectors
    q_vec = tfidf_vectorizer.transform([q])
    if len(rs) == 0:
        return 0
    return np.mean([q_vec.dot(tfidf_vectorizer.transform([r]).transpose()).toarray()[0][0] for r in rs])

def w2v_sim_sentence_mean(q, rs):
    # max of dot products of q and each sentence in r as tfidf vectors
    q_vec = w2v_vectorizer.transform([q])
    if len(rs) == 0:
        return 0
    return np.mean([q_vec.dot(w2v_vectorizer.transform([r]).transpose())[0][0] for r in rs])

def add_features(df):
    df['q_tokens'] = df.question.apply(lambda x: tokenize(x))
    df['r_tokens'] = df.reviews.apply(lambda x: tokenize(x))
    df['r_sents'] = df.reviews.apply(lambda x: sent_tokenize(x))
    df['n_q'] = df.q_tokens.apply(lambda x: len(x))
    df['n_r'] = df.r_tokens.apply(lambda x: len(x))
    df['n_intersection'] = df.apply(lambda x: len(set(x.q_tokens).intersection(set(x.r_tokens))), axis=1)
    df['intr_frac'] = df.n_intersection / df.n_q
    df['tfidf'] = df.apply(lambda x: tf_idf_sim(x.question, x.reviews), axis=1)
    df['w2v'] = df.apply(lambda x: w2v_sim(x.question, x.reviews), axis=1)
    df['w2v_sent'] = df.apply(lambda x: w2v_sim_sentence(x.question, x.r_sents), axis=1)
    df['tfidf_sent'] = df.apply(lambda x: tf_idf_sim_sentence(x.question, x.r_sents), axis=1)
    df['w2v_sent_mean'] = df.apply(lambda x: w2v_sim_sentence_mean(x.question, x.r_sents), axis=1)
    df['tfidf_sent_mean'] = df.apply(lambda x: tf_idf_sim_sentence_mean(x.question, x.r_sents), axis=1)
    return df


In [42]:
train_df, test_df, validate_df = list(map(add_features, [train_df_raw, test_df_raw, validate_df_raw]))

In [43]:
for typ, df in [('train', train_df), ('test', test_df), ('val', validate_df)]:
    pickle.dump(df, open('../../data/' + typ + '_df_features.pkl', 'wb'))

train_df_raw, test_df_raw, validate_df_raw = None, None, None

## Feature Cols

In [46]:
X_cols = ['n_q', 'n_r', 'n_intersection', 'intr_frac']
Y_cols = 'label'

## Analysis

In [47]:
df = train_df

q = df.question.iloc[1]
r = df.reviews.iloc[1]
q_tokens = df.q_tokens.iloc[1]
r_tokens = df.r_tokens.iloc[1]

# q, r, set(q_tokens).intersection(set(r_tokens))

In [48]:
df[[Y_cols] + X_cols].groupby(Y_cols).mean()

Unnamed: 0_level_0,n_q,n_r,n_intersection,intr_frac
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,16.272368,332.815789,7.380263,0.47284
1,17.484226,380.175263,9.612419,0.584565


## Training

In [59]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC, LinearSVC

extra_features = ['w2v', 'tfidf']
X_cols_all = X_cols + ['w2v_sent', 'w2v_sent_mean', 'tfidf_sent', 'tfidf_sent_mean'] + extra_features

models = []
models.append(LinearSVC(penalty='l1'))
models.append(LogisticRegression(C=100))
models.append(DecisionTreeClassifier(max_depth=4))
models.append(RandomForestClassifier(n_estimators=3, max_depth=4))

for i in range(len(models)):
    models[i].fit(train_df[X_cols_all].values, train_df[Y_cols].values)
    

ValueError: Unsupported set of arguments: The combination of penalty='l1' and loss='squared_hinge' are not supported when dual=True, Parameters: penalty='l1', loss='squared_hinge', dual=True

In [60]:
from sklearn.metrics import auc, roc_curve
from matplotlib import pyplot as plt
from sklearn.metrics import precision_recall_curve

def plot_curves(model):
    probs = model.predict_proba(validate_df[X_colss[2]].values)[:,1]
    ytrue = validate_df[Y_cols].values

    fpr, tpr, thresholds = roc_curve(ytrue, probs, pos_label=1)
    score = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % score)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

    precision, recall, thresholds = precision_recall_curve(ytrue, probs)
    print(precision, recall, thresholds)
    plt.figure()
    plt.plot(precision, recall, color='darkorange')
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    plt.grid()
    plt.show()

    """
    thresholds = [0.25, 0.5, 0.75]
    for th in thresholds:
        #print(probs)
        predictions = [1 if prob > th else 0 for prob in probs]
        #predictions = model.predict(validate_df[X_cols].values)
        print(pd.Series(predictions).value_counts())
        print(classification_report(ytrue, predictions))
    """
    
    
plot_curves(modelss[2][2])

NameError: name 'modelss' is not defined

In [61]:
from sklearn.metrics import classification_report

for i, model in enumerate(models):
    print("TRAIN")
    predictions = model.predict(train_df[X_cols_all].values)
    print(pd.Series(predictions).value_counts())
    print(classification_report(train_df[Y_cols].values, predictions))  
    print("VAL")
    predictions = model.predict(validate_df[X_cols_all].values)
    print(pd.Series(predictions).value_counts())
    print(classification_report(validate_df[Y_cols].values, predictions))    
    print("-"*50)

TRAIN


NotFittedError: This LinearSVC instance is not fitted yet

## Testing

In [34]:
from sklearn.metrics import classification_report

def predictions_k(model, X, k):
    probs = model.predict_proba(X)
    return probs[:, 1] >= k

for j, models in enumerate(modelss):
    for i, model in enumerate(models):
        print(j ,i)
        predictions = model.predict(test_df1[X_colss[j]].values)
        predictions = predictions_k(model, test_df1[X_colss[j]].values, 0.6)
        print(pd.Series(predictions).value_counts())
        print(classification_report(test_df1[Y_cols].values, predictions))

0 0
True     71
False    64
dtype: int64
             precision    recall  f1-score   support

          0       0.70      0.69      0.70        65
          1       0.72      0.73      0.72        70

avg / total       0.71      0.71      0.71       135

0 1
True     70
False    65
dtype: int64
             precision    recall  f1-score   support

          0       0.71      0.71      0.71        65
          1       0.73      0.73      0.73        70

avg / total       0.72      0.72      0.72       135

0 2
False    74
True     61
dtype: int64
             precision    recall  f1-score   support

          0       0.68      0.77      0.72        65
          1       0.75      0.66      0.70        70

avg / total       0.72      0.71      0.71       135

0 3
True     73
False    62
dtype: int64
             precision    recall  f1-score   support

          0       0.68      0.65      0.66        65
          1       0.68      0.71      0.70        70

avg / total       0.68      0.

## Human Performance

In [None]:
predictions = test_df_human.is_answerable_human.apply(lambda x: label_map[x]).values
labels = test_df_human.is_answerable.apply(lambda x: label_map[x]).values
print(classification_report(labels, predictions))

## Saving

In [None]:
#with open('../../data/model_answerability.pkl', 'wb') as fp:
#    pickle.dump(modelss[4][2], fp)

In [None]:
with open('../../data/model_answerability.pkl', 'rb') as fp:
    model = pickle.load(fp)