In [55]:
import numpy as np
import pandas as pd
import string
import pickle

import sklearn
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import sent_tokenize

## Data Loading and Splits

In [56]:
filename = '/Users/mansi/capstone/annotations/all_annotated/final_annotated_data.csv'
#cols = ['question', 'review0', 'review1', 'review2', 'review3', 'review4', 'is_answerable_orig', "expert"]
#df = pd.read_csv('classification_data/train.csv')[cols]
df = pd.read_csv(filename)

print(df.is_answerable_orig.value_counts())
print('Total', len(df))

N    1574
S    1034
Y    1020
Name: is_answerable_orig, dtype: int64
Total 3629


In [57]:
#df = pd.read_csv('classification_data/final_annotated_data.csv')

def print_stats(df, name='df'):
    print(name.upper())
    print('Length = %d' % len(df))
    print('IsAnswerableOrig Counts')
    print(df.is_answerable_orig.value_counts())
    print('IsAnswerableNew Counts')
    print(df.is_answerable_new.value_counts())
    print()

In [58]:
train_df = df[~df.expert]
test_df = df[df.expert]
test_df = test_df[test_df.is_answerable_new.notnull()]

print_stats(train_df, 'train')
print_stats(test_df, 'test')

TRAIN
Length = 3348
IsAnswerableOrig Counts
N    1436
S     958
Y     954
Name: is_answerable_orig, dtype: int64
IsAnswerableNew Counts
Series([], Name: is_answerable_new, dtype: int64)

TEST
Length = 281
IsAnswerableOrig Counts
N    138
S     76
Y     66
Name: is_answerable_orig, dtype: int64
IsAnswerableNew Counts
N    134
S     77
Y     70
Name: is_answerable_new, dtype: int64



In [59]:
# Column Renames
cols = ['question', 'review0', 'review1', 'review2', 'review3', 'review4', 'is_answerable']

train_df = train_df.rename(columns={'is_answerable_orig': 'is_answerable'})[cols]

test_df = test_df.rename(columns={'is_answerable_orig': 'is_answerable', 
                                  'is_answerable_new': 'is_answerable_human'})[cols + ['is_answerable_human']]


def filter_data(df):
    print(len(df))
    df = df[df.is_answerable.notnull()]
    print(len(df))
    df = df[df.review0.notnull()]
    print(len(df))
    return df

train_df = filter_data(train_df)

print('\n')

test_df = filter_data(test_df)
test_df_human = test_df[test_df.is_answerable_human.notnull()]
print(len(test_df))

3348
3348
3303


281
280
134
134


In [60]:
train_df, validate_df = np.split(train_df.sample(frac=1), [int(.8*len(train_df))])

In [61]:
len(train_df), len(validate_df), len(test_df)

(2642, 661, 134)

In [62]:
def labels(df):
    df['label'] = df.is_answerable.apply(lambda x: label_map[x])
    return df['label']

def get_reviews(row):
    all_reviews = ''
    for key in ['review0', 'review1', 'review2', 'review3', 'review4']:
        if not isinstance(row[key], float):
            all_reviews += row[key].strip(' ').strip('-')
            all_reviews += ' '
    return all_reviews.strip()

def add_reviews(df):
    df['reviews'] = df.apply(lambda x: get_reviews(x), axis = 1)
    return df

label_map = {'N': 0, 'S': 1, 'Y': 1, 'A': 1}
for i, j in zip([1, 2, 3], [1, 1, 0]):
    label_map[i] = j
    label_map[str(i)] = j

train_df, test_df, validate_df = list(map(add_reviews, [train_df, test_df, validate_df]))
train_labels, test_labels, validate_labels = list(map(labels, [train_df, test_df, validate_df]))

## TF IDF Vectorizer

In [66]:
"""
df = pd.read_csv('classification_data/train-qar_sample_100000.csv')
df = add_reviews(df)
print(len(df))

tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_vectorizer.fit(list(df.question.values) + list(df.reviews.values))

with open('classification_data/q_classification_vectorizer.pkl', 'wb') as fp:
    pickle.dump(tfidf_vectorizer, fp)
"""

  interactivity=interactivity, compiler=compiler, result=result)


100002


In [67]:
with open('classification_data/q_classification_vectorizer.pkl', 'rb') as fp:
    tfidf_vectorizer = pickle.load(fp)

In [68]:
len(vectorizer.vocabulary_)

158752

## w2v Vectorizer

In [245]:
with open('classification_data/q_classification_vectorizer.pkl', 'rb') as fp:
    vectorizer = pickle.load(fp)

In [246]:
len(vectorizer.vocabulary_)

158752

In [None]:
import numpy as np

with open("classification_data/glove.840B.300d.txt", "rb") as lines:
    w2v = {str(line.split()[0].decode('UTF-8')): np.array(list(map(float, line.split()[1:])))
           for line in lines}

In [None]:
class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = 50

    def fit(self, X):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [None]:
df = pd.read_csv('classification_data/train-qar_sample_100000.csv')
df = df[df.question.notnull()]
df = df[df.review0.notnull()]
df = add_reviews(df)
print(len(df))

In [None]:
vectorizer = MeanEmbeddingVectorizer(w2v)
vectorizer.fit(list(df.question.values) + list(df.reviews.values))

In [None]:
# with open('classification_data/q_classification_w2v_vectorizer.pkl', 'wb') as fp:
#     pickle.dump(vectorizer, fp)

## Features

In [None]:
df = train_df

In [None]:
train_df_raw, test_df_raw, validate_df_raw = list(map(lambda x: x.copy(), [train_df, test_df, validate_df]))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(\
     list(train_df.question.values) + list(train_df.reviews.values) \
     + list(validate_df.question.values) + list(validate_df.reviews.values))

In [None]:
def tokenize(text):
    punctuations = string.punctuation.replace("\'", '')

    for ch in punctuations:
        text = text.replace(ch, " " + ch + " ")

    tokens = text.split()
    for i, token in enumerate(tokens):
        if not token.isupper():
            tokens[i] = token.lower()
    return tokens

In [None]:
x = vectorizer.transform(['It is a total idiot'])
y = vectorizer.transform(['It is not a total idiocy'])

print(type(x), '|\n', x.shape, '|\n', x, '|\n', len(x.toarray()[0]))
print("-"*50)
print(type(y), '|\n', y.shape, '|\n', y, '|\n', len(y.toarray()[0]))
print("-"*50)
print(x.toarray().dot(y.toarray().transpose())[0][0])

In [None]:
def n_intersection(q, r):
    return len(set(q).intersection(set(r)))

def tf_idf_sim(q, r):
    # dot product of q and r as tfidf vectors
    q_vec = vectorizer.transform([q])
    r_vec = vectorizer.transform([r])
#     return q_vec.dot(r_vec.transpose()).toarray()[0][0]
    return q_vec.toarray().dot(r_vec.toarray().transpose())[0][0]

def tf_idf_sim_sentence(q, r):
    # max of dot products of q and each sentence in r as tfidf vectors
    q_vec = vectorizer.transform([q])
    rs = sent_tokenize(r)
    if len(rs) == 0:
        return 0
    return max([q_vec.dot(vectorizer.transform([r]).transpose()).toarray()[0][0] for r in rs])

def add_features(df):
    df['q_tokens'] = df.question.apply(lambda x: tokenize(x))
    df['r_tokens'] = df.reviews.apply(lambda x: tokenize(x))
    df['n_q'] = df.q_tokens.apply(lambda x: len(x))
    df['n_r'] = df.r_tokens.apply(lambda x: len(x))
    df['n_intersection'] = df.apply(lambda x: len(set(x.q_tokens).intersection(set(x.r_tokens))), axis=1)
    df['intr_frac'] = df.n_intersection / df.n_q
    df['tfidf'] = df.apply(lambda x: tf_idf_sim(x.question, x.reviews), axis=1)
#     df['tfidf_sent'] = df.apply(lambda x: tf_idf_sim_sentence(x.question, x.reviews), axis=1)
    return df

In [None]:
train_df, test_df, validate_df = list(map(add_features, [train_df_raw, test_df_raw, validate_df_raw]))

## Analysis

In [None]:
df = train_df

q = df.question.iloc[1]
r = df.reviews.iloc[1]
q_tokens = df.q_tokens.iloc[1]
r_tokens = df.r_tokens.iloc[1]

# q, r, set(q_tokens).intersection(set(r_tokens))

In [None]:
df[['label', 'n_q', 'n_r', 'n_intersection', 'intr_frac', 'tfidf']].groupby('label').mean()

## Training

In [None]:
X_cols = ['n_q', 'n_r', 'n_intersection', 'intr_frac', 'tfidf']
Y_cols = 'label'

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

models = []
models.append(LogisticRegression(C=1))
models.append(LogisticRegression(C=100))
models.append(DecisionTreeClassifier(max_depth=4))
models.append(RandomForestClassifier(n_estimators=3, max_depth=4))

for i in range(len(models)):
    models[i].fit(train_df[X_cols].values, train_df[Y_cols].values)

In [None]:
print((train_df[X_cols].values[0]), "|||" , (train_df[Y_cols].values[0]))

In [None]:
for i in range(len(models)):
    print((models[i].predict(train_df[X_cols].values) == train_df[Y_cols].values).mean())    
    print((models[i].predict(validate_df[X_cols].values) == validate_df[Y_cols].values).mean())
    #print(models[i].predict_proba(validate_df[X_cols].values))
    print()

In [None]:
from sklearn.metrics import auc, roc_curve
from matplotlib import pyplot as plt
from sklearn.metrics import precision_recall_curve
import scikitplot as skplt

def plot_curves(model):
    probs = model.predict_proba(validate_df[X_cols].values)[:,1]
    ytrue = validate_df[Y_cols].values

    fpr, tpr, thresholds = roc_curve(ytrue, probs, pos_label=1)
    score = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % score)
    plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

    precision, recall, thresholds = precision_recall_curve(ytrue, probs)
    #print(precision, recall, thresholds)
    plt.figure()
    plt.plot(precision, recall, color='darkorange')
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    #plt.xlim([0.0, 1.0])
    #plt.ylim([0.0, 1.05])
    plt.grid()
    plt.show()

    """
    thresholds = [0.25, 0.5, 0.75]
    for th in thresholds:
        #print(probs)
        predictions = [1 if prob > th else 0 for prob in probs]
        #predictions = model.predict(validate_df[X_cols].values)
        print(pd.Series(predictions).value_counts())
        print(classification_report(ytrue, predictions))
    """
    
    

for i, model in enumerate(models):
    print(i)
    plot_curves(model)

## Testing

In [191]:
from sklearn.metrics import classification_report

predictions = model.predict(test_df[X_cols].values)
print(pd.Series(predictions).value_counts())
print(classification_report(test_df[Y_cols].values, predictions))

1    98
0    36
dtype: int64
             precision    recall  f1-score   support

          0       0.78      0.43      0.55        65
          1       0.62      0.88      0.73        69

avg / total       0.70      0.66      0.65       134



## Human Performance

In [192]:
predictions = test_df_human.is_answerable_human.apply(lambda x: label_map[x]).values
labels = test_df_human.is_answerable.apply(lambda x: label_map[x]).values
print(classification_report(labels, predictions))

             precision    recall  f1-score   support

          0       0.91      0.80      0.85        65
          1       0.83      0.93      0.88        69

avg / total       0.87      0.87      0.86       134



## Saving

In [None]:
# with open('q_classification_model_2.pkl', 'wb') as fp:
#     pickle.dump(model, fp)

In [None]:
# with open('classification_data/q_classification_model.pkl', 'rb') as fp:
#     model = pickle.load(fp)