In [231]:
import numpy as np
import pandas as pd
import string

import sklearn
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

## Data Loading and Splits

In [172]:
cols = ['question', 'review0', 'review1', 'review2', 'review3', 'review4', 'Is_Answerable']
df = pd.read_csv('classification_data/train.csv')[cols]

def filter_data(df):
    df = df[df.Is_Answerable.notnull()]
    df = df[df.review0.notnull()]
    return df

print(df.Is_Answerable.value_counts())
print('Total', len(df))

N    187
S     69
Y     63
A     10
Name: Is_Answerable, dtype: int64
Total 329


In [173]:
train_df, validate_df, test_df = np.split(df.sample(frac=1), [int(.6*len(df)), int(.8*len(df))])

In [174]:
train_df, test_df, validate_df = list(map(filter_data, [train_df, test_df, validate_df]))

In [175]:
len(train_df), len(validate_df), len(test_df)

(193, 66, 64)

In [271]:
def labels(df):
    df['label'] = df.Is_Answerable.apply(lambda x: label_map[x])
    return df['label']

def get_reviews(row):
    all_reviews = ''
    for key in ['review0', 'review1', 'review2', 'review3', 'review4']:
        if not isinstance(row[key], float):
            all_reviews += row[key].strip(' ')
            all_reviews += ' '
    return all_reviews.strip()

def add_reviews(df):
    df['reviews'] = df.apply(lambda x: get_reviews(x), axis = 1)
    return df

label_map = {'N': 0, 'S': 2, 'Y': 1, 'A': 1}
train_df, test_df, validate_df = list(map(add_reviews, [train_df, test_df, validate_df]))
train_labels, test_labels, validate_labels = list(map(labels, [train_df, test_df, validate_df]))

## TF IDF Vectorizer

In [272]:
with open('classification_data/q_classification_model.pkl', 'rb') as fp:
    vectorizer = pickle.load(fp)

In [273]:
len(vectorizer.vocabulary_)

150976

In [274]:
# df = pd.read_csv('samples/train-qar_sample_100000.csv')
# df = df[df.question.notnull()]
# df = df[df.review0.notnull()]
# df = add_reviews(df)
# print(len(df))

# vectorizer = TfidfVectorizer(stop_words='english')
# vectorizer.fit(list(df.question.values) + list(df.reviews.values))

In [275]:
# with open('classification_data/q_classification_model.pkl', 'wb') as fp:
#     pickle.dump(vectorizer, fp)

## Features

In [276]:
df = train_df

In [277]:
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer(stop_words='english')
# vectorizer.fit(\
#     list(train_df.question.values) + list(train_df.reviews.values) \
#     + list(validate_df.question.values) + list(validate_df.reviews.values))

In [278]:
def tokenize(text):
    punctuations = string.punctuation.replace("\'", '')

    for ch in punctuations:
        text = text.replace(ch, " " + ch + " ")

    tokens = text.split()
    for i, token in enumerate(tokens):
        if not token.isupper():
            tokens[i] = token.lower()
    return tokens

In [279]:
def n_intersection(q, r):
    return len(set(q).intersection(set(r)))

def tf_idf_sim(q, r):
    q_vec = vectorizer.transform([q])
    r_vec = vectorizer.transform([r])
    return q_vec.dot(r_vec.transpose()).toarray()[0][0]

def add_features(df):
    df['q_tokens'] = df.question.apply(lambda x: tokenize(x))
    df['r_tokens'] = df.reviews.apply(lambda x: tokenize(x))
    df['n_q'] = df.q_tokens.apply(lambda x: len(x))
    df['n_r'] = df.r_tokens.apply(lambda x: len(x))
    df['n_intersection'] = df.apply(lambda x: len(set(x.q_tokens).intersection(set(x.r_tokens))), axis=1)
    df['intr_frac'] = df.n_intersection / df.n_q
    df['tfidf'] = df.apply(lambda x: tf_idf_sim(x.question, x.reviews), axis=1)
    return df

In [280]:
train_df, test_df, validate_df = list(map(add_features, [train_df, test_df, validate_df]))

## Analysis

In [281]:
df = train_df

q = df.question.iloc[1]
r = df.reviews.iloc[1]
q_tokens = df.q_tokens.iloc[1]
r_tokens = df.r_tokens.iloc[1]

# q, r, set(q_tokens).intersection(set(r_tokens))

In [282]:
df[['label', 'n_q', 'n_r', 'n_intersection', 'intr_frac', 'tfidf']].groupby('label').mean()

Unnamed: 0_level_0,n_q,n_r,n_intersection,intr_frac,tfidf
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,14.699115,386.274336,7.566372,0.525509,0.149881
1,18.302326,409.465116,10.72093,0.615626,0.220657
2,15.756757,398.054054,9.486486,0.638918,0.259254


## Training

In [283]:
X_cols = ['n_q', 'n_r', 'n_intersection', 'intr_frac', 'tfidf']
Y_cols = 'label'

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# model = LogisticRegression(C=100)
# model = DecisionTreeClassifier()
model = RandomForestClassifier(n_estimators=5)
model.fit(train_df[X_cols].values, train_df[Y_cols].values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [284]:
(model.predict(validate_df[X_cols].values) == validate_df[Y_cols].values).mean()

0.48484848484848486

In [285]:
predictions = model.predict(validate_df[X_cols].values)
print(pd.Series(predictions).value_counts())
print(classification_report(validate_df[Y_cols].values, predictions))

0    39
1    17
2    10
dtype: int64
              precision    recall  f1-score   support

           0       0.56      0.71      0.63        31
           1       0.41      0.41      0.41        17
           2       0.30      0.17      0.21        18

   micro avg       0.48      0.48      0.48        66
   macro avg       0.43      0.43      0.42        66
weighted avg       0.45      0.48      0.46        66



## Testing

In [286]:
from sklearn.metrics import classification_report

predictions = model.predict(test_df[X_cols].values)
print(pd.Series(predictions).value_counts())
print(classification_report(test_df[Y_cols].values, predictions))

0    44
1    11
2     9
dtype: int64
              precision    recall  f1-score   support

           0       0.57      0.68      0.62        37
           1       0.18      0.15      0.17        13
           2       0.00      0.00      0.00        14

   micro avg       0.42      0.42      0.42        64
   macro avg       0.25      0.28      0.26        64
weighted avg       0.37      0.42      0.39        64



## Saving

In [189]:
import pickle
# with open('q_classification_model_2.pkl', 'wb') as fp:
#     pickle.dump(model, fp)

In [191]:
# with open('classification_data/q_classification_model.pkl', 'rb') as fp:
#     model = pickle.load(fp)

TypeError: a bytes-like object is required, not 'str'