In [288]:
import numpy as np
import pandas as pd
import string

import sklearn

In [289]:
def tokenize(text):
    punctuations = string.punctuation.replace("\'", '')

    for ch in punctuations:
        text = text.replace(ch, " " + ch + " ")

    tokens = text.split()
    for i, token in enumerate(tokens):
        if not token.isupper():
            tokens[i] = token.lower()
    return tokens

In [290]:
df = pd.read_csv('~/Downloads/is_answerable_Electronics_samples_400_100_1 - is_answerable_Electronics_samples_400_100_1.tsv', sep='\t')
df = df[~ df.is_answerable.isnull()]

label_map = {'N': 0, 'S': 1, 'Y': 1}
df['label'] = df.is_answerable.apply(lambda x: label_map[x])

In [291]:
numberings = ['1)', '2)', '3)', '4)', '5)']
def remove_numberings(x):
    for i in numberings:
        x = x.replace(i, '')
    return x

In [292]:
def n_intersection(q, r):
    return len(set(q).intersection(set(r)))

df['q_tokens'] = df.question.apply(lambda x: tokenize(x))
df['r_tokens'] = df.reviews_q.apply(lambda x: tokenize(remove_numberings(x)))
df['n_q'] = df.q_tokens.apply(lambda x: len(x))
df['n_r'] = df.r_tokens.apply(lambda x: len(x))
df['n_intersection'] = df.apply(lambda x: len(set(x.q_tokens).intersection(set(x.r_tokens))), axis=1)
df['intr_frac'] = df.n_intersection / df.n_q

In [293]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english')
vectorizer.fit(list(df.question.values) + list(df.reviews_q.values))

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [294]:
def tf_idf_sim(q, r):
    q_vec = vectorizer.transform([q])
    r_vec = vectorizer.transform([r])
    return q_vec.dot(r_vec.transpose()).toarray()[0][0]

In [295]:
df['tfidf'] = df.apply(lambda x: tf_idf_sim(x.question, x.reviews_q), axis=1)

In [296]:
q = df.question.iloc[1]
r = remove_numberings(df.reviews_q.iloc[1])
set(q_tokens).intersection(set(r_tokens))

{'can', 'recorder', 'the', 'this'}

In [297]:
df[['label', 'n_q', 'n_r', 'n_intersection', 'intr_frac', 'tfidf']].groupby('label').mean()

Unnamed: 0_level_0,n_q,n_r,n_intersection,intr_frac,tfidf
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,22.668571,366.731429,9.942857,0.511709,0.14425
1,20.844262,394.385246,11.213115,0.600395,0.217068


In [323]:
df2 = df.sample(290)
train, test = df2.iloc[:190], df[190:]

In [316]:
X_cols = ['n_q', 'n_r', 'n_intersection', 'intr_frac', 'tfidf']
Y_cols = 'label'

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
model = LogisticRegression()
model = DecisionTreeClassifier()
model = RandomForestClassifier(n_estimators=5)
model.fit(train[X_cols].values, train[Y_cols].values)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [317]:
(model.predict(test[X_cols].values) == test[Y_cols].values).mean()

0.87850467289719625

In [324]:
from sklearn.metrics import classification_report
print(classification_report(test[Y_cols].values, model.predict(test[X_cols].values)))

              precision    recall  f1-score   support

           0       0.88      0.94      0.91        71
           1       0.87      0.75      0.81        36

   micro avg       0.88      0.88      0.88       107
   macro avg       0.88      0.85      0.86       107
weighted avg       0.88      0.88      0.88       107



In [322]:
sum(model.predict(test[X_cols].values) == 1)

31

In [326]:
import pickle
with open('q_classification_model.pkl', 'wb') as fp:
    pickle.dump(model, fp)

In [356]:
df3 = df.head()

In [359]:
for row_id, (i, row) in enumerate(df3.iterrows()):
    df3.at[i, 'question'] = 'a'

In [360]:
df3

Unnamed: 0,sl_no,id,Assignee,question,reviews_q,reviews_a1,reviews_a2,reviews_a3,is_answerable,Unnamed: 9,label,q_tokens,r_tokens,n_q,n_r,n_intersection,intr_frac,tfidf
99,99,"(536,3)",M,a,1) these are very nice monitors!! very clear v...,Answer: Standard US household current Reviews...,Answer: It plugs directly into the 120 wall ou...,Answer: It uses a power brick. The monitor req...,N,,0,"[what, is, power, source, ?, 6v, ,, 12v, ?, ?, ?]","[these, are, very, nice, monitors, !, !, very,...",11,413,2,0.181818,0.0
100,100,"(3498,4)",M,a,1) That's a good sized file. And recording a s...,Answer: I don't know what pyle mini speakers a...,"Answer: If the pyle speakers are powered,(have...",Answer: What for? This is the worst machine in...,N,,0,"[can, i, connect, pyle, mini, speakers, the, t...","[that's, a, good, sized, file, ., and, recordi...",10,464,4,0.4,0.058293
101,101,"(3971,6)",M,a,1) The difference between this and an ordinary...,Answer: Yes it will. Remember however that the...,Answer: Yes. Any EF mount camera. Reviews: 1)...,Answer: Yes. And trust me it's worth it! Revi...,S,,1,"[will, this, work, on, a, canon, EOS, 50D, ?]","[the, difference, between, this, and, an, ordi...",9,386,8,0.888889,0.17837
102,102,"(576,2)",M,a,1) For the money I don't think you can do bett...,Answer: No. It's a piece of crap. Don't buy R...,"Answer: I agree with William, I used it for 1 ...",Answer: Mine came with the wide angle lens but...,N,,0,"[does, the, camera, come, with, a, ref, filter...","[for, the, money, I, don't, think, you, can, d...",10,217,5,0.5,0.023271
103,103,"(3651,1)",M,a,1) First thing I would like to point out is ho...,Answer: Umbrella inserts below the tilt joint ...,Answer: Yes it does. But the screw that locks ...,Answer: It's pictured upside down. The umbrell...,Y,,1,"[does, the, umbrella, hole, come, in, from, th...","[first, thing, I, would, like, to, point, out,...",21,285,12,0.571429,0.115998
