In [3]:
import pandas as pd

In [4]:
def jaccard_similarity(query, document):
    """Return the Jaccard similarity of two whitespace-tokenised strings.

    Both arguments are split on whitespace and reduced to sets of unique
    tokens; the result is ``|intersection| / |union|``.

    Parameters
    ----------
    query, document : str
        Texts to compare.

    Returns
    -------
    float
        A value in ``[0, 1]``. When neither text contains any token the
        similarity is defined as ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    query_tokens = set(query.split())
    document_tokens = set(document.split())
    union = query_tokens | document_tokens
    if not union:
        # Two empty (or whitespace-only) texts share nothing measurable.
        return 0.0
    return len(query_tokens & document_tokens) / len(union)

In [10]:
def get_most_similar_review(row, raw_source):
    """Pick the customer's review whose product text best matches ``row``'s product.

    Every review in ``raw_source`` belonging to ``row.customer_id`` is scored
    with ``jaccard_similarity`` between ``row.prod_corpus`` and the ``corpus``
    of the reviewed product, looked up in the global ``product_df`` by the
    review's ``product_parent``.

    Parameters
    ----------
    row
        Record exposing ``prod_corpus`` and ``customer_id`` attributes.
    raw_source : pandas.DataFrame
        Review table that can be filtered on ``customer_id`` and whose rows
        expose ``product_parent``.

    Returns
    -------
    pandas.Series or None
        The best-scoring review row. ``None`` when ``row.prod_corpus`` is not
        text, or when no candidate review scores strictly above zero.

    NOTE(review): if ``raw_source`` also contains the review that ``row`` was
    built from, that review scores 1.0 against itself -- confirm the caller
    excludes it.
    """
    target_corpus = row.prod_corpus
    # A missing text cell is not a str (pandas yields float NaN for it) and
    # cannot be tokenised.
    if not isinstance(target_corpus, str):
        return None

    # Bind the id with ``@`` instead of formatting it into the expression, so
    # non-numeric ids are compared as values rather than parsed as names.
    customer_id = row.customer_id
    cust_reviews = raw_source.query('customer_id == @customer_id')

    max_sim, max_row = 0, None
    for _, review in cust_reviews.iterrows():
        if review.product_parent not in product_df.index:
            continue
        candidate_corpus = product_df.loc[review.product_parent].corpus
        if not isinstance(candidate_corpus, str):
            continue
        sim = jaccard_similarity(target_corpus, candidate_corpus)
        # Strict comparison: the first review reaching the maximum wins and
        # zero-overlap reviews are never selected.
        if sim > max_sim:
            max_sim, max_row = sim, review

    return max_row

In [31]:
from sklearn.neighbors import NearestNeighbors
from sklearn import preprocessing
def get_most_similar_product(row, raw_source):
    """Return the product reviewed by the customer that is nearest to ``row``'s product.

    Candidate products are the rows of the global ``product_df`` whose index
    appears among the ``product_parent`` values of the customer's reviews in
    ``raw_source``. All columns except ``corpus`` are min-max scaled (the
    scaler is fitted on the candidates only) and a 1-nearest-neighbour search
    locates the candidate closest to the target product.

    Parameters
    ----------
    row
        Record exposing ``product_parent`` and ``customer_id`` attributes.
    raw_source : pandas.DataFrame
        Review table that can be filtered on ``customer_id`` and that has a
        ``product_parent`` column.

    Returns
    -------
    pandas.Series or None
        The matching row of ``product_df``. ``None`` when the target product
        is not in ``product_df`` or the customer has no reviewed product in it.

    NOTE(review): if ``raw_source`` contains this customer's review of
    ``row.product_parent``, the target is one of the candidates and is
    returned as its own nearest neighbour -- confirm this is intended.
    """
    if row.product_parent not in product_df.index:
        return None
    prod = product_df.loc[row.product_parent]

    # Bind the id with ``@`` instead of formatting it into the expression, so
    # non-numeric ids are compared as values rather than parsed as names.
    customer_id = row.customer_id
    cust_reviews = raw_source.query('customer_id == @customer_id')
    products = product_df[product_df.index.isin(cust_reviews.product_parent.tolist())]
    if len(products) == 0:
        return None

    min_max_scaler = preprocessing.MinMaxScaler()
    scaled_candidates = min_max_scaler.fit_transform(products.drop('corpus', axis=1))
    nbrs = NearestNeighbors(n_neighbors=1, algorithm='ball_tree').fit(scaled_candidates)

    # The target is scaled with the candidates' ranges, so it may fall outside [0, 1].
    scaled_target = min_max_scaler.transform([prod.drop('corpus')])
    nearest_pos = nbrs.kneighbors(scaled_target, return_distance=False)[0][0]
    return products.iloc[nearest_pos]

In [20]:
# Per-product table, indexed by the first CSV column. The first two data rows
# are discarded (presumably leftover header rows from the export -- TODO
# confirm) and the four vote columns are removed.
product_df = (
    pd.read_csv('product_df.csv', index_col=0)
    .iloc[2:]
    .drop(['total_votes', 'total_votes.1', 'helpful_votes', 'helpful_votes.1'], axis=1)
)
product_df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,review_id,star_rating,star_rating.1,corpus
1602,3,3.6666666666666665,1.5275252316519468,keyboard month featur problems. < br / > < br ...
2696,3,3.6666666666666665,1.5275252316519468,one pad roccat cu surfac metal foil. < br / > ...
3140,1,1.0,0.0,peel
4318,23,4.086956521739131,1.202763616096502,dimens purs i ipad mini im sick god tablet i t...
10581,7,5.0,0.0,did extens cabl bulk cabl length odd patch cab...


In [21]:
# Test split: ``test_raw`` is the review table handed to the predictors as
# ``raw_source``; ``test_df`` holds the rows that get scored.
test_raw, test_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('test_raw.csv', 'test_df.csv')
)

In [97]:
# Evaluate the Jaccard-based predictor on the test split. A rating above 3
# counts as "positive"; a prediction is correct when it falls on the same side
# of that cut as the true rating. Rows without a prediction count as bad data.
correct = bad_data = positive = 0
n_rows = len(test_df)
for _idx, row in test_df.iterrows():
    pred = get_most_similar_review(row, test_raw)
    if pred is None:
        bad_data += 1
        continue
    pred_rating, true_rating = pred.star_rating, row.star_rating
    if pred_rating > 3:
        positive += 1
        if true_rating > 3:
            correct += 1
    elif pred_rating <= 3 and true_rating <= 3:
        correct += 1

# All three rates use the full test-set size as denominator.
for label, count in (('Correct:', correct), ('Bad:', bad_data), ('Positive:', positive)):
    print(label, count, count / n_rows)

Correct: 16565 0.7194353963083605
Bad: 57 0.00247557003257329
Positive: 18185 0.7897937024972855


In [121]:
# Evaluate the nearest-neighbour product predictor on the test split. A rating
# above 3 counts as "positive"; a prediction is correct when it falls on the
# same side of that cut as the true rating. Rows without a prediction count as
# bad data.
correct = bad_data = positive = 0
n_rows = len(test_df)
for _idx, row in test_df.iterrows():
    pred = get_most_similar_product(row, test_raw)
    if pred is None:
        bad_data += 1
        continue
    pred_rating, true_rating = pred.star_rating, row.star_rating
    if pred_rating > 3:
        positive += 1
        if true_rating > 3:
            correct += 1
    elif pred_rating <= 3 and true_rating <= 3:
        correct += 1

# All three rates use the full test-set size as denominator.
for label, count in (('Correct:', correct), ('Bad:', bad_data), ('Positive:', positive)):
    print(label, count, count / n_rows)

Correct: 12026 0.5223018458197611
Bad: 7204 0.3128773072747014
Positive: 15010 0.651900108577633


In [22]:
def get_pred(row, raw_source):
    """Return the ``(jaccard_rating, knn_rating)`` pair predicted for ``row``.

    Each element is the ``star_rating`` of the match returned by
    ``get_most_similar_review`` / ``get_most_similar_product`` respectively,
    or ``None`` when that predictor found no match. A tuple is always
    returned, including ``(None, None)``.
    """
    knn_match = get_most_similar_product(row, raw_source)
    jaccard_match = get_most_similar_review(row, raw_source)

    jaccard_rating = None if jaccard_match is None else jaccard_match.star_rating
    knn_rating = None if knn_match is None else knn_match.star_rating
    return jaccard_rating, knn_rating

In [130]:
# Evaluate the combined predictor on the test split.
#
# get_pred returns a (jaccard_rating, knn_rating) pair and never a bare None,
# so the pair has to be unpacked before it can be compared with the rating
# cut-off; comparing the tuple itself to 3 raises TypeError.
# NOTE(review): the two ratings are combined with an unweighted mean when both
# exist. The figures recorded under this cell presumably came from an earlier
# get_pred definition -- confirm the intended combination rule.
correct = 0
bad_data = 0
positive = 0
for i, row in test_df.iterrows():
    jacc_rating, knn_rating = get_pred(row, test_raw)
    if jacc_rating is None and knn_rating is None:
        bad_data += 1
        continue
    if jacc_rating is None:
        pred = knn_rating
    elif knn_rating is None:
        pred = jacc_rating
    else:
        pred = (jacc_rating + knn_rating) / 2
    if (pred > 3):
        positive += 1
    if (pred > 3 and row.star_rating > 3):
        correct += 1
    if (pred <= 3 and row.star_rating <= 3):
        correct += 1

print('Correct:', correct, correct / len(test_df))
print('Bad:', bad_data, bad_data / len(test_df))
print('Positive:', positive, positive / len(test_df))

Correct: 17212 0.7475352877307274
Bad: 36 0.0015635179153094462
Positive: 19899 0.8642345276872965


In [23]:
# Training split: ``train_raw`` is the review table handed to the predictors
# as ``raw_source``; ``train_df`` holds the rows that get scored.
train_raw, train_df = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('train_raw.csv', 'train_df.csv')
)

In [32]:
# One (jaccard_rating, knn_rating, true_rating) triple per training row.
res = [
    (*get_pred(row, train_raw), row.star_rating)
    for _, row in train_df.iterrows()
]

In [33]:
def get_weighted(jacc, knn, alpha):
    """Blend the two ratings as ``alpha * jacc + (1 - alpha) * knn``.

    When only one rating is available it is returned unchanged (``alpha`` is
    ignored); when both are ``None`` the result is ``None``.
    """
    if jacc is None:
        # Also covers the "both missing" case: knn is None and is returned as such.
        return knn
    if knn is None:
        return jacc
    return alpha * jacc + (1 - alpha) * knn

In [34]:
# Sweep the blending weight alpha over 0.00, 0.01, ..., 1.00 on the cached
# training predictions and keep the alpha with the highest accuracy.
# Accuracy is measured against the full training-set size, so rows without any
# prediction count as misses. Ties keep the smallest alpha (strict ">").
accs = []
best_alpha = 0
best_acc = 0
n_train = len(train_df)
for alpha_big in range(101):
    alpha = alpha_big / 100
    correct = 0
    positive = 0
    bad_data = 0
    for jacc, knn, truth in res:
        pred = get_weighted(jacc, knn, alpha)
        if pred is None:
            bad_data += 1
            continue
        if pred > 3:
            positive += 1
        # Correct when prediction and truth fall on the same side of the cut.
        if (pred > 3 and truth > 3) or (pred <= 3 and truth <= 3):
            correct += 1
    acc = correct / n_train
    accs.append(acc)
    if acc > best_acc:
        best_acc, best_alpha = acc, alpha

print('Best Results at Alpha: {} Acc: {}'.format(best_alpha, best_acc))

Best Results at Alpha: 0.22 Acc: 0.779257132083134


In [35]:
# One (jaccard_rating, knn_rating, true_rating) triple per test row.
test_res = [
    (*get_pred(row, test_raw), row.star_rating)
    for _, row in test_df.iterrows()
]

In [39]:
# Sweep the blending weight alpha over 0.00, 0.01, ..., 1.00 on the cached
# test predictions and report the confusion breakdown at the most accurate
# alpha. Ratings above `threshhold` are "positive". Every rate is a fraction
# of all test rows (not a per-class rate), so rows without a prediction count
# against accuracy. Ties keep the smallest alpha (strict ">").
# NOTE(review): alpha is selected on the test split here, which makes the
# reported accuracy optimistic; consider reporting the test accuracy at the
# alpha chosen on the training split instead.
threshhold = 3
test_accs = []
test_best_alpha = 0
test_best_acc = 0
test_best_pos = 0
test_best_correct_pos = 0
test_best_correct_neg = 0
test_best_false_pos = 0
test_best_false_neg = 0
n_test = len(test_df)
for alpha_big in range(101):
    alpha = alpha_big / 100
    correct = 0
    positive = 0
    bad_data = 0
    correct_pos = 0
    correct_neg = 0
    false_neg = 0
    false_pos = 0
    for jacc, knn, truth in test_res:
        pred = get_weighted(jacc, knn, alpha)
        if pred is None:
            bad_data += 1
            continue
        if pred > threshhold:
            positive += 1
            if truth > threshhold:
                correct += 1
                correct_pos += 1
            elif truth <= threshhold:
                false_pos += 1
        elif pred <= threshhold:
            if truth <= threshhold:
                correct += 1
                correct_neg += 1
            elif truth > threshhold:
                false_neg += 1
    acc = correct / n_test
    test_accs.append(acc)
    if acc > test_best_acc:
        test_best_acc, test_best_alpha, test_best_pos = acc, alpha, positive / n_test
        test_best_correct_neg = correct_neg
        test_best_correct_pos = correct_pos
        test_best_false_pos = false_pos
        test_best_false_neg = false_neg

print('Best Results at Alpha: {} Acc: {} Pos: {}'.format(test_best_alpha, test_best_acc, test_best_pos))
print('Correct Neg:', test_best_correct_neg / n_test)
print('Correct Pos:', test_best_correct_pos / n_test)
print('False Neg:', test_best_false_neg / n_test)
print('False Pos:', test_best_false_pos / n_test)

Best Results at Alpha: 0.32 Acc: 0.7565689467969599 Pos: 0.9046254071661238
Correct Neg: 0.04030401737242128
Correct Pos: 0.7162649294245386
False Neg: 0.053507057546145494
False Pos: 0.18836047774158524


In [147]:
# Highest accuracy recorded in `accs` by the training alpha sweep.
# NOTE(review): the value displayed below differs from the best accuracy
# printed by the sweep cell, which suggests it came from a different run --
# re-execute the notebook top to bottom to confirm.
max(accs)

0.775109138638704