In [1]:
import sys
import pandas as pd
import numpy as np
import pickle
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [2]:
# run undersampled logistic on subset B to check predictions
SEED = 229  # single seed shared by the train/test split and the undersampling draw

# read/prep data: drop incomplete rows, make the quote flag an int, and turn
# the stringified token list ("['a', 'b']") into a space-separated string
dat = (
    pd.read_csv("data/tokenized_reviews.csv")
    .dropna()
    .assign(
        quote=lambda d: d["quote"].astype(int),
        tokenized_words=lambda d: d["tokenized_words"].apply(
            lambda x: x.strip("[']").replace("', '"," ")
        ),
    )
)

# 85% train / 15% test
X_train, X_test, y_train, y_test = train_test_split(dat.drop(columns=["popular"]),
                                                    dat["popular"],
                                                    test_size = 0.15,
                                                    random_state = SEED)

# feature subsets
subset_b = ["user_reviews","days_since_review","user_rating","rating_diff",
            "num_words","avg_word_len","avg_sent_len","pct_verbs",
            "pct_nouns","pct_adj","quote","sentiment"]

# undersample train set: randomly drop class-0 rows until both classes have
# the same number of training rows
majority_indices = y_train[y_train==0].index
majority_size = len(majority_indices)
minority_size = len(y_train[y_train==1])
# clamp at zero so an already balanced (or inverted) split drops nothing
# instead of passing a negative sample size to rng.choice
n_drop = max(majority_size - minority_size, 0)
rng = np.random.default_rng(seed=SEED)
drop_indices = rng.choice(majority_indices, n_drop, replace=False)
X_train = X_train.drop(drop_indices)
y_train = y_train.drop(drop_indices)

# has_constant="add" always inserts the intercept column; the default ("skip")
# silently omits it when the data already contains a constant column, which
# would leave the train and test design matrices with different shapes
log_reg = sm.Logit(y_train, sm.add_constant(X_train[subset_b], has_constant="add")).fit()
predictions = log_reg.predict(
    sm.add_constant(X_test[subset_b], has_constant="add")
).rename("predictions")  # named so it can be merged as a column later

# display the predicted probabilities (a bare expression, since an in-place
# rename returns None on current pandas and would show nothing)
predictions

Optimization terminated successfully.
         Current function value: 0.626418
         Iterations 6


1148226    0.345244
1898195    0.235176
1287911    0.382331
941466     0.715816
205890     0.597087
             ...   
1965816    0.439618
1848117    0.290346
1861785    0.541369
423898     0.331198
1063051    0.428067
Name: predictions, Length: 297156, dtype: float64

In [3]:
# merge test dataset with full set of features and predictions
# (every join below is index-on-index)
raw = pd.read_csv("data/filtered_reviews.csv")
dat = (
    X_test
    .merge(raw, left_index=True, right_index=True)
    # columns present in both frames come back suffixed; drop the X_test
    # ("_x") copies of these three and keep the ones from raw ("_y")
    .drop(columns=["user_reviews_x","days_since_review_x","user_rating_x"])
    # predictions is a Series named "predictions", so it joins as one column
    .merge(predictions, left_index=True, right_index=True)
)

In [4]:
# error classification: label each row with its confusion-matrix cell,
# treating a predicted probability >= 0.5 as a "popular" prediction
actual_neg = dat["popular"]==0
actual_pos = dat["popular"]==1
pred_neg = dat["predictions"]<0.5
pred_pos = dat["predictions"]>=0.5

conditions = [
    actual_neg & pred_neg,
    actual_neg & pred_pos,
    actual_pos & pred_pos,
    actual_pos & pred_neg
]
values = ["TN","FP","TP","FN"]

# np.select's implicit default is the integer 0, which has no common dtype
# with the string labels: newer NumPy releases raise a TypeError, older ones
# label unmatched rows "0". Use an explicit string for rows that match no
# condition (e.g. a missing prediction).
dat["error"] = np.select(conditions, values, default="unclassified")

In [5]:
# is prediction probability correlated with like share
# (2x2 correlation matrix of the two columns)
dat.loc[:, ["like_share", "predictions"]].corr()

Unnamed: 0,like_share,predictions
like_share,1.0,0.29066
predictions,0.29066,1.0


In [6]:
# summary statistics for the rows labelled as false positives
dat.loc[dat["error"].eq("FP")].describe()

Unnamed: 0,rating_diff,num_words,avg_word_len,avg_sent_len,pct_verbs,pct_nouns,pct_adj,quote,sentiment,book_id,user_reviews_y,user_rating_y,avg_rating,ratings_count,days_since_review_y,review_likes,like_share,popular,predictions
count,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0,60103.0
mean,-0.080861,381.576261,4.335578,17.950723,0.190978,0.235747,0.164877,0.474302,0.161806,16307310.0,494.536146,3.820408,3.901269,79609.56,1210.078249,4.16718,0.006466,0.0,0.630628
std,1.142228,302.091402,2.137312,9.499082,0.036652,0.05951,0.04045,0.499343,0.193815,9989225.0,564.368449,1.185833,0.301846,293442.2,798.71674,8.855249,0.00539,0.0,0.116809
min,-4.56,1.0,2.5,0.5,0.0,0.0,0.0,0.0,-0.9809,1.0,1.0,0.0,1.25,11.0,8.0,1.0,3e-05,0.0,0.500001
25%,-0.69,162.0,4.12931,13.718129,0.173684,0.207813,0.14403,0.0,0.043612,8542996.0,138.0,3.0,3.73,2457.0,561.0,1.0,0.001783,0.0,0.539872
50%,0.11,327.0,4.290566,17.0,0.193118,0.228989,0.163478,0.0,0.153813,17347320.0,338.0,4.0,3.92,9374.0,1058.0,2.0,0.004942,0.0,0.594784
75%,0.81,525.0,4.475972,20.772727,0.211268,0.253575,0.183761,1.0,0.271583,24423630.0,636.0,5.0,4.11,39957.0,1739.0,4.0,0.010309,0.0,0.687902
max,2.92,3634.0,452.0,637.0,1.0,1.0,1.0,1.0,0.9995,36307240.0,7938.0,5.0,4.91,4899965.0,3883.0,317.0,0.02,0.0,1.0


In [7]:
# summary statistics for the rows labelled as true positives
dat.loc[dat["error"].eq("TP")].describe()

Unnamed: 0,rating_diff,num_words,avg_word_len,avg_sent_len,pct_verbs,pct_nouns,pct_adj,quote,sentiment,book_id,user_reviews_y,user_rating_y,avg_rating,ratings_count,days_since_review_y,review_likes,like_share,popular,predictions
count,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0,34939.0
mean,-0.078254,405.555397,4.32469,17.801026,0.190426,0.235433,0.165279,0.48433,0.158231,17263090.0,694.44343,3.818856,3.89711,10310.54,1188.701823,26.645811,0.125533,1.0,0.688236
std,1.1749,318.958835,0.28932,7.905548,0.03259,0.044196,0.036566,0.499762,0.187244,10758350.0,638.935198,1.223687,0.302119,53768.42,848.572555,53.948399,0.170903,0.0,0.135583
min,-4.75,1.0,2.25,0.5,0.0,0.0,0.0,0.0,-0.9966,1.0,1.0,0.0,1.25,11.0,7.0,2.0,0.020013,1.0,0.500007
25%,-0.68,190.0,4.140984,13.571429,0.172881,0.210232,0.144848,0.0,0.041806,8176075.0,257.0,3.0,3.72,581.0,495.0,5.0,0.032258,1.0,0.570976
50%,0.12,340.0,4.302789,16.869565,0.191429,0.231263,0.163462,0.0,0.148593,18169280.0,513.0,4.0,3.92,1886.0,999.0,11.0,0.057851,1.0,0.664197
75%,0.83,536.0,4.4863,20.72318,0.209113,0.254723,0.183007,1.0,0.268092,25817030.0,920.0,5.0,4.11,6165.0,1727.0,29.0,0.131649,1.0,0.787656
max,2.7,3524.0,10.5,460.5,0.666667,1.0,1.0,1.0,0.9948,36439110.0,7938.0,5.0,5.0,2078406.0,3882.0,3394.0,1.0,1.0,0.99999


In [8]:
# summary statistics for the rows labelled as true negatives
dat.loc[dat["error"].eq("TN")].describe()

Unnamed: 0,rating_diff,num_words,avg_word_len,avg_sent_len,pct_verbs,pct_nouns,pct_adj,quote,sentiment,book_id,user_reviews_y,user_rating_y,avg_rating,ratings_count,days_since_review_y,review_likes,like_share,popular,predictions
count,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0,175840.0
mean,-0.057348,108.215343,4.213158,14.395314,0.201256,0.227458,0.185647,0.154322,0.231574,13863990.0,91.929726,3.989018,4.046367,173395.2,1246.874852,2.508587,0.004352,0.0,0.368765
std,1.182021,103.928522,0.445662,7.239941,0.060151,0.064704,0.075612,0.361258,0.255378,10230250.0,107.871652,1.218944,0.259406,492648.6,849.582512,5.158972,0.004841,0.0,0.069792
min,-5.0,1.0,1.0,0.5,0.0,0.0,0.0,0.0,-0.9882,1.0,1.0,0.0,2.46,3.0,0.0,1.0,3e-05,0.0,0.112211
25%,-0.53,33.0,3.963636,10.0,0.170588,0.193857,0.145161,0.0,0.066729,3407877.0,16.0,3.0,3.88,4953.0,575.0,1.0,0.000617,0.0,0.31774
50%,0.17,73.0,4.18797,13.6,0.2,0.224138,0.177083,0.0,0.228454,13638570.0,51.0,4.0,4.06,23870.0,1061.0,1.0,0.002227,0.0,0.368828
75%,0.82,151.0,4.431373,17.75,0.229008,0.257143,0.214286,0.0,0.393789,22504500.0,128.0,5.0,4.23,110050.0,1775.0,2.0,0.006711,0.0,0.423173
max,2.19,818.0,12.0,208.0,1.0,1.0,1.0,1.0,0.998,36413250.0,952.0,5.0,5.0,4899965.0,4951.0,483.0,0.02,0.0,0.5


In [9]:
# load predictions from tfidf random forest
# NOTE: this rebinds `predictions`, replacing the logistic-regression
# probabilities computed earlier in the notebook.
# SECURITY: unpickling executes arbitrary code from the file; only load
# pickles that were produced by a trusted source.
rf_predictions_path = "data/rf_predictions.pkl"
with open(rf_predictions_path, "rb") as fp:
    predictions = pickle.load(fp)

In [10]:
# merge test dataset with full set of features and predictions
raw = pd.read_csv("data/filtered_reviews.csv")
dat = (
    X_test
    .merge(raw, left_index=True, right_index=True)
    # columns present in both frames come back suffixed; drop the X_test
    # ("_x") copies of these three and keep the ones from raw ("_y")
    .drop(columns=["user_reviews_x","days_since_review_x","user_rating_x"])
    # NOTE(review): plain column assignment - a Series aligns on the index,
    # an array is placed positionally; confirm which the pickle holds and
    # that it lines up with the merged rows
    .assign(predictions=predictions)
)

In [11]:
# error classification: label each row with its confusion-matrix cell,
# treating a predicted probability >= 0.5 as a "popular" prediction
actual_neg = dat["popular"]==0
actual_pos = dat["popular"]==1
pred_neg = dat["predictions"]<0.5
pred_pos = dat["predictions"]>=0.5

conditions = [
    actual_neg & pred_neg,
    actual_neg & pred_pos,
    actual_pos & pred_pos,
    actual_pos & pred_neg
]
values = ["TN","FP","TP","FN"]

# np.select's implicit default is the integer 0, which has no common dtype
# with the string labels: newer NumPy releases raise a TypeError, older ones
# label unmatched rows "0". Use an explicit string for rows that match no
# condition (e.g. a missing prediction).
dat["error"] = np.select(conditions, values, default="unclassified")

In [12]:
# is prediction probability correlated with like share
# (2x2 correlation matrix of the two columns)
dat.loc[:, ["like_share", "predictions"]].corr()

Unnamed: 0,like_share,predictions
like_share,1.0,0.333259
predictions,0.333259,1.0


In [13]:
# summary statistics for the rows labelled as false positives
dat.loc[dat["error"].eq("FP")].describe()

Unnamed: 0,rating_diff,num_words,avg_word_len,avg_sent_len,pct_verbs,pct_nouns,pct_adj,quote,sentiment,book_id,user_reviews_y,user_rating_y,avg_rating,ratings_count,days_since_review_y,review_likes,like_share,popular,predictions
count,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0,67596.0
mean,-0.089392,290.684612,4.296775,17.059276,0.192704,0.234134,0.169701,0.365184,0.180965,16398280.0,441.398174,3.856959,3.946351,58681.99,1216.049574,3.7683,0.007144,0.0,0.648213
std,1.286425,258.87773,0.328094,8.361619,0.038748,0.047273,0.047324,0.481485,0.206732,10676990.0,520.000652,1.312382,0.284872,232633.2,871.657022,7.563083,0.005367,0.0,0.105293
min,-5.0,1.0,1.0,0.5,0.0,0.0,0.0,0.0,-0.9771,1.0,1.0,0.0,1.25,3.0,0.0,1.0,3.1e-05,0.0,0.500002
25%,-0.68,115.0,4.10171,12.833333,0.172043,0.207965,0.144219,0.0,0.050414,7514925.0,133.0,3.0,3.77,1953.75,496.0,1.0,0.002443,0.0,0.559228
50%,0.17,226.0,4.28022,16.173913,0.192953,0.230769,0.165714,0.0,0.171967,17568800.0,287.0,4.0,3.96,7209.0,1029.0,2.0,0.005952,0.0,0.630509
75%,0.88,391.0,4.474467,20.076923,0.21327,0.25641,0.18913,1.0,0.304285,25372150.0,548.0,5.0,4.14,27824.0,1770.0,4.0,0.011173,0.0,0.720883
max,2.92,3580.0,12.0,637.0,1.0,1.0,1.0,1.0,0.9995,36340570.0,7938.0,5.0,5.0,4899965.0,3988.0,303.0,0.02,0.0,0.990272


In [14]:
# summary statistics for the rows labelled as true positives
dat.loc[dat["error"].eq("TP")].describe()

Unnamed: 0,rating_diff,num_words,avg_word_len,avg_sent_len,pct_verbs,pct_nouns,pct_adj,quote,sentiment,book_id,user_reviews_y,user_rating_y,avg_rating,ratings_count,days_since_review_y,review_likes,like_share,popular,predictions
count,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0,42603.0
mean,-0.081062,349.42544,4.30935,17.216144,0.191799,0.234951,0.168193,0.416074,0.172536,17290580.0,597.669296,3.854799,3.935861,9916.865,1202.656902,24.582236,0.118435,1.0,0.727916
std,1.251953,300.405685,0.298044,7.51906,0.034661,0.042021,0.041133,0.492912,0.195739,11000610.0,607.492832,1.290726,0.298256,48878.14,888.787799,50.724775,0.162808,0.0,0.128936
min,-5.0,1.0,1.75,1.0,0.0,0.0,0.0,0.0,-0.9966,1.0,1.0,0.0,1.25,3.0,7.0,2.0,0.020013,1.0,0.500005
25%,-0.66,149.0,4.124372,13.0,0.172524,0.21,0.145205,0.0,0.048057,7863218.0,190.0,3.0,3.76,533.0,470.0,4.0,0.031746,1.0,0.619963
50%,0.16,276.0,4.292857,16.36,0.191919,0.231939,0.164875,0.0,0.162037,18222540.0,405.0,4.0,3.95,1833.0,995.0,10.0,0.055503,1.0,0.726802
75%,0.86,463.0,4.480327,20.263932,0.210611,0.256039,0.186518,1.0,0.290107,25958520.0,820.0,5.0,4.14,6174.0,1757.0,26.0,0.122222,1.0,0.834764
max,2.7,3524.0,10.0,460.5,0.666667,1.0,1.0,1.0,0.9948,36439110.0,7938.0,5.0,5.0,2758812.0,3983.0,3394.0,1.0,1.0,0.996426


In [15]:
# summary statistics for the rows labelled as true negatives
dat.loc[dat["error"].eq("TN")].describe()

Unnamed: 0,rating_diff,num_words,avg_word_len,avg_sent_len,pct_verbs,pct_nouns,pct_adj,quote,sentiment,book_id,user_reviews_y,user_rating_y,avg_rating,ratings_count,days_since_review_y,review_likes,like_share,popular,predictions
count,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0,168347.0
mean,-0.052876,132.543776,4.223289,14.595006,0.20102,0.227737,0.184634,0.183894,0.226986,13718710.0,95.346386,3.981847,4.034723,185972.5,1246.114989,2.594926,0.003986,0.0,0.280408
std,1.122693,172.254198,1.340729,7.779111,0.060533,0.068895,0.075548,0.387399,0.255068,9934744.0,161.602942,1.168556,0.271283,510113.4,822.619463,5.758932,0.00465,0.0,0.11719
min,-4.75,1.0,1.0,0.5,0.0,0.0,0.0,0.0,-0.9882,1.0,1.0,0.0,1.28,4.0,0.0,1.0,3e-05,0.0,0.004976
25%,-0.55,32.0,3.97,10.0,0.171429,0.193798,0.144941,0.0,0.062987,4667024.0,15.0,3.0,3.87,5900.0,600.0,1.0,0.000543,0.0,0.184225
50%,0.14,73.0,4.191781,13.8,0.2,0.222453,0.176471,0.0,0.220816,13602430.0,47.0,4.0,4.05,28053.0,1072.0,1.0,0.001908,0.0,0.274947
75%,0.8,160.0,4.428571,18.0,0.228395,0.255814,0.213235,0.0,0.38715,21969790.0,118.0,5.0,4.23,120403.0,1765.0,2.0,0.005952,0.0,0.376723
max,2.87,3634.0,452.0,408.0,1.0,1.0,1.0,1.0,0.9992,36413250.0,4105.0,5.0,4.95,4899965.0,4951.0,483.0,0.02,0.0,0.499998
