In [2]:
!pip install pyarrow pandas duckdb



In [3]:
import pyarrow.parquet as pq
import pandas
import glob
import duckdb
import datetime
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer

In [5]:
%%time
reviews = pandas.read_csv("reviews/reviews.csv.gz", compression='gzip', index_col=0)
ratings = pandas.read_csv("reviews/ratings.csv", index_col=0)
products = pandas.read_csv("reviews/products.csv", index_col=0)
categories = pandas.read_csv("reviews/categories.csv", index_col=0)

reviews_with_ratings = reviews.merge(ratings, on='review_id')
products_with_categories = products.merge(left_on='category_id', right_on='id', right=categories)
reviews_with_products_and_ratings = reviews_with_ratings.merge(products_with_categories, on='product_id')
train = reviews_with_products_and_ratings[reviews_with_products_and_ratings['review_date'] <= '2015-08-31']
train['is_helpful'] = train['helpful_votes'] > 0
median_total_votes_per_category = train.groupby('category_id').agg(median_total_votes=('total_votes', 'median'))
train_with_median_categories = train.merge(median_total_votes_per_category, on='category_id')
train_with_median_categories['above_median_total_votes'] = train_with_median_categories['total_votes'] > train_with_median_categories['median_total_votes']

featurization = ColumnTransformer([('scale', StandardScaler(), ['star_rating']),
                                   ('onehot', OneHotEncoder(), ['category_id']),
                                   ('passthrough', 'passthrough', ['above_median_total_votes'])])

CPU times: user 1.35 s, sys: 82.2 ms, total: 1.43 s
Wall time: 1.46 s


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [6]:
# Fit the ColumnTransformer on the training frame and build the feature matrix;
# the label is the boolean `is_helpful` column of the same frame.
train_featurized = featurization.fit_transform(train_with_median_categories)
train_labels = train_with_median_categories['is_helpful']

# You can ignore this part
# Sanity check only: the tree is scored on the same data it was fitted on, so
# the printed number is training accuracy, not a generalization estimate.
# random_state is pinned so the fitted tree is reproducible across runs.
model = DecisionTreeClassifier(random_state=0).fit(train_featurized, train_labels)
print(model.score(train_featurized, train_labels))
train_featurized

0.889076672300169


array([[-1.08386907,  1.        ,  0.        ,  1.        ],
       [ 0.8037779 ,  1.        ,  0.        ,  0.        ],
       [ 0.8037779 ,  1.        ,  0.        ,  0.        ],
       ...,
       [-1.08386907,  0.        ,  1.        ,  1.        ],
       [ 0.17456225,  0.        ,  1.        ,  1.        ],
       [-1.08386907,  0.        ,  1.        ,  1.        ]])