# Random Acts of Pizza
# Group: Tian Zhu, Yucheng Liu

In [126]:
# Data and Data Structures
import json
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from nltk import sent_tokenize
import nltk
nltk.download('punkt')
from copy import deepcopy
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from datetime import datetime

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load data

In [115]:
# Fixed seed so the train/dev split below is repeatable
SEED = 0

# Labelled requests (train.json) and the sample set (test.json)
dataset = pd.read_json("train.json")
sample_data = pd.read_json("test.json")
print("Dataset Shape:", dataset.shape)
print("Sample Shape:", sample_data.shape)

# Hold out 20% of the labelled requests as a dev set
train, test = train_test_split(dataset, random_state=SEED, test_size=0.2)

Dataset Shape: (4040, 32)
Sample Shape: (1631, 17)


# Original data without balancing for labels

In [129]:
# Unbalanced split, useful for analysis.
# Features are the columns that also exist in sample_data; every other column
# of the labelled data is kept separately as label/outcome information.
feature_columns = train.columns.intersection(sample_data.columns)
outcome_columns = train.columns.difference(sample_data.columns)

# train and test come from the same frame, so they share the same columns
train_data, test_data = train[feature_columns], test[feature_columns]
train_labels, test_labels = train[['requester_received_pizza']], test[['requester_received_pizza']]
train_all_labels, test_all_labels = train[outcome_columns], test[outcome_columns]

print("Train data shape:",train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (3232, 17)
Test data shape: (808, 17)


# Data balanced for labels

In [65]:
# Try label-balanced training data instead: oversample the positive class so
# both classes have the same number of training rows.
# NOTE: this cell rebinds train_data / train_labels / train_all_labels (and the
# test_* frames) that the unbalanced cell also defines, so whichever of the two
# cells ran last decides which data every later cell sees.
train_positive = train[train['requester_received_pizza']==1]
train_negative = train[train['requester_received_pizza']==0]

# Ratio of negatives to positives in the unbalanced training split
true_weight_ratio = len(train_negative) / len(train_positive)

# Keep every original positive row and add only as many resampled positives as
# are needed to match the negative count. Resampling the whole positive set
# with replacement would leave some original positive rows out of the result.
n_extra_positives = max(len(train_negative) - len(train_positive), 0)
train_positive_extra = train_positive.sample(n=n_extra_positives, replace=True, random_state=SEED)
train_positive_balanced = pd.concat([train_positive, train_positive_extra])
train_balanced = pd.concat([train_positive_balanced, train_negative])

train_data = train_balanced[train_balanced.columns.intersection(sample_data.columns)]
train_labels = train_balanced[['requester_received_pizza']]
train_all_labels = train_balanced[train_balanced.columns.difference(sample_data.columns)]
# The dev set is left untouched: only the training rows are rebalanced
test_data = test[test.columns.intersection(sample_data.columns)]
test_labels = test[['requester_received_pizza']]
test_all_labels = test[test.columns.difference(sample_data.columns)]
print("Train data shape:",train_data.shape)
print("Test data shape:", test_data.shape)

Train data shape: (4888, 17)
Test data shape: (808, 17)


# Data Exploration

In [95]:
# simple summary of the train data
# (the datetime_is_numeric argument was removed in pandas 2.0, where passing it
# raises a TypeError, so it is not passed here)
train_data_summary = train_data.describe(include='all')
train_data_summary.transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
giver_username_if_known,3232.0,198.0,,3003.0,,,,,,,
request_id,3232.0,3232.0,t3_j7j1a,1.0,,,,,,,
request_text_edit_aware,3232.0,3143.0,,85.0,,,,,,,
request_title,3232.0,3220.0,Request,4.0,,,,,,,
requester_account_age_in_days_at_request,3232.0,,,,252.482858,303.259236,0.0,3.061678,155.608993,384.909401,2809.750787
requester_days_since_first_post_on_raop_at_request,3232.0,,,,15.988417,69.367534,0.0,0.0,0.0,0.0,785.457685
requester_number_of_comments_at_request,3232.0,,,,113.702042,192.466599,0.0,0.0,23.0,138.0,992.0
requester_number_of_comments_in_raop_at_request,3232.0,,,,0.647277,3.571609,0.0,0.0,0.0,0.0,88.0
requester_number_of_posts_at_request,3232.0,,,,21.643564,51.04076,0.0,0.0,4.0,22.0,867.0
requester_number_of_posts_on_raop_at_request,3232.0,,,,0.062809,0.329246,0.0,0.0,0.0,0.0,5.0


In [96]:
# simple summary of the dev data
# (the datetime_is_numeric argument was removed in pandas 2.0, where passing it
# raises a TypeError, so it is not passed here)
test_data_summary = test_data.describe(include='all')
test_data_summary.transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
giver_username_if_known,808.0,58.0,,750.0,,,,,,,
request_id,808.0,808.0,t3_1kwu2y,1.0,,,,,,,
request_text_edit_aware,808.0,790.0,,19.0,,,,,,,
request_title,808.0,807.0,[REQUEST],2.0,,,,,,,
requester_account_age_in_days_at_request,808.0,,,,263.001466,303.38354,0.0,4.243958,169.467633,418.139042,2521.263206
requester_days_since_first_post_on_raop_at_request,808.0,,,,18.131502,75.59019,0.0,0.0,0.0,0.0,711.307627
requester_number_of_comments_at_request,808.0,,,,120.683168,196.713105,0.0,0.0,27.0,149.5,994.0
requester_number_of_comments_in_raop_at_request,808.0,,,,0.636139,2.693566,0.0,0.0,0.0,0.0,47.0
requester_number_of_posts_at_request,808.0,,,,21.433168,50.339029,0.0,0.0,5.0,23.0,824.0
requester_number_of_posts_on_raop_at_request,808.0,,,,0.066832,0.311671,0.0,0.0,0.0,0.0,3.0


In [16]:
# class distribution of the training label
# (the datetime_is_numeric argument was removed in pandas 2.0, where passing it
# raises a TypeError, so it is not passed here)
train_labels_summary = train_labels.describe(include='all')
train_labels_summary.transpose()

Unnamed: 0,count,unique,top,freq
requester_received_pizza,3232,2,False,2444


In [17]:
# summary of the columns that exist only in the labelled data
# (the datetime_is_numeric argument was removed in pandas 2.0, where passing it
# raises a TypeError, so it is not passed here)
train_all_labels_summary = train_all_labels.describe(include='all')
train_all_labels_summary.transpose()

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
number_of_downvotes_of_request_at_retrieval,3232.0,,,,2.428527,3.008884,0.0,1.0,2.0,3.0,47.0
number_of_upvotes_of_request_at_retrieval,3232.0,,,,6.063738,10.198113,0.0,2.0,4.0,7.0,345.0
post_was_edited,3232.0,,,,105532862.834158,363764820.00859,0.0,0.0,0.0,0.0,1380909467.0
request_number_of_comments_at_retrieval,3232.0,,,,2.826114,4.700115,0.0,0.0,1.0,4.0,61.0
request_text,3232.0,3147.0,,85.0,,,,,,,
requester_account_age_in_days_at_retrieval,3232.0,,,,753.759653,332.67959,45.291562,518.132847,747.361644,898.264682,2879.276319
requester_days_since_first_post_on_raop_at_retrieval,3232.0,,,,516.696072,267.304158,0.0,276.471039,519.252373,774.833686,927.447442
requester_number_of_comments_at_retrieval,3232.0,,,,283.641089,354.198451,0.0,7.0,109.0,460.25,1000.0
requester_number_of_comments_in_raop_at_retrieval,3232.0,,,,2.589109,6.59052,0.0,0.0,1.0,3.0,139.0
requester_number_of_posts_at_retrieval,3232.0,,,,41.292079,83.375477,0.0,2.0,12.0,45.0,999.0


In [8]:
# check correlation
# Select the numeric/bool columns explicitly: train_data also holds text
# columns, and since pandas 2.0 DataFrame.corr no longer drops non-numeric
# columns by default.
train_data.select_dtypes(include=['number', 'bool']).corr()

Unnamed: 0,requester_account_age_in_days_at_request,requester_days_since_first_post_on_raop_at_request,requester_number_of_comments_at_request,requester_number_of_comments_in_raop_at_request,requester_number_of_posts_at_request,requester_number_of_posts_on_raop_at_request,requester_number_of_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_plus_downvotes_at_request,unix_timestamp_of_request,unix_timestamp_of_request_utc
requester_account_age_in_days_at_request,1.0,0.231996,0.479725,0.063264,0.421913,0.049561,0.588291,0.2604,0.110017,0.330777,0.33077
requester_days_since_first_post_on_raop_at_request,0.231996,1.0,0.214223,0.384917,0.098793,0.318089,0.197523,0.120905,0.08771,0.170589,0.170588
requester_number_of_comments_at_request,0.479725,0.214223,1.0,0.127257,0.389384,0.088586,0.748661,0.369581,0.150042,0.29845,0.298444
requester_number_of_comments_in_raop_at_request,0.063264,0.384917,0.127257,1.0,0.021976,0.507074,0.081096,0.139785,0.141007,0.050708,0.050709
requester_number_of_posts_at_request,0.421913,0.098793,0.389384,0.021976,1.0,0.058758,0.606668,0.564952,0.380378,0.125631,0.125631
requester_number_of_posts_on_raop_at_request,0.049561,0.318089,0.088586,0.507074,0.058758,1.0,0.068938,0.145344,0.146675,0.110651,0.110647
requester_number_of_subreddits_at_request,0.588291,0.197523,0.748661,0.081096,0.606668,0.068938,1.0,0.393291,0.177837,0.330176,0.330172
requester_upvotes_minus_downvotes_at_request,0.2604,0.120905,0.369581,0.139785,0.564952,0.145344,0.393291,1.0,0.924393,0.166752,0.166752
requester_upvotes_plus_downvotes_at_request,0.110017,0.08771,0.150042,0.141007,0.380378,0.146675,0.177837,0.924393,1.0,0.087588,0.087589
unix_timestamp_of_request,0.330777,0.170589,0.29845,0.050708,0.125631,0.110651,0.330176,0.166752,0.087588,1.0,1.0


# Data Transformation

In [130]:
# Separate the categorical/numerical fields from the identifier and free-text
# columns, which are handled by the text model instead.
columns_to_drop = ['giver_username_if_known', 'request_id', 'request_text_edit_aware', 'request_title', 'requester_subreddits_at_request', 'requester_username']
train_data_regression, test_data_regression = (
    frame.drop(columns=columns_to_drop) for frame in (train_data, test_data)
)

In [131]:
# convert time stamp to capture seasonality and time variant
def add_request_time_features(frame, timestamp_column='unix_timestamp_of_request_utc'):
    """Return a copy of ``frame`` with calendar features derived from a unix timestamp.

    The timestamp is interpreted as seconds since the epoch and decoded in UTC,
    so the derived features do not depend on the timezone of the machine that
    runs the notebook (``datetime.fromtimestamp`` would use local time).

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame containing ``timestamp_column``.
    timestamp_column : str
        Name of the column holding unix timestamps in seconds.

    Returns
    -------
    pandas.DataFrame
        New frame with ``year_of_request``, ``month_of_request``,
        ``day_of_request``, ``hour_of_request`` and ``week_day_of_request``
        (Monday=0) added; ``frame`` itself is not modified.
    """
    request_time = pd.to_datetime(frame[timestamp_column], unit='s', utc=True)
    return frame.assign(
        year_of_request=request_time.dt.year,
        month_of_request=request_time.dt.month,
        day_of_request=request_time.dt.day,
        hour_of_request=request_time.dt.hour,
        week_day_of_request=request_time.dt.weekday,
    )


train_data_regression = add_request_time_features(train_data_regression)
test_data_regression = add_request_time_features(test_data_regression)

# drop the timestamp
columns_to_drop = ['unix_timestamp_of_request_utc', 'unix_timestamp_of_request']
train_data_regression = train_data_regression.drop(columns_to_drop, axis=1)
test_data_regression = test_data_regression.drop(columns_to_drop, axis=1)

# Model based on numerical/categorical only

## KNN

In [140]:
# Tune the number of neighbours with 5-fold cross-validation on macro F1.
# NOTE(review): the features are not scaled here, so columns with large
# magnitudes presumably dominate the distance - consider standardising.
model_knn = KNeighborsClassifier(algorithm='brute')
parameters = {'n_neighbors':[3, 5, 7, 9, 11, 13, 15, 17]}
clf = GridSearchCV(
    estimator=model_knn,
    param_grid=parameters, cv=5, scoring='f1_macro')
clf.fit(train_data_regression, train_labels['requester_received_pizza'])
print("Best k = ", clf.best_params_)

# GridSearchCV refits the best configuration on the full training data by
# default, so reuse that estimator: it keeps exactly the settings that were
# tuned (including algorithm='brute') and avoids a second fit.
model_knn = clf.best_estimator_
# predict on the dev data
model_knn_prediction = model_knn.predict(test_data_regression)
print(classification_report(y_true=test_labels['requester_received_pizza'], y_pred=model_knn_prediction))

Best k =  {'n_neighbors': 3}
              precision    recall  f1-score   support

       False       0.76      0.87      0.81       602
        True       0.35      0.20      0.26       206

    accuracy                           0.70       808
   macro avg       0.56      0.54      0.54       808
weighted avg       0.66      0.70      0.67       808



## Logistic Regression

In [141]:
# Tune the inverse regularisation strength C with 5-fold cross-validation on
# macro F1, over 20 log-spaced values between 1e-4 and 1e4.
model_regression = LogisticRegression(solver="liblinear")
Cs = list(np.logspace(-4, 4, 20))
parameters = {'C':Cs}
clf = GridSearchCV(
    estimator=model_regression,
    param_grid=parameters, cv=5, scoring='f1_macro')
clf.fit(train_data_regression, train_labels['requester_received_pizza'])
print("Best C = ", clf.best_params_)

# GridSearchCV already refits the best configuration on the full training data
# (refit=True is the default), so reuse it instead of building and fitting an
# identical model a second time.
model_regression = clf.best_estimator_
# predict on the dev data
model_regression_prediction = model_regression.predict(test_data_regression)
print(classification_report(y_true=test_labels['requester_received_pizza'], y_pred=model_regression_prediction))

Best C =  {'C': 4.281332398719396}
              precision    recall  f1-score   support

       False       0.75      0.99      0.85       602
        True       0.60      0.03      0.06       206

    accuracy                           0.75       808
   macro avg       0.67      0.51      0.45       808
weighted avg       0.71      0.75      0.65       808



## Random Forest

In [139]:
# Random Forest
# Seed the forest so the grid search and the reported scores are repeatable;
# without a random_state every run grows different trees.
model_rf = RandomForestClassifier(random_state=SEED)
n_estimators = list(range(10,101,10))
max_features = list(range(6,len(train_data_regression.columns),2))
parameters = {'n_estimators':n_estimators, 'max_features':max_features}
clf = GridSearchCV(
    estimator=model_rf,
    param_grid=parameters, cv=5, scoring='f1_macro')
clf.fit(train_data_regression, train_labels['requester_received_pizza'])
print("Best Params = ", clf.best_params_)

# GridSearchCV already refits the best configuration on the full training data
# (refit=True is the default), so reuse that seeded estimator.
model_rf = clf.best_estimator_
# predict on the dev data
model_rf_prediction = model_rf.predict(test_data_regression)
print(classification_report(y_true=test_labels['requester_received_pizza'], y_pred=model_rf_prediction))

Best Params =  {'max_features': 12, 'n_estimators': 100}
              precision    recall  f1-score   support

       False       0.76      0.96      0.85       602
        True       0.50      0.12      0.20       206

    accuracy                           0.75       808
   macro avg       0.63      0.54      0.52       808
weighted avg       0.69      0.75      0.68       808



# Machine Learning with XLNet

In [156]:
# An arbitrary set of hyperparameters.
def get_args(**overrides):
    """Return a dictionary that can be passed into the model training function.

    Called without arguments it returns the notebook's default settings.
    Keyword arguments replace (or add) individual entries, e.g.
    ``get_args(num_train_epochs=3)``, so experiments do not need to edit the
    defaults in place.

    Parameters
    ----------
    **overrides
        Setting names mapped to the values that should replace the defaults.

    Returns
    -------
    dict
        A new dictionary on every call; mutating it does not affect later calls.
    """
    args = {
        "output_dir": "matching_model_class_weight_outputs/",
        "cache_dir": "matching_model_class_weight_cache_dir/",

        "fp16": True,
        "fp16_opt_level": "O1",
        "max_seq_length": 256,
        "train_batch_size": 16,
        "gradient_accumulation_steps": 1,
        "eval_batch_size": 16,
        "num_train_epochs": 1,
        "weight_decay": 0,
        # NOTE(review): 8e-4 looks high for fine-tuning a pretrained
        # transformer, and the recorded training loss (~0.698) is close to
        # ln 2 - confirm this value is intended.
        "learning_rate": 8.0e-4,
        "adam_epsilon": 1e-8,
        "warmup_ratio": 0.06,
        "warmup_steps": 0,
        "max_grad_norm": 1.0,

        "logging_steps": 50,
        "save_steps": 2000,

        "overwrite_output_dir": True,
        "reprocess_input_data": False,
        "evaluate_during_training": False,

        "process_count": 1,
        "n_gpu": 1
    }
    # Caller-supplied values win over the defaults above
    args.update(overrides)
    return args

In [144]:
# Settings dictionary passed as `args` when the classification models are created
training_args = get_args()

In [147]:
# Build the (text, labels) frames for the text model: the text is the request
# title followed by the request body, the label is the pizza outcome as 0/1.
train_text = train_data['request_title'] + " " + train_data['request_text_edit_aware']
df_train = train_text.to_frame(name='text')
df_train['labels'] = train_labels['requester_received_pizza'].astype(int)

test_text = test_data['request_title'] + " " + test_data['request_text_edit_aware']
df_test = test_text.to_frame(name='text')
df_test['labels'] = test_labels['requester_received_pizza'].astype(int)

In [148]:
# Split every request into sentences; each sentence becomes its own row that
# carries the label of the request it came from, then the rows are shuffled.
sentence_lists = df_train['text'].apply(sent_tokenize)
df_train_sent = (
    df_train
    .assign(text=sentence_lists)
    .explode('text')
    .sample(frac=1, random_state=9999)
)

In [54]:
# Look at a few request titles in full, with pandas' row/column/width limits lifted
unlimited_display = (
    'display.max_rows', None,
    'display.max_columns', None,
    'display.max_colwidth', None,
)
with pd.option_context(*unlimited_display):
    display(train_data['request_title'].head())

809                                                                                                                [Request AND offer!]
2636                                                                                                  [Request] Pizza for me and my boy
2003    [Request] Full-time student dad with 3 kids. Finals are next week, so I'm finishing projects today. Please help? (Portland, OR)
3854                          [Request] My dog of 18 years passed away last night, I really don't feel like cooking or doing much today
2023                                                                                  [Request] Pizza for my friend just out of surgery
Name: request_title, dtype: object

In [157]:
# Start from the pretrained xlnet-base-cased weights (transfer learning)
matching_model = ClassificationModel(
    model_type='xlnet',
    model_name='xlnet-base-cased',
    args=training_args,
    use_cuda=True,
)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [145]:
# Optional: resume from a previously saved model instead of the fresh
# xlnet-base-cased model created in the cell above.
# Disabled by default, because run top-to-bottom this cell would otherwise
# replace the fresh model with a checkpoint directory that presumably only
# exists after a previous training run has written it.
LOAD_PRETRAINED_CHECKPOINT = False
if LOAD_PRETRAINED_CHECKPOINT:
    matching_model = ClassificationModel('xlnet', model_name='matching_model_class_weight_outputs', args=training_args, use_cuda=True)

In [158]:
# Release cached GPU memory, then fine-tune on a reshuffled copy of the
# sentence-level training frame
torch.cuda.empty_cache()
shuffled_sentences = df_train_sent.sample(frac=1, random_state=9999)
matching_model.train_model(shuffled_sentences)

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=1.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 1', max=1840.0, style=ProgressStyle(de…

  model.parameters(), args.max_grad_norm






(1840, 0.6983364602793818)

In [161]:
# evaluate on dev set
# df_test holds whole requests (title + body), not the sentence-level rows used for training
result, model_outputs, wrong_predictions = matching_model.eval_model(df_test)

HBox(children=(FloatProgress(value=0.0, description='Running Evaluation', max=51.0, style=ProgressStyle(descri…




In [162]:

# Show the evaluation metrics returned by eval_model in the previous cell
result

{'mcc': 0.11606809287286371,
 'tp': 178,
 'tn': 148,
 'fp': 454,
 'fn': 28,
 'auroc': 0.5907855691384705,
 'auprc': 0.3080703756009108,
 'eval_loss': 0.7444087009803921}

In [159]:
# Predict a label for every dev-set request text; the raw model outputs are kept alongside
predictions, raw_outputs = matching_model.predict(df_test['text'].values)

HBox(children=(FloatProgress(value=0.0, max=51.0), HTML(value='')))




In [160]:
# Per-class precision/recall/F1 of the text model on the dev set, for comparison with the models above
print(classification_report(y_true=test_labels['requester_received_pizza'], y_pred=predictions))

              precision    recall  f1-score   support

       False       0.84      0.25      0.38       602
        True       0.28      0.86      0.42       206

    accuracy                           0.40       808
   macro avg       0.56      0.55      0.40       808
weighted avg       0.70      0.40      0.39       808

