In [16]:
# IMPORTING PACKAGES
import pandas as pd
import re
import seaborn as sns
import altair as alt
import string
import matplotlib.pyplot as plt
import nltk

from nltk.corpus import stopwords
from porter2stemmer import Porter2Stemmer
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/amyhuynh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Default size (width, height) applied to every matplotlib figure created below.
plt.rcParams.update({"figure.figsize": (10, 8)})

In [14]:
# DISPLAY SETTINGS
# Widen the text output and lift the limits on column count and cell width so
# long request texts are shown in full.
# `None` is the supported "no limit" value for display.max_colwidth; the old
# `-1` spelling is deprecated (it emits a FutureWarning) and is rejected by
# newer pandas releases.
pd.set_option('display.width', 150)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

  This is separate from the ipykernel package so we can avoid doing imports until


In [4]:
# Load the raw records; the path is resolved relative to the working directory.
TRAIN_JSON = "train.json"
df = pd.read_json(TRAIN_JSON)

In [5]:
# Percentage of non-null values in each column.
df.notna().mean().mul(100)

giver_username_if_known                                 100.00000
number_of_downvotes_of_request_at_retrieval             100.00000
number_of_upvotes_of_request_at_retrieval               100.00000
post_was_edited                                         100.00000
request_id                                              100.00000
request_number_of_comments_at_retrieval                 100.00000
request_text                                            100.00000
request_text_edit_aware                                 100.00000
request_title                                           100.00000
requester_account_age_in_days_at_request                100.00000
requester_account_age_in_days_at_retrieval              100.00000
requester_days_since_first_post_on_raop_at_request      100.00000
requester_days_since_first_post_on_raop_at_retrieval    100.00000
requester_number_of_comments_at_request                 100.00000
requester_number_of_comments_at_retrieval               100.00000
requester_

In [6]:
punct_list = [x for x in string.punctuation]
stop = list(set(stopwords.words('english')))
stemmer = Porter2Stemmer()

# Set views of the two lists above: membership tests run once per character /
# per word of every request, so constant-time lookups matter here.
_punct_chars = frozenset(punct_list)
_stop_words = frozenset(stop)

# Emoticon patterns, compiled once and written as raw strings (the previous
# non-raw literals relied on invalid escape sequences such as '\:' and '\(').
# NOTE(review): these patterns are not anchored to word boundaries, so they
# also match inside ordinary text (e.g. the "p:" in "http:"). Kept as-is so the
# derived features do not change -- confirm whether that is intended.
_EMOTICON_COLON_FIRST = re.compile(r'\:(?:\(|\)|c|D|C|p|P|o|O|\||\[|\])')   # e.g. ":)" ":D"
_EMOTICON_COLON_LAST = re.compile(r'(?:\(|\)|c|D|C|p|P|o|O|\||\[|\])\:')    # e.g. "(:" "D:"
_MULTI_WHITESPACE = re.compile(r'\s{2,}')
_STRIP_PUNCT = str.maketrans('', '', string.punctuation)


def count_words(text: str) -> int:
    """Return the number of chunks obtained by splitting `text` on single spaces."""
    return len(text.split(" "))


def count_punctuation(text: str) -> int:
    """Return how many characters of `text` are in `string.punctuation`."""
    return sum(ch in _punct_chars for ch in text)


def count_emoticons(text: str) -> int:
    """Return the number of emoticon matches in `text`.

    Colon-first and colon-last forms are counted independently and summed.
    """
    return (len(_EMOTICON_COLON_FIRST.findall(text))
            + len(_EMOTICON_COLON_LAST.findall(text)))


def clean_text(text: str, remove_stopwords: bool) -> str:
    """Normalise a raw text field for modelling.

    Steps, in order: drop emoticons (colon-last form first, then colon-first),
    lower-case, strip punctuation, optionally drop English stop words, collapse
    runs of whitespace to one space, and stem every space-separated token.

    Parameters
    ----------
    text : str
        Raw text to clean.
    remove_stopwords : bool
        When True, tokens found in the NLTK English stop-word list are removed
        before stemming.

    Returns
    -------
    str
        The cleaned, stemmed text with tokens joined by single spaces.
    """
    text = _EMOTICON_COLON_LAST.sub('', text)
    text = _EMOTICON_COLON_FIRST.sub('', text)
    text = text.lower().translate(_STRIP_PUNCT)
    if remove_stopwords:
        # Runs after punctuation stripping, so contractions such as "don't"
        # have already become "dont" by the time they are compared.
        text = " ".join(word for word in text.split(" ") if word not in _stop_words)
    text = _MULTI_WHITESPACE.sub(' ', text)
    return " ".join(stemmer.stem(word) for word in text.split(" "))


### creating needed features
# Same three surface features for every raw text column; the column order
# (len_, num_punct_, num_emoticons_ per source column) is preserved.
for text_col in ("request_text", "request_text_edit_aware", "request_title"):
    df["len_" + text_col] = df[text_col].apply(count_words)
    df["num_punct_" + text_col] = df[text_col].apply(count_punctuation)
    df["num_emoticons_" + text_col] = df[text_col].apply(count_emoticons)

### cleaning text
df["request_text_cleaned"] = df.request_text.apply(lambda x: clean_text(x, remove_stopwords=True))

# Cleaning of the edit-aware text is currently disabled:
# df["request_text_edit_aware_cleaned"] = df.request_text_edit_aware.apply(lambda x: clean_text(x, remove_stopwords=False))

# NOTE(review): titles are cleaned without the stop-word step, unlike
# request_text -- confirm this asymmetry is deliberate.
df["request_title_cleaned"] = df.request_title.apply(lambda x: clean_text(x, remove_stopwords=False))

In [7]:
# Single VADER analyzer instance, reused by the sentiment cells below.
senti_analyzer = SentimentIntensityAnalyzer()

In [8]:
# Score every request body with VADER, then expose the negative / positive /
# neutral components of the score dict as their own columns.
df['request_text_vader_senti'] = df.request_text.apply(senti_analyzer.polarity_scores)
for component in ('neg', 'pos', 'neu'):
    df['request_text_vader_' + component] = df.request_text_vader_senti.apply(
        lambda scores: scores[component]
    )

In [9]:
# Score every request title with VADER, then expose the negative / positive /
# neutral components of the score dict as their own columns.
df['request_title_vader_senti'] = df.request_title.apply(senti_analyzer.polarity_scores)
for component in ('neg', 'pos', 'neu'):
    df['request_title_vader_' + component] = df.request_title_vader_senti.apply(
        lambda scores: scores[component]
    )

In [10]:
# Side-by-side view of the raw text, its cleaned form and the VADER scores.
df.loc[:, ["request_text", "request_text_cleaned", "request_text_vader_senti"]]

Unnamed: 0,request_text,request_text_cleaned,request_text_vader_senti
0,Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated,hi need food 4 children militari famili realli hit hard time exahust mean help abl fe famili make anoth night ask know bless come whatev u find heart give great appreci,"{'neg': 0.022, 'neu': 0.801, 'pos': 0.177, 'compound': 0.9188}"
1,I spent the last money I had on gas today. Im broke until next Thursday :(,spent last money gas today im broke next thursday,"{'neg': 0.289, 'neu': 0.711, 'pos': 0.0, 'compound': -0.6908}"
2,My girlfriend decided it would be a good idea to get off at Perth bus station when she was coming to visit me and has since had to spend all her money on a taxi to get to me here in Dundee. Any chance some kind soul would get us some pizza since we don't have any cash anymore?,girlfriend decid would good idea get perth bus station come visit sinc spend money taxi get dunde chanc kind soul would get us pizza sinc dont cash anymor,"{'neg': 0.0, 'neu': 0.871, 'pos': 0.129, 'compound': 0.8074}"
3,"It's cold, I'n hungry, and to be completely honest I'm broke. My mum said we're having leftovers for dinner. A random pizza arriving would be nice.\n\nEdit: We had leftovers.",cold hungri complet honest im broke mum said leftov dinner random pizza arriv would nice edit leftov,"{'neg': 0.084, 'neu': 0.741, 'pos': 0.175, 'compound': 0.5154}"
4,"hey guys:\n I love this sub. I think it's great. (Except the sob stories. I miss when this place was fun!) Anywho, I've given a pizza out before so thought I would try my luck at getting one. My friend, who lives an hour away and our schedules do not let us see each other too much, decided to come down and visit me for the night! I would love to be able to be a good host and order her a pizza to go with some beer!\n\nAgain, no sob story. Just looking to share a pizza with an old friend :)",hey guy love sub think great except sob stori miss place fun anywho ive given pizza thought would tri luck get one friend live hour away schedul let us see much decid come visit night would love abl good host order pizza go beer again sob stori look share pizza old friend,"{'neg': 0.028, 'neu': 0.695, 'pos': 0.277, 'compound': 0.9865}"
...,...,...,...
4035,"Is anyone out there kind enough to help me out? I started a new job Monday (in this day and age, I consider this a win) but won't get paid until the 15th. Spent all my graduation money on the deposit for my apartment near work. I am hungry and can show my pathetic empty fridge and equally pathetic bank account with $1.97 in it. Just throwing this out there if anyone has the heart to help a dude out. If so, I promise to help out another in need on the 15th when I receive my paycheck. Thanks for reading anyway.",anyon kind enough help start new job monday day age consid win wont get paid 15th spent graduat money deposit apart near work hungri show pathet empti fridg equal pathet bank account 197 throw anyon heart help dude promis help anoth ne 15th receiv paycheck thank read anyway,"{'neg': 0.095, 'neu': 0.703, 'pos': 0.202, 'compound': 0.9161}"
4036,If someone could hook me up with a $15 gift card I would happily hook someone up with pizza tomorrow. Here is the link to the Papa Johns promotion http://www.papajohns.com/touchdown/\n\nThanks Reddit! &lt;3,someon could hook 15 gift card would happili hook someon pizza tomorrow link papa john promot httwwwpapajohnscomtouchdown thank reddit lt3,"{'neg': 0.0, 'neu': 0.756, 'pos': 0.244, 'compound': 0.8655}"
4037,"Have today off, soo I'll be stuck in the house all day cleaning and doing homework that I've put off for a week... Just looking for a pizza pie for lunch! :) \n\nEDIT: Welp, got hungry and finished my homework, so I ordered Jimmy Johns instead, maybe next time! :)",today soo ill stuck hous day clean homework ive put week look pizza pie lunch edit welp got hungri finish homework order jimmi john instead mayb next time,"{'neg': 0.036, 'neu': 0.846, 'pos': 0.118, 'compound': 0.6792}"
4038,"I've never done anything like this before, but I am willing to try this out. I am the proud mother of an awesome toddler whose favorite food is pizza. Today is the first day of our staycation, but I am poor until Friday when I get paid. Hoping to make my little man smile by having some pizza for lunch",ive never done anyth like will tri proud mother awesom toddl whos favorit food pizza today first day staycat poor friday get paid hope make littl man smile pizza lunch,"{'neg': 0.072, 'neu': 0.671, 'pos': 0.257, 'compound': 0.9488}"


In [15]:
# First row only, to inspect every column including the derived ones.
df.iloc[:1]

Unnamed: 0,giver_username_if_known,number_of_downvotes_of_request_at_retrieval,number_of_upvotes_of_request_at_retrieval,post_was_edited,request_id,request_number_of_comments_at_retrieval,request_text,request_text_edit_aware,request_title,requester_account_age_in_days_at_request,requester_account_age_in_days_at_retrieval,requester_days_since_first_post_on_raop_at_request,requester_days_since_first_post_on_raop_at_retrieval,requester_number_of_comments_at_request,requester_number_of_comments_at_retrieval,requester_number_of_comments_in_raop_at_request,requester_number_of_comments_in_raop_at_retrieval,requester_number_of_posts_at_request,requester_number_of_posts_at_retrieval,requester_number_of_posts_on_raop_at_request,requester_number_of_posts_on_raop_at_retrieval,requester_number_of_subreddits_at_request,requester_received_pizza,requester_subreddits_at_request,requester_upvotes_minus_downvotes_at_request,requester_upvotes_minus_downvotes_at_retrieval,requester_upvotes_plus_downvotes_at_request,requester_upvotes_plus_downvotes_at_retrieval,requester_user_flair,requester_username,unix_timestamp_of_request,unix_timestamp_of_request_utc,len_request_text,num_punct_request_text,num_emoticons_request_text,len_request_text_edit_aware,num_punct_request_text_edit_aware,num_emoticons_request_text_edit_aware,len_request_title,num_punct_request_title,num_emoticons_request_title,request_text_cleaned,request_title_cleaned,request_text_vader_senti,request_text_vader_neg,request_text_vader_pos,request_text_vader_neu,request_title_vader_senti,request_title_vader_neg,request_title_vader_pos,request_title_vader_neu
0,,0,1,0,t3_l25d7,0,Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated,Hi I am in need of food for my 4 children we are a military family that has really hit hard times and we have exahusted all means of help just to be able to feed my family and make it through another night is all i ask i know our blessing is coming so whatever u can find in your heart to give is greatly appreciated,Request Colorado Springs Help Us Please,0.0,792.420405,0.0,792.420405,0,0,0,0,0,1,0,1,0,False,[],0,1,0,1,,nickylvst,1317852607,1317849007,67,0,0,67,0,0,6,0,0,hi need food 4 children militari famili realli hit hard time exahust mean help abl fe famili make anoth night ask know bless come whatev u find heart give great appreci,request colorado spring help us pleas,"{'neg': 0.022, 'neu': 0.801, 'pos': 0.177, 'compound': 0.9188}",0.022,0.177,0.801,"{'neg': 0.0, 'neu': 0.444, 'pos': 0.556, 'compound': 0.6124}",0.0,0.556,0.444


In [23]:
# Hold out 25% of the rows; the held-out part is divided again in the next cell.
train, test_big = train_test_split(
    df,
    test_size=0.25,
    random_state=314,
)

In [26]:
# Halve the held-out rows into a dev set and a final test set.
dev, test = train_test_split(
    test_big,
    test_size=0.5,
    random_state=314,
)

In [32]:
# Give each split a fresh 0..n-1 index, discarding the row labels inherited
# from the full frame.
train, test, dev = (
    part.reset_index(drop=True) for part in (train, test, dev)
)

In [34]:
# df.to_csv("train_cleaned.csv", index=False)

# train.to_json('train_data_cleaned.json', orient='records', lines=True)
# test.to_json('test_data_cleaned.json', orient='records', lines=True)
# dev.to_json('dev_data_cleaned.json', orient='records', lines=True)

In [None]:
# Correlation matrix over the numeric and boolean columns only.
# df also holds strings, lists and dicts; calling df.corr() on the whole frame
# relies on pandas silently dropping them, which newer pandas no longer does.
# Selecting the columns explicitly works on every pandas version.
numeric_df = df.select_dtypes(include=["number", "bool"])
corr_heatmap = sns.heatmap(numeric_df.corr(), vmin=-1, vmax=1, annot=False, linewidths=4)
# Trailing semicolon hides the Text(...) repr under the figure.
corr_heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [None]:
# Correlation of every numeric / boolean feature with the target, strongest
# positive correlation first.
# Non-numeric columns (strings, lists, dicts) are excluded explicitly instead
# of relying on df.corr() dropping them, which newer pandas no longer does.
target_corr = (
    df.select_dtypes(include=["number", "bool"])
    .corr()[['requester_received_pizza']]
    .sort_values(by='requester_received_pizza', ascending=False)
)
heatmap = sns.heatmap(target_corr, vmin=-1, vmax=1, annot=True, cmap='BrBG')
heatmap.set_title('Features Correlating with Receiving Pizza', fontdict={'fontsize':18}, pad=16);

In [None]:
# Count of requests per outcome, broken down by the number of RAOP posts.
# The target is turned into a Categorical only to fix the bar order
# (True first, then False). This is done on a plotting copy: overwriting the
# column in `df` would change its dtype for every other cell, including the
# correlation cells above when they are re-run.
pizza_plot_df = df.assign(
    requester_received_pizza=pd.Categorical(df['requester_received_pizza'],
                                            categories=[True, False],
                                            ordered=False)
)

bar1 = sns.countplot(
    data=pizza_plot_df,
    x="requester_received_pizza",
    hue="requester_number_of_posts_on_raop_at_retrieval",
    palette="dark", alpha=.6)

plt.xlabel("requester received pizza")
plt.ylabel("count")
plt.title("Receiving Pizza v.s. Number of posts in RAOP")

# Place the legend outside the axes, to the right of the plot; the trailing
# semicolon hides the Legend repr under the figure.
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.);

In [None]:
# Hand Altair only the two columns the chart encodes. The data passed to
# alt.Chart is serialised with the chart, so passing the whole frame would
# carry every request text plus the list- and dict-valued columns along.
raop_posts_vs_pizza = df[[
    "requester_received_pizza",
    "requester_number_of_posts_on_raop_at_retrieval",
]]

alt.Chart(raop_posts_vs_pizza, title = "Receiving Pizza v.s. Number of posts in RAOP").mark_bar().encode(
    x = alt.X("requester_received_pizza", title = "request received pizza"),
    y = alt.Y("count()", title = "count"),
    color = "requester_number_of_posts_on_raop_at_retrieval",
    tooltip = ["requester_received_pizza", "count()", "requester_number_of_posts_on_raop_at_retrieval"]
    )

In [None]:
# Number of rows in the full frame.
df.shape[0]

In [None]:
# Scratch arithmetic: 80% of 4040.
# NOTE(review): 4040 is presumably the row count reported by len(df) in the
# previous cell -- TODO confirm. The split actually performed above uses
# test_size=0.25 (then halves the hold-out), not an 80/20 split.
4040 * .8