# Getting the New Balance data from the Reddit subreddit r/Newbalance

In [1]:
import time
from pathlib import Path

import pandas as pd
import requests
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer

In [2]:
# Raise the row display limit to 101 so long listings are shown in full instead of truncated.
pd.options.display.max_rows = 101

In [3]:
# Request settings live here so nothing is hardcoded inside the fetch loop.
# The endpoint searches submissions across all of Reddit; the subreddit filter
# is passed separately as a request parameter.
# `before` starts at the first created_utc found in an initial call for submissions.

subreddit = 'Newbalance'
url = 'https://api.pushshift.io/reddit/search/submission'
before = 1632441624

# Collects one DataFrame per fetched page.
df_list = list()

In [4]:
# Page backwards through the subreddit's submissions, up to 10 pages of 100 posts.
# After each page `before` is moved to the created_utc of the last post returned,
# so the next request continues with older posts.
for _ in range(10):
    params = {
        'subreddit': subreddit,
        'size': 100,
        'before': before,
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    res = requests.get(url, params=params, timeout=30)
    # Fail loudly on an HTTP error instead of trying to parse an error body.
    res.raise_for_status()
    data = res.json()

    posts = data.get('data') or []
    if not posts:
        # Nothing older is available; indexing [-1] on an empty page would raise.
        print('no more posts returned; stopping early')
        break

    before = posts[-1]['created_utc']
    print(f'before updated to: {before}')

    post_df = pd.DataFrame(posts)
    df_list.append(post_df)

    # Pause between requests to stay polite to the API.
    time.sleep(3)

# ignore_index=True gives one continuous 0..n-1 index; without it every page
# contributes its own 0..99 labels and the combined frame has duplicate labels.
newbalance_df = pd.concat(df_list, ignore_index=True)

before updated to: 1632017350
before updated to: 1631567058
before updated to: 1631235730
before updated to: 1630866406
before updated to: 1630429936
before updated to: 1629987131
before updated to: 1629480585
before updated to: 1629201691
before updated to: 1628732055
before updated to: 1628362599


### Looking at the data 

In [5]:
# Preview the first rows of the combined submissions frame.
newbalance_df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,removed_by_category,media,media_embed,secure_media,secure_media_embed,author_cakeday,crosspost_parent,crosspost_parent_list,author_flair_background_color,author_flair_text_color
0,[],False,frogmicky,,[],,text,t2_13xjy6,False,False,...,,,,,,,,,,
1,[],False,theillones,,[],,text,t2_7qn37z9o,False,False,...,,,,,,,,,,
2,[],False,theillones,,[],,text,t2_7qn37z9o,False,False,...,,,,,,,,,,
3,[],False,theillones,,[],,text,t2_7qn37z9o,False,False,...,,,,,,,,,,
4,[],False,sjale49,,[],,text,t2_7s0vs32j,False,False,...,,,,,,,,,,


In [6]:
# (rows, columns) of the combined frame.
newbalance_df.shape

(1000, 78)

In [7]:
# List every column name in the combined frame.
newbalance_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_richtext', 'link_flair_text_color', 'link_flair_type',
       'locked', 'media_metadata', 'media_only', 'no_follow', 'num_comments',
       'num_crossposts', 'over_18', 'parent_whitelist_status', 'permalink',
       'pinned', 'pwls', 'retrieved_on', 'score', 'selftext', 'send_replies',
       'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subreddit_type', 'thumbnail', 'title',


In [8]:
# Missing-value count per column, largest first.
(
    newbalance_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

author_flair_css_class           1000
author_flair_text                1000
author_cakeday                    997
author_flair_text_color           995
author_flair_background_color     995
crosspost_parent_list             994
crosspost_parent                  994
media                             993
secure_media_embed                993
secure_media                      993
media_embed                       993
poll_data                         989
removed_by_category               936
gallery_data                      866
media_metadata                    858
is_gallery                        847
preview                           455
post_hint                         455
thumbnail_height                  383
thumbnail_width                   383
url_overridden_by_dest            281
author_flair_richtext               5
author_patreon_flair                5
author_fullname                     5
author_flair_type                   5
author_premium                      5
thumbnail   

In [9]:
# Keep only the columns that have a value in every row.
# how='any' expresses this without a hard-coded row count: a fixed thresh=1000
# would drop every column whenever fewer than 1000 posts were fetched.
# Reassigning (rather than inplace=True) keeps the step explicit.
newbalance_df = newbalance_df.dropna(axis='columns', how='any')

In [10]:
# Re-check the per-column missing-value counts after the incomplete columns were dropped.
(
    newbalance_df
    .isna()
    .sum()
    .sort_values(ascending=False)
)

all_awardings                  0
allow_live_comments            0
num_crossposts                 0
over_18                        0
parent_whitelist_status        0
permalink                      0
pinned                         0
pwls                           0
retrieved_on                   0
score                          0
selftext                       0
send_replies                   0
spoiler                        0
stickied                       0
subreddit                      0
subreddit_id                   0
subreddit_subscribers          0
subreddit_type                 0
thumbnail                      0
title                          0
total_awards_received          0
treatment_tags                 0
upvote_ratio                   0
url                            0
whitelist_status               0
num_comments                   0
no_follow                      0
media_only                     0
is_created_from_ads_ui         0
author                         0
author_is_

In [11]:
# (rows, columns) remaining after the column drop.
newbalance_df.shape

(1000, 52)

### Building the dataset

In [12]:
# The subreddit, selftext, and title columns are the ones of interest here.

newbalance_df.loc[:, ['subreddit', 'selftext', 'title']].head()

Unnamed: 0,subreddit,selftext,title
0,Newbalance,&amp;#x200B;\n\n*Processing img 3fgy0ooh4cp71...*,Ordered a pair of these my first 327's Im in l...
1,Newbalance,,Started grey day ....getting a lil hooked
2,Newbalance,,Started grey day ...getting a lil hooked
3,Newbalance,,Started on Grey Day...getting a lil hooked
4,Newbalance,,When will these drop on the NB website? I have...


In [13]:
# Count how often each title occurs; counts above 1 reveal repeated titles.
newbalance_df['title'].value_counts()

Few recent wears                                                     3
Can someone help me ID the kicks?                                    3
Added the 997.5 to my collection                                     2
Outlet finds                                                         2
New Balance 990v3 in “Made 990 Version Series”                       2
                                                                    ..
Salty Phantoms                                                       1
Going half size down for the 2002r ?                                 1
Can anyone ID these for me?                                          1
What on Gods green earth is going on with this hype and what not?    1
Bodega collab                                                        1
Name: title, Length: 982, dtype: int64

### Tokenize the title variable 

In [14]:
# Tokenizer that extracts runs of word characters, so punctuation is left out.
tokenizer = RegexpTokenizer(pattern=r'\w+')

# Lower-case every title before tokenizing and store the token list per row.
newbalance_df['title_token'] = [
    tokenizer.tokenize(title.lower()) for title in newbalance_df['title']
]

newbalance_df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_is_blocked,awarders,can_mod_post,contest_mode,created_utc,domain,full_link,...,subreddit_type,thumbnail,title,total_awards_received,treatment_tags,upvote_ratio,url,whitelist_status,wls,title_token
0,[],False,frogmicky,False,[],False,False,1632438577,self.Newbalance,https://www.reddit.com/r/Newbalance/comments/p...,...,public,self,Ordered a pair of these my first 327's Im in l...,0,[],1.0,https://www.reddit.com/r/Newbalance/comments/p...,all_ads,6,"[ordered, a, pair, of, these, my, first, 327, ..."
1,[],False,theillones,False,[],False,False,1632437702,reddit.com,https://www.reddit.com/r/Newbalance/comments/p...,...,public,default,Started grey day ....getting a lil hooked,0,[],1.0,https://www.reddit.com/gallery/pu6sqk,all_ads,6,"[started, grey, day, getting, a, lil, hooked]"
2,[],False,theillones,False,[],False,False,1632437590,reddit.com,https://www.reddit.com/r/Newbalance/comments/p...,...,public,default,Started grey day ...getting a lil hooked,0,[],1.0,https://www.reddit.com/gallery/pu6rko,all_ads,6,"[started, grey, day, getting, a, lil, hooked]"
3,[],False,theillones,False,[],False,False,1632431098,reddit.com,https://www.reddit.com/r/Newbalance/comments/p...,...,public,default,Started on Grey Day...getting a lil hooked,0,[],1.0,https://www.reddit.com/gallery/pu4p8n,all_ads,6,"[started, on, grey, day, getting, a, lil, hooked]"
4,[],False,sjale49,False,[],False,False,1632430232,i.redd.it,https://www.reddit.com/r/Newbalance/comments/p...,...,public,image,When will these drop on the NB website? I have...,0,[],1.0,https://i.redd.it/qm6jfwbofbp71.jpg,all_ads,6,"[when, will, these, drop, on, the, nb, website..."


### Lemmatize the title variable 

In [15]:
# NLTK WordNet lemmatizer instance, applied to the title tokens in the following cell.
lemmatizer = WordNetLemmatizer()

In [16]:
# Preview only: the lemmatized token lists are displayed, not stored on the frame.
newbalance_df['title_token'].apply(
    lambda title_tokens: list(map(lemmatizer.lemmatize, title_tokens))
)

0     [ordered, a, pair, of, these, my, first, 327, ...
1         [started, grey, day, getting, a, lil, hooked]
2         [started, grey, day, getting, a, lil, hooked]
3     [started, on, grey, day, getting, a, lil, hooked]
4     [when, will, these, drop, on, the, nb, website...
                            ...                        
95                 [my, 990s, are, cooler, than, yours]
96    [anyone, know, what, happened, with, the, joef...
97    [57, 40, s, paired, with, the, nb, short, only...
98    [say, what, you, want, but, these, are, all, v...
99                                     [latest, pickup]
Name: title_token, Length: 1000, dtype: object

### Stem the title variable 

In [17]:
# NLTK Porter stemmer instance, applied to the title tokens in the following cell.
p_stemmer = PorterStemmer()

In [18]:
# Preview only: the stemmed token lists are displayed, not stored on the frame.
newbalance_df['title_token'].apply(
    lambda title_tokens: list(map(p_stemmer.stem, title_tokens))
)

0     [order, a, pair, of, these, my, first, 327, s,...
1                 [start, grey, day, get, a, lil, hook]
2                 [start, grey, day, get, a, lil, hook]
3             [start, on, grey, day, get, a, lil, hook]
4     [when, will, these, drop, on, the, nb, websit,...
                            ...                        
95                   [my, 990, are, cooler, than, your]
96    [anyon, know, what, happen, with, the, joefres...
97    [57, 40, s, pair, with, the, nb, short, onli, ...
98    [say, what, you, want, but, these, are, all, v...
99                                     [latest, pickup]
Name: title_token, Length: 1000, dtype: object

### What the tokens would look like with no stop words

In [19]:
eng_stopwords = stopwords.words('english')
# Membership tests on a list scan it for every token; a set gives the same
# answer with constant-time lookups. The list itself is kept under its original name.
eng_stopword_set = set(eng_stopwords)
# Preview only: the filtered token lists are displayed, not stored on the frame.
newbalance_df['title_token'].apply(
    lambda tokens: [token for token in tokens if token not in eng_stopword_set]
)

0                 [ordered, pair, first, 327, im, love]
1            [started, grey, day, getting, lil, hooked]
2            [started, grey, day, getting, lil, hooked]
3            [started, grey, day, getting, lil, hooked]
4     [drop, nb, website, seen, online, shops, want,...
                            ...                        
95                                       [990s, cooler]
96    [anyone, know, happened, joefreshgoods, outsid...
97    [57, 40, paired, nb, shorts, way, love, reflec...
98                                   [say, want, vibes]
99                                    [latest, pickups]
Name: title_token, Length: 1000, dtype: object

### Remerge the title tokens to have a "stripped" object to analyze 

In [20]:
# Join each row's token list back into one space-separated string.
newbalance_df['title_tokens_merged'] = newbalance_df['title_token'].apply(' '.join)

### Build the final dataset for analysis

In [21]:
# Narrow the frame to the token list, the merged token string, and the subreddit label.
newbalance_df = newbalance_df.loc[:, ['title_token', 'title_tokens_merged', 'subreddit']]

In [22]:
# Preview the first rows of the final three-column dataset.
newbalance_df.head()

Unnamed: 0,title_token,title_tokens_merged,subreddit
0,"[ordered, a, pair, of, these, my, first, 327, ...",ordered a pair of these my first 327 s im in love,Newbalance
1,"[started, grey, day, getting, a, lil, hooked]",started grey day getting a lil hooked,Newbalance
2,"[started, grey, day, getting, a, lil, hooked]",started grey day getting a lil hooked,Newbalance
3,"[started, on, grey, day, getting, a, lil, hooked]",started on grey day getting a lil hooked,Newbalance
4,"[when, will, these, drop, on, the, nb, website...",when will these drop on the nb website i have ...,Newbalance


In [23]:
# Create the output folder first: to_csv does not create missing directories,
# so the save would fail on a checkout that has no datasets/ folder yet.
output_path = Path('datasets/newbalance_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the row index out of the file.
newbalance_df.to_csv(output_path, index=False)