# Getting the Nike data from the reddit subreddit r/Nike 

In [1]:
import requests 
import time 
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [2]:
pd.set_option("display.max_rows", 101) # in order to display all rows

In [3]:
# set the parameters outside so I dont have to hardcode in the for loop 
# gets all posts from reddit in general 
# also set the before parameter with first created_utc found in initial call of the submissions 

url = 'https://api.pushshift.io/reddit/search/submission'
subreddit = 'Nike'
before = 1633216217

df_list = []

In [4]:
for _ in range (10):
    params = {
        'subreddit': subreddit, 
        'size': 100,
        'before': before
        }
    res = requests.get(url, params= params)
    data = res.json()
    
    before = data['data'][-1]['created_utc']
    print(f'before updated to: {before}')
    
    post_df = pd.DataFrame(data['data'])
    df_list.append(post_df)
    
    time.sleep(3)
    
nike_df = pd.concat(df_list)

before updated to: 1632945797
before updated to: 1632614816
before updated to: 1632291073
before updated to: 1631993802
before updated to: 1631746673
before updated to: 1631481724
before updated to: 1631242973
before updated to: 1630937206
before updated to: 1630689596
before updated to: 1630434415


### Looking at the data 

In [5]:
nike_df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,is_gallery,media_metadata,media,media_embed,secure_media,secure_media_embed,poll_data,crosspost_parent,crosspost_parent_list,author_cakeday
0,[],False,SakuraKoyo,,[],,text,t2_btqawol1,False,False,...,,,,,,,,,,
1,[],False,Stezi-69,,[],,text,t2_byfjodcl,False,False,...,,,,,,,,,,
2,[],False,CosmicFlareXL,,[],,text,t2_14krrn,False,False,...,,,,,,,,,,
3,[],False,Sschiggyy,,[],,text,t2_trbgyzw,False,False,...,,,,,,,,,,
4,[],False,jahapahaoajao,,[],,text,t2_3lkhe0dg,False,False,...,,,,,,,,,,


In [6]:
nike_df.shape

(999, 79)

In [7]:
nike_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_richtext', 'link_flair_text_color', 'link_flair_type',
       'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts',
       'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'thumbnail', 'title', 'total_awards_rece

In [8]:
nike_df.isnull().sum().sort_values(ascending=False)

author_flair_css_class         999
author_flair_text              999
author_cakeday                 997
poll_data                      993
crosspost_parent_list          990
crosspost_parent               990
secure_media_embed             975
media_embed                    975
media                          974
secure_media                   974
removed_by_category            918
gallery_data                   794
media_metadata                 779
is_gallery                     777
preview                        491
post_hint                      491
link_flair_text                483
link_flair_template_id         483
link_flair_css_class           483
thumbnail_width                342
thumbnail_height               342
url_overridden_by_dest         267
subreddit_id                     0
subreddit                        0
stickied                         0
selftext                         0
spoiler                          0
send_replies                     0
subreddit_type      

In [9]:
nike_df.dropna(thresh=999, axis=1, inplace=True)

In [10]:
nike_df.isnull().sum().sort_values(ascending=False)

all_awardings                  0
locked                         0
no_follow                      0
num_comments                   0
num_crossposts                 0
over_18                        0
parent_whitelist_status        0
permalink                      0
pinned                         0
pwls                           0
retrieved_on                   0
score                          0
selftext                       0
send_replies                   0
spoiler                        0
stickied                       0
subreddit                      0
subreddit_id                   0
subreddit_subscribers          0
subreddit_type                 0
thumbnail                      0
title                          0
total_awards_received          0
treatment_tags                 0
upvote_ratio                   0
url                            0
whitelist_status               0
media_only                     0
link_flair_type                0
allow_live_comments            0
link_flair

In [11]:
nike_df.shape

(999, 57)

### Building the dataset 

In [12]:
#pay attention to the subreddit, selftext, and title columns 

nike_df[['subreddit', 'selftext', 'title']].head()

Unnamed: 0,subreddit,selftext,title
0,Nike,"I bought a Nike PG5 shoes size 7 online, it’s ...",Should I go half a size up Nike basketball shoes?
1,Nike,,"Check out this ""Colab"" i made with the Jordan ..."
2,Nike,,Thoughts on the upcoming Nike/Jordan releases ...
3,Nike,I cant find anything about when i search it on...,Why are they not selling the Nike Sfb's anymore?
4,Nike,Or do I have to bite the bullet and get them f...,Will there ever be a restock on nike Jordan 1 ...


In [13]:
nike_df['title'].value_counts()

What shoes are these?                                                                                                                                 8
Anyone know what shoes these are?                                                                                                                     4
Real or Reps?                                                                                                                                         2
Saw a pair of Jordan’s but forgot what they were called and can’t find them anymore 🤦‍♀️. I made a sketch from memory. Can anybody recognize them?    2
Nike revolution 5                                                                                                                                     2
                                                                                                                                                     ..
24-7's                                                                                  

### Tokenize the title variable  

In [14]:
tokenizer = RegexpTokenizer(pattern=r'\w+')

nike_df['title_token'] = nike_df['title'].apply(lambda row: tokenizer.tokenize(row.lower()))

nike_df.head()

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_richtext,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,author_premium,awarders,...,subreddit_type,thumbnail,title,total_awards_received,treatment_tags,upvote_ratio,url,whitelist_status,wls,title_token
0,[],False,SakuraKoyo,[],text,t2_btqawol1,False,False,False,[],...,public,self,Should I go half a size up Nike basketball shoes?,0,[],1.0,https://www.reddit.com/r/Nike/comments/q05gho/...,all_ads,6,"[should, i, go, half, a, size, up, nike, baske..."
1,[],False,Stezi-69,[],text,t2_byfjodcl,False,False,False,[],...,public,spoiler,"Check out this ""Colab"" i made with the Jordan ...",0,[],1.0,https://i.redd.it/w0eb0i8z54r71.jpg,all_ads,6,"[check, out, this, colab, i, made, with, the, ..."
2,[],False,CosmicFlareXL,[],text,t2_14krrn,False,False,False,[],...,public,default,Thoughts on the upcoming Nike/Jordan releases ...,0,[],1.0,https://vm.tiktok.com/ZMRTe8MCd/,all_ads,6,"[thoughts, on, the, upcoming, nike, jordan, re..."
3,[],False,Sschiggyy,[],text,t2_trbgyzw,False,False,False,[],...,public,self,Why are they not selling the Nike Sfb's anymore?,0,[],1.0,https://www.reddit.com/r/Nike/comments/q02uzu/...,all_ads,6,"[why, are, they, not, selling, the, nike, sfb,..."
4,[],False,jahapahaoajao,[],text,t2_3lkhe0dg,False,False,False,[],...,public,self,Will there ever be a restock on nike Jordan 1 ...,0,[],1.0,https://www.reddit.com/r/Nike/comments/q01vld/...,all_ads,6,"[will, there, ever, be, a, restock, on, nike, ..."


### Lemmatize the title variable 

In [15]:
lemmatizer = WordNetLemmatizer()

In [16]:
nike_df['title_token'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])

0     [should, i, go, half, a, size, up, nike, baske...
1     [check, out, this, colab, i, made, with, the, ...
2     [thought, on, the, upcoming, nike, jordan, rel...
3     [why, are, they, not, selling, the, nike, sfb,...
4     [will, there, ever, be, a, restock, on, nike, ...
                            ...                        
95    [what, are, those, found, them, in, my, baseme...
96                        [new, air, force, 1, stained]
97    [did, nike, discontinue, these, medium, olive,...
98    [should, i, wear, them, or, you, think, the, p...
99                                     [1985, 3, piece]
Name: title_token, Length: 999, dtype: object

### Stem the title variable

In [17]:
p_stemmer = PorterStemmer()

In [18]:
nike_df['title_token'].apply(lambda tokens: [p_stemmer.stem(token) for token in tokens])

0     [should, i, go, half, a, size, up, nike, baske...
1     [check, out, thi, colab, i, made, with, the, j...
2     [thought, on, the, upcom, nike, jordan, releas...
3     [whi, are, they, not, sell, the, nike, sfb, s,...
4     [will, there, ever, be, a, restock, on, nike, ...
                            ...                        
95    [what, are, those, found, them, in, my, baseme...
96                          [new, air, forc, 1s, stain]
97    [did, nike, discontinu, these, medium, oliv, b...
98    [should, i, wear, them, or, you, think, the, p...
99                                      [1985, 3, piec]
Name: title_token, Length: 999, dtype: object

### What the tokens would look like with no stop words

In [19]:
eng_stopwords = stopwords.words('english')
nike_df['title_token'].apply(lambda tokens: [token for token in tokens if token not in eng_stopwords])

0             [go, half, size, nike, basketball, shoes]
1     [check, colab, made, jordan, 1, chicargo, 1985...
2     [thoughts, upcoming, nike, jordan, releases, 2...
3                         [selling, nike, sfb, anymore]
4     [ever, restock, nike, jordan, 1, university, b...
                            ...                        
95                   [found, basement, kinda, like, em]
96                       [new, air, force, 1s, stained]
97    [nike, discontinue, medium, olive, boots, find...
98                  [wear, think, price, go, time, sit]
99                                     [1985, 3, piece]
Name: title_token, Length: 999, dtype: object

### Remerge the title tokens to have a "stripped" object to analyze 

In [20]:
nike_df['title_tokens_merged'] = nike_df['title_token'].apply(lambda token: ' '.join(token))

### Build the final dataset for analysis

In [21]:
nike_df = nike_df[['title_token', 'title_tokens_merged', 'subreddit']]

In [22]:
nike_df

Unnamed: 0,title_token,title_tokens_merged,subreddit
0,"[should, i, go, half, a, size, up, nike, baske...",should i go half a size up nike basketball shoes,Nike
1,"[check, out, this, colab, i, made, with, the, ...",check out this colab i made with the jordan 1 ...,Nike
2,"[thoughts, on, the, upcoming, nike, jordan, re...",thoughts on the upcoming nike jordan releases ...,Nike
3,"[why, are, they, not, selling, the, nike, sfb,...",why are they not selling the nike sfb s anymore,Nike
4,"[will, there, ever, be, a, restock, on, nike, ...",will there ever be a restock on nike jordan 1 ...,Nike
...,...,...,...
95,"[what, are, those, found, them, in, my, baseme...",what are those found them in my basement and k...,Nike
96,"[new, air, force, 1s, stained]",new air force 1s stained,Nike
97,"[did, nike, discontinue, these, medium, olive,...",did nike discontinue these medium olive boots ...,Nike
98,"[should, i, wear, them, or, you, think, the, p...",should i wear them or you think the price will...,Nike


### Save the dataframe to a csv

In [23]:
nike_df.to_csv('datasets/nike_data.csv', index=False)