# Getting Nike data from the r/Nike subreddit on Reddit

In [1]:
import requests 
import time 
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [None]:
# Raise the display cap so frames of up to 101 rows are printed in full.
pd.options.display.max_rows = 101

In [None]:
# Request settings are defined once here so the paging loop does not need to
# hard-code them. The `subreddit` value restricts the search to r/Nike.
# `before` is seeded with the created_utc of the first submission found in an
# initial call (1633216217 is 2021-10-02 23:10:17 UTC).

url = "https://api.pushshift.io/reddit/search/submission"
subreddit = "Nike"
before = 1_633_216_217

# One DataFrame per fetched page is collected here.
df_list = []

In [None]:
# Page backwards through the subreddit: up to 10 requests of 100 submissions
# each. After every page, `before` is moved to the created_utc of the last
# submission returned so the next request asks for older posts.
# NOTE(review): this assumes the API returns each page newest-first, so the
# last element is the oldest one - confirm against the API documentation.
for _ in range(10):
    params = {
        'subreddit': subreddit,
        'size': 100,
        'before': before,
    }
    # A timeout keeps the cell from hanging forever on a stalled connection.
    res = requests.get(url, params=params, timeout=30)
    # Fail loudly on HTTP errors instead of trying to parse an error page.
    res.raise_for_status()
    data = res.json()

    posts = data.get('data', [])
    if not posts:
        # An empty page means there is nothing older to fetch; indexing
        # posts[-1] here would raise IndexError.
        print('no more posts returned, stopping early')
        break

    before = posts[-1]['created_utc']
    print(f'before updated to: {before}')

    post_df = pd.DataFrame(posts)
    df_list.append(post_df)

    # Pause between requests to stay gentle on the API.
    time.sleep(3)

# ignore_index gives the combined frame one continuous 0..n-1 index instead of
# repeating each page's 0..99 labels. pd.concat raises ValueError if no page
# was collected at all.
nike_df = pd.concat(df_list, ignore_index=True)

### Looking at the data 

In [None]:
# Preview the first five rows of the combined DataFrame.
nike_df.head()

In [None]:
# (number of rows, number of columns) of the combined DataFrame.
nike_df.shape

In [None]:
# Column labels of the combined DataFrame built from the API responses.
nike_df.columns

In [None]:
# Count missing values per column and list the emptiest columns first.
missing_per_column = nike_df.isnull().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Drop sparsely populated columns: a column is kept only if at most one of
# its values is missing. The threshold is derived from the actual row count
# instead of the hard-coded 999, so the filter still works when the fetch
# returned fewer or more than 1000 rows (with exactly 1000 rows it is the
# same threshold as before).
min_non_null = max(len(nike_df) - 1, 0)
nike_df.dropna(thresh=min_non_null, axis=1, inplace=True)

In [None]:
# Re-check the per-column missing-value counts, emptiest columns first.
missing_per_column = nike_df.isnull().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Shape again, to see how many columns remain after the dropna step.
nike_df.shape

### Building the dataset 

In [None]:
# The subreddit, selftext and title columns are the ones to pay attention to.
columns_of_interest = ['subreddit', 'selftext', 'title']

nike_df[columns_of_interest].head()

In [None]:
# Frequency of each distinct title; a count above 1 means the title repeats.
nike_df['title'].value_counts()

### Tokenize the title variable  

In [None]:
# Tokens are runs of word characters, so punctuation is discarded.
tokenizer = RegexpTokenizer(r'\w+')


def _lowercase_tokens(title):
    """Return the lower-cased title split into word tokens."""
    return tokenizer.tokenize(title.lower())


# Store the token list of every title in a new column.
nike_df['title_token'] = nike_df['title'].apply(_lowercase_tokens)

nike_df.head()

### Lemmatize the title variable 

In [None]:
# Single WordNetLemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

In [None]:
# Preview of the lemmatized tokens for each title.
# NOTE(review): the result is only displayed, never stored, so 'title_token'
# keeps the raw tokens - confirm that this is intended.
nike_df['title_token'].apply(lambda words: list(map(lemmatizer.lemmatize, words)))

### Stem the title variable

In [None]:
# Single PorterStemmer instance, reused for every token.
p_stemmer = PorterStemmer()

In [None]:
# Preview of the stemmed tokens for each title.
# NOTE(review): the result is only displayed, never stored, so 'title_token'
# keeps the raw tokens - confirm that this is intended.
nike_df['title_token'].apply(lambda words: list(map(p_stemmer.stem, words)))

### What the tokens would look like with no stop words

In [None]:
# Preview of each title's tokens with English stop words removed; the result
# is displayed only and 'title_token' itself is left unchanged.
eng_stopwords = stopwords.words('english')
# Membership is tested once per token, so use a set for constant-time lookups
# instead of scanning the whole stop-word list each time.
stopword_set = set(eng_stopwords)
nike_df['title_token'].apply(lambda tokens: [token for token in tokens if token not in stopword_set])

### Remerge the title tokens to have a "stripped" object to analyze 

In [None]:
# Join each title's tokens back into one space-separated string.
nike_df['title_tokens_merged'] = nike_df['title_token'].apply(' '.join)

### Build the final dataset for analysis

In [None]:
# Keep only the columns needed for the analysis.
final_columns = ['title_token', 'title_tokens_merged', 'subreddit']
nike_df = nike_df[final_columns]

In [None]:
# Display the DataFrame as it stands at this point.
nike_df

### Save the dataframe to a csv

In [None]:
from pathlib import Path

# to_csv does not create missing parent directories, so make sure the
# target folder exists before writing.
output_path = Path('datasets') / 'nike_data.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)

# index=False leaves the row index out of the file.
# NOTE: the list-valued 'title_token' cells are written as their string
# representation, so they need parsing again after reading the CSV back.
nike_df.to_csv(output_path, index=False)