## Imports 

In [2]:
# Necessary libraries and modules we will use 
import requests
import numpy as np
import pandas as pd
import time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from bs4 import BeautifulSoup

#Import for setting scrape rate
from time import sleep
from random import randint

## API Call

In [3]:
# Using the pushshift url 
url = 'https://api.pushshift.io/reddit/search/submission'

In [12]:
# Creating our first params
params = {'subreddit': 'houseplants', 'size':100}

In [13]:
res = requests.get(url, params)

In [14]:
res.status_code

200

In [15]:
# Turning our request data into json use as python dictionary
data = res.json()

In [16]:
posts = data['data']

In [17]:
len(posts)

100

### Scrape the posts sequentially 

In [19]:
# using a for loop to scrape posts 
for count in range(50): 
    sleep(2)
    
    # stops duplicates by grabbing last post
    utc = posts[-1]['created_utc'] 
    
    params = {
    'subreddit' : 'houseplants',
    'size' : 100,
    'before' : utc}
    
    data = requests.get(url, params)
    data = data.json()['data']
    posts = posts + data

In [20]:
# checking the amount of posts 
len(posts)

5100

In [22]:
#Create dataframe from posts
df_houseplants = pd.DataFrame(posts)

#Isolate dataframe for desired features
df_houseplants = df_houseplants[['subreddit', 'title', 'selftext']]

In [23]:
df_houseplants.head()

Unnamed: 0,subreddit,title,selftext
0,houseplants,"Not a fan of any Calathea, but this 'Shine Sta...",
1,houseplants,My new marble queen pothos! Welcome to the fam...,
2,houseplants,Moisture meter says it's @ 9. Got it from Cost...,
3,houseplants,Made a couple of trellises today!,
4,houseplants,Made a couple of trellises tonight!,[deleted]


In [24]:
df_houseplants.isnull().sum()

subreddit    0
title        0
selftext     2
dtype: int64

In [27]:
# dropping null values since it is only 2 posts
df_houseplants.dropna(inplace=True)

In [29]:
df_houseplants.shape

(5098, 3)

In [30]:
# saving the DF to a csv
df_houseplants.to_csv('houseplants.csv', index=False)

In [2]:
# opening the csv to view 
df = pd.read_csv('houseplants.csv')

In [3]:
df.head()

Unnamed: 0,subreddit,title,selftext
0,houseplants,"Not a fan of any Calathea, but this 'Shine Sta...",
1,houseplants,My new marble queen pothos! Welcome to the fam...,
2,houseplants,Moisture meter says it's @ 9. Got it from Cost...,
3,houseplants,Made a couple of trellises today!,
4,houseplants,Made a couple of trellises tonight!,[deleted]


In [9]:
df.dtypes

subreddit    object
title        object
selftext     object
dtype: object

In [10]:
df[df['selftext'] == '[deleted]']

Unnamed: 0,subreddit,title,selftext
4,houseplants,Made a couple of trellises tonight!,[deleted]
5,houseplants,Made a couple of trellises tonight!!,[deleted]
15,houseplants,Update: took some advice from reddit and repot...,[deleted]
23,houseplants,Obsessed with my new plant/pot combo,[deleted]
43,houseplants,My mom picked these up for me at home in MI an...,[deleted]
...,...,...,...
4303,houseplants,Some of my friends:),[deleted]
4304,houseplants,My little aquarium full of babies and some ove...,[deleted]
4866,houseplants,I HAD NO IDEA THIS PLANT FLOWERS! I was gettin...,[deleted]
4868,houseplants,I HAD NO IDEA THIS PLANT FLOWERED! I was getti...,[deleted]


In [11]:
df.shape

(5098, 3)

In [12]:
df.iloc[0]

subreddit                                          houseplants
title        Not a fan of any Calathea, but this 'Shine Sta...
selftext                                                   NaN
Name: 0, dtype: object

In [17]:
df['selftext'].sort_values()

4481    \n\n[View Poll](https://www.reddit.com/poll/ln...
4190    \n\n[View Poll](https://www.reddit.com/poll/lo...
3988    \n\n[View Poll](https://www.reddit.com/poll/lo...
3361    \n\n[View Poll](https://www.reddit.com/poll/lp...
608     \n\n[View Poll](https://www.reddit.com/poll/lt...
                              ...                        
5093                                                  NaN
5094                                                  NaN
5095                                                  NaN
5096                                                  NaN
5097                                                  NaN
Name: selftext, Length: 5098, dtype: object

## Interior Design Subreddits

In [7]:
url = 'https://api.pushshift.io/reddit/search/submission'

In [8]:
params = {'subreddit': 'InteriorDesign', 'size':100}

In [9]:
res = requests.get(url, params)

In [10]:
res.status_code

200

In [12]:
data = res.json()

In [13]:
posts = data['data']

In [15]:
len(posts)

5100

In [14]:
for count in range(50): 
    sleep(2)
    
    utc = posts[-1]['created_utc'] 
    
    params = {
    'subreddit' : 'InteriorDesign',
    'size' : 100,
    'before' : utc}
    
    data = requests.get(url, params)
    data = data.json()['data']
    posts = posts + data

In [16]:
len(posts)

5100

In [18]:
#Create dataframe for posts
df_id = pd.DataFrame(posts)

#Isolate dataframe for desired features
df_id = df_id[['subreddit', 'title', 'selftext']]

In [19]:
df_id

Unnamed: 0,subreddit,title,selftext
0,InteriorDesign,Marks on newly painted dark wall,
1,InteriorDesign,"Coral Castle Hallway. Travertine floor, Coral ...",
2,InteriorDesign,UK-Based Interior Spray Painter,[removed]
3,InteriorDesign,Does anyone know where to get a decent modular...,[removed]
4,InteriorDesign,Which size rug should i go with for my living ...,
...,...,...,...
5095,InteriorDesign,Narrow living room help please!,
5096,InteriorDesign,6 Office Design Trends For the Post Covid-19 W...,
5097,InteriorDesign,Angolo telefono,
5098,InteriorDesign,Looking for a similar designed media console t...,[removed]


In [20]:
# checking for missing values
df_id.isnull().sum()

subreddit     0
title         0
selftext     30
dtype: int64

In [21]:
# dropping the 30 missing values in self text
df_id.dropna(inplace=True)

In [22]:
df_id.shape

(5070, 3)

In [23]:
# saving to csv to open in another notebook 
df_id.to_csv('interior_design.csv', index=False)