## Import libraries

In [4]:
import pandas as pd 
import time  
import requests 
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer 
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer  
from nltk.corpus import stopwords  
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
from sklearn.model_selection import train_test_split, GridSearchCV 
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re

#### Define a function to get posts from Reddit through the Pushshift API (5000 posts: 100 rounds of 50 posts each)

In [5]:
def get_posts(subreddit, rounds=100, size=50, pause=2):
    """Download submissions from one subreddit through the Pushshift API.

    The function pages backwards in time: every request asks for the newest
    ``size`` submissions created before the oldest submission fetched so far,
    so consecutive pages do not repeat each other.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query (e.g. ``'keto'``).
    rounds : int, default 100
        Maximum number of requests to send.
    size : int, default 50
        Number of submissions requested per round.
    pause : float, default 2
        Seconds to sleep after each successful round.

    Returns
    -------
    pandas.DataFrame
        All fetched pages concatenated in download order. The index is not
        reset, so it restarts at 0 for every page. An empty DataFrame is
        returned when the API yields no submissions at all.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status code.
    """
    before = None  # no upper time bound on the first request
    posts = []  # one DataFrame per downloaded page
    for _ in range(rounds):
        # Newest-first ordering on creation time, limited to posts older than `before`.
        params = {
            'subreddit': subreddit,
            'size': size,
            'before': before,
            'sort': 'desc',
            'sort_type': 'created_utc',
        }
        req = requests.get(
            "https://api.pushshift.io/reddit/search/submission",
            params=params,
            timeout=30,  # do not hang forever on a stalled connection
        )
        # Fail loudly on 4xx/5xx instead of with an opaque KeyError below.
        req.raise_for_status()
        mine = req.json()['data']
        if not mine:
            # Nothing older is left; repeating the same request would only
            # waste the remaining rounds.
            break
        posts.append(pd.DataFrame.from_dict(mine))
        # Results are sorted newest-first, so the last entry is the oldest one;
        # its timestamp becomes the cursor for the next page.
        before = mine[-1]['created_utc']
        time.sleep(pause)  # be polite to the API between requests
    if not posts:
        return pd.DataFrame()
    return pd.concat(posts, sort=False)

#### Get 5000 different posts from the nutrition subreddit and 5000 different posts from the keto subreddit

In [6]:
# Download the nutrition submissions, then display (rows, columns)
# to confirm that 5000 posts came back.
group1 = get_posts('nutrition')
group1.shape

(5000, 74)

In [10]:
# Each post has its own id, so the number of distinct ids confirms
# that the 5000 rows are 5000 different posts.
len(pd.unique(group1['id']))

5000

In [7]:
# Same download for the keto subreddit; display (rows, columns).
group2 = get_posts('keto')
group2.shape

(5000, 77)

In [8]:
# Number of distinct post ids in the keto download.
len(pd.unique(group2['id']))

5000

#### Merge the two groups into one dataframe and keep only the needed columns

In [15]:
# Only the post id, title, selftext and subreddit are needed downstream.
SUBFIELDS = ['id', 'title', 'selftext', 'subreddit']

# Stack the two downloads and keep the needed columns. The explicit .copy()
# makes Final_group an independent frame, so adding columns to it later does
# not trigger pandas' SettingWithCopyWarning. The per-page index of the
# downloads is kept as is.
Final_group = pd.concat([group1, group2])[SUBFIELDS].copy()

In [16]:
# Preview the first ten rows of the merged frame.
Final_group.head(10)

Unnamed: 0,id,title,selftext,subreddit
0,q4dgzo,Vitamin B differences,Are there any noticeable differences between V...,nutrition
1,q4cvw0,Does a high protein diet cause weight loss?,[removed],nutrition
2,q4chsw,"Irritable Bowel Syndrome: Causes, Symptoms, Me...","Frequently abdominal pain, bloating, diarrhea ...",nutrition
3,q4bo66,Can anyone help me with figuring out how to re...,[removed],nutrition
4,q4baia,Questions about Soaking/Sprouting Grains and F...,My head is bogged down with conflicting pieces...,nutrition
5,q47q5j,I haven’t eaten for a few days (advice),[removed],nutrition
6,q47eb1,How much white rice per day can one safely con...,[removed],nutrition
7,q468hg,How much protein can the body absorb over seve...,If someone were to practice intermittent fasti...,nutrition
8,q465dy,How much protein can your body absorb within s...,[removed],nutrition
9,q45d1g,Has anyone tried Lyma Life supplements?,"Hello everyone, I wanted to ask if anyone of y...",nutrition


In [17]:
# Preview the last ten rows of the merged frame.
Final_group.tail(10)

Unnamed: 0,id,title,selftext,subreddit
40,otd2pp,Tryptophan supplement on keto + IF,I have a question about taking tryptophan supp...,keto
41,otcrwg,Best Coffee Creamer (imo),My new obsession for coffee creamer is: 1 tbsp...,keto
42,otcnu0,Is it possible to smell your own ketones?,[removed],keto
43,otcnsw,Random foods and ingredients taste sweeter?,Hello!! I’m fairly new to keto and have been o...,keto
44,otcmu6,How do you guys get your protein each day?,[removed],keto
45,otcav1,Low Body Temperature Anyone?,So I have been on keto (on w/ breaks) for abou...,keto
46,otc7nr,How to start keto?,Are there any good resources on how to get sta...,keto
47,otbdrl,To whoever recommended a pinch of salt and cin...,[removed],keto
48,otbdcl,Keto help :(,Hello everyone!\n\nMy fiance is diabetic and h...,keto
49,otbc47,Continue taking bhb after reaching ketosis?,[removed],keto


In [20]:
# Binary target: 1 for posts from the keto subreddit, 0 for every other
# value (here, the nutrition posts).
Final_group['label'] = Final_group['subreddit'].eq('keto').astype('int64')

In [21]:
# Check the first rows again now that the label column has been added.
Final_group.head()

Unnamed: 0,id,title,selftext,subreddit,label
0,q4dgzo,Vitamin B differences,Are there any noticeable differences between V...,nutrition,0
1,q4cvw0,Does a high protein diet cause weight loss?,[removed],nutrition,0
2,q4chsw,"Irritable Bowel Syndrome: Causes, Symptoms, Me...","Frequently abdominal pain, bloating, diarrhea ...",nutrition,0
3,q4bo66,Can anyone help me with figuring out how to re...,[removed],nutrition,0
4,q4baia,Questions about Soaking/Sprouting Grains and F...,My head is bogged down with conflicting pieces...,nutrition,0


**We are going to predict the subreddit a post came from based on the post title. We will not use the selftext because a post may contain only an image or a video, which leaves the selftext as missing data.**

In [22]:
# Save the labelled posts to a CSV file, leaving the row index out of the file.
Final_group.to_csv('./Final_group2.csv', index=False)