In [1]:
import math
import os
from pathlib import Path
from pprint import pprint

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from IPython import display
# Apply one seaborn theme (dark grid, large "talk" fonts, Dark2 palette) to every plot below.
sns.set(style='darkgrid', context='talk', palette='Dark2')

Here we are using a Reddit API wrapper, called `praw`, to loop through the /r/politics subreddit headlines.

In [2]:
import praw

In [3]:
# Read the Reddit API credentials from the environment instead of hardcoding
# them, so the secrets never end up in the saved notebook or in version control.
# Export REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the kernel;
# a missing variable raises KeyError right here.
# NOTE(review): credentials that were previously committed in this cell should
# be regenerated in the Reddit app settings.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent=os.environ.get('REDDIT_USER_AGENT', 'ashiqurrahman2205'))

In [4]:
# Collect headlines in a set so that running the scraping cell several times
# never stores the same title twice.
headlines = set()

In [None]:
# Iterate over the newest /r/politics submissions through the API client and
# collect their titles. `headlines` is a set, so re-running this cell only adds
# titles that have not been seen before.
for submission in reddit.subreddit('politics').new(limit=None):
    headlines.add(submission.title)
    # wait=True postpones the clear until the next output arrives, so the
    # running counter is replaced in place instead of flickering each iteration.
    display.clear_output(wait=True)
    print(len(headlines))

We're iterating over the new posts in /r/politics, and by setting `limit` to `None` we can get up to 1000 headlines.

This time we only received 961 headlines.

Without some more advanced tricks we can't go past 1000 results since Reddit cuts off at that point. We can run this loop multiple times and keep adding new headlines to our set, or we can implement a streaming version.

## Labeling our Data

NLTK’s built-in VADER sentiment analyzer ranks a piece of text as positive, negative or neutral using a lexicon of positive and negative words.

We can use this tool by first creating a Sentiment Intensity Analyzer (SIA) to categorize our headlines; then we'll use its `polarity_scores` method to get the sentiment of each one.

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

# The VADER lexicon must be available locally before the analyzer is created.
nltk.download('vader_lexicon')

sia = SIA()

# One record per headline: the scores returned by polarity_scores, with the
# headline text stored alongside them under the 'headline' key.
results = [
    {**sia.polarity_scores(line), 'headline': line}
    for line in headlines
]

# Preview the first few records.
pprint(results[:3], width=100)

In [None]:
# Turn the list of per-headline score records into a table: one row per
# headline, one column per record key.
df = pd.DataFrame(results)
df.head()

The above table contains four columns from the sentiment scoring: Negative (`neg`), Neutral (`neu`), Positive (`pos`) and `compound`.
The first three give the proportion of the headline that falls into each category, and `compound` is a single number that summarizes the overall sentiment.

In [None]:
# Compound scores beyond +/- cutoff count as clearly positive / negative;
# everything in between keeps the neutral label 0.
cutoff = 0.2
is_positive = df['compound'] > cutoff
is_negative = df['compound'] < -cutoff

df['label'] = 0
df.loc[is_positive, 'label'] = 1
df.loc[is_negative, 'label'] = -1
df.head()

# We are creating a positive label of 1 if the compound is greater than 0.2, and a label of -1 if compound is less than -0.2. Everything else will be 0.

In [11]:
# Keep only the headline text and its label for the exported dataset.
df2 = df[['headline', 'label']]

csv_path = Path('reddit_headlines_labels.csv')
# The file is opened in append mode so that repeated runs accumulate rows.
# Write the column header only when the file is new or empty; otherwise every
# re-run would insert another "headline,label" line in the middle of the data.
write_header = not csv_path.exists() or csv_path.stat().st_size == 0
df2.to_csv(csv_path, mode='a', encoding='utf-8', index=False, header=write_header)

# Dataset Info and Statistics

In [None]:
# Show up to five example headlines for each of the two polar labels.
for heading, label_value in (("Positive headlines:\n", 1), ("\nNegative headlines:\n", -1)):
    print(heading)
    sample = df.loc[df['label'] == label_value, 'headline'].head(5).tolist()
    pprint(sample, width=200)

In [None]:
label_column = df['label']

# Absolute number of headlines per sentiment label.
print(label_column.value_counts())

# The same breakdown expressed as a percentage of all headlines.
print(label_column.value_counts(normalize=True) * 100)

In [None]:
fig, ax = plt.subplots(figsize=(8, 8))

# Share of headlines per label, in percent.
counts = df.label.value_counts(normalize=True) * 100

# Derive each bar's name from its label value and fix the bar order, so a label
# that is missing from the data shows up as an empty bar instead of shifting
# the names onto the wrong bars.
label_names = {-1: 'Negative', 0: 'Neutral', 1: 'Positive'}
ordered_counts = counts.reindex(list(label_names), fill_value=0.0)

sns.barplot(x=[label_names[value] for value in ordered_counts.index],
            y=ordered_counts.to_numpy(), ax=ax)

ax.set_xlabel("Sentiment")
ax.set_ylabel("Percentage")

plt.show()

In [None]:
import nltk

# word_tokenize relies on the Punkt sentence tokenizer data. Older NLTK
# releases load the 'punkt' package, while recent releases look for
# 'punkt_tab' instead, so fetch both to keep the next cell working on either.
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

In [None]:


from nltk.tokenize import RegexpTokenizer, word_tokenize

# A small sentence with punctuation and a contraction to show how the
# default tokenizer splits text.
example = "This is an example sentence! However, it isn't a very informative one"

tokens = word_tokenize(example, language='english')
print(tokens)

In [None]:
# Tokenize with a regular expression that matches runs of word characters
# only, so punctuation is not emitted as tokens.
tokenizer = RegexpTokenizer(pattern=r'\w+')
tokenizer.tokenize(example)

In [None]:
from nltk.corpus import stopwords

# The stopwords corpus is not bundled with NLTK; download it first so this
# cell does not raise LookupError on a fresh environment.
nltk.download('stopwords')

# English stop words, used to filter out very common words.
stop_words = stopwords.words('english')
print(stop_words[:20])

# Now, we will send the CSV file of our scraped data to Azure blob storage