# Introducing Data Science - Text Mining

## Step 2 - Data Retrieval

In [2]:
# Import the PRAW and SQLite3 libraries
import praw
import sqlite3

# Set up connection to the SQLite database file 'reddit.db'. `conn` and the
# cursor `c` are module-level names that prawGetData() uses directly
conn = sqlite3.connect('reddit.db')
c = conn.cursor()

# Execute SQL statements to (re)create the topics and comments tables.
# NOTE: the DROP statements discard every row stored by an earlier run
c.execute('''DROP TABLE IF EXISTS topics''')
c.execute('''DROP TABLE IF EXISTS comments''')
c.execute('''CREATE TABLE topics 
             (topicTitle text, topicText text, topicID text, topicCategory text)''')
c.execute('''CREATE TABLE comments
             (commentText text, commentID text, topicTitle text, topicText text, topicID text, topicCategory text)''')

# Create PRAW user agent so we can use the Reddit API.
# NOTE(review): the recorded output of this cell is a ClientException about
# a missing 'client_id'; per that message the setting can be provided in a
# praw.ini file, as a keyword argument to Reddit(), or as an environment
# variable
user_agent = "Introducing Data Science Book"
r = praw.Reddit(user_agent=user_agent)

# Our list of subreddits we'll draw into our SQLite database
subreddits = ['datascience', 'gameofthrones']
# Maximum number of posts we'll fetch from Reddit per category. Maximum Reddit allows at any single time is also 1000
limit = 1000

ClientException: Required configuration setting 'client_id' missing. 
This setting can be provided in a praw.ini file, as a keyword argument to the `Reddit` class constructor, or as an environment variable.

In [None]:
# Specific fields of the topic are appended to the list. We only
# use the title and text throughout the exercises but the topic ID
# would be useful for building your own (bigger) database of topics
def prawGetData(limit, subredditName):
    """Download the hottest topics of one subreddit and store them in SQLite.

    Uses the module-level PRAW client ``r``, cursor ``c`` and connection
    ``conn``. Every fetched topic becomes a row of the ``topics`` table and
    every comment of a topic a row of the ``comments`` table; the
    insertions are committed before the function returns.

    Parameters:
        limit: maximum number of topics requested from Reddit; also the
            denominator of the progress percentage that is printed.
        subredditName: name of the subreddit to download; stored as the
            ``topicCategory`` of every inserted row.

    Returns:
        None.
    """
    # From subreddits, get hottest 1,000 (in our case) topics.
    # PRAW 3 names these calls get_subreddit()/get_hot(); PRAW >= 4 renamed
    # them to subreddit()/hot(). Support whichever client is installed.
    if hasattr(r, 'get_subreddit'):
        topics = r.get_subreddit(subredditName).get_hot(limit=limit)
    else:
        topics = r.subreddit(subredditName).hot(limit=limit)
    commentInsert = []
    topicInsert = []
    topicNBR = 1
    for topic in topics:
        # This part is an informative print and not necessary
        # for code to work. It only informs you about the
        # download progress (whole percentages from 1% to 99% only)
        if limit:
            percentComplete = (float(topicNBR) / limit) * 100
            if 1 <= percentComplete < 100 and percentComplete == int(percentComplete):
                print('************ TOPIC:' + str(topic.id) +
                      ' ************ COMPLETE: ' + str(percentComplete) +
                      '% ***')
        topicNBR += 1
        try:
            topicInsert.append((topic.title, topic.selftext, topic.id, subredditName))
        except Exception as error:
            # Best effort: one unreadable topic must not abort the download,
            # but report it instead of hiding the failure
            print('SKIPPED TOPIC: ' + repr(error))
        try:
            for comment in topic.comments:
                # Append comments to a list. These are not used in
                # the exercise but now you have them for experimentation
                commentInsert.append((comment.body, comment.id,
                                      topic.title, topic.selftext,
                                      topic.id, subredditName))
        except Exception as error:
            # Best effort as above; comments gathered before the failure
            # are kept
            print('SKIPPED REMAINING COMMENTS OF TOPIC: ' + repr(error))
    print('**************************************')
    print('INSERTING DATA INTO SQLITE')
    # Insert all topics into SQLite database
    c.executemany('INSERT INTO topics VALUES (?,?,?,?)', topicInsert)
    print('INSERTED TOPICS')
    # Insert all comments into SQLite database
    c.executemany('INSERT INTO comments VALUES (?,?,?,?,?,?)', commentInsert)
    print('INSERTED COMMENTS')
    # Commit changes (data insertions) to database
    # Without the commit, no data will be inserted
    conn.commit()


In [None]:
# Download and store the data of each subreddit configured earlier
for subject in subreddits:
    prawGetData(limit, subject)
    

## Step 3 - Data Preparation

In [None]:
# Import all required libraries
import sqlite3
import nltk
import matplotlib.pyplot as plt
from collections import OrderedDict
import random

# Download corpora we make use of
nltk.download('punkt')
nltk.download('stopwords')

# Make connection to SQLite database that contains our Reddit data
conn = sqlite3.connect('reddit.db')
# Bind the cursor of this connection to `c`; the data processing
# functions below run their queries through it
c = conn.cursor()

### Word filtering and lowercasing functions

In [None]:
# wordFilter() function will remove a term from an array of terms
def wordFilter(excluded, wordrow):
    """Return the terms of ``wordrow`` that are not in ``excluded``.

    Parameters:
        excluded: collection of terms to drop.
        wordrow: iterable of terms to filter; it is not modified.

    Returns:
        A new list with the remaining terms in their original order
        (duplicates are kept).
    """
    lookup = excluded
    if isinstance(excluded, (list, tuple)):
        # A hash-based lookup avoids scanning the whole exclusion list
        # once per term
        try:
            lookup = frozenset(excluded)
        except TypeError:
            # Unhashable entries: keep the plain sequence scan
            lookup = excluded
    return [word for word in wordrow if word not in lookup]

# Stop word variable contains the English stop words that ship with
# NLTK; data_processing() passes it to wordFilter() as the exclusion list
stopwords = nltk.corpus.stopwords.words('english')

# lowerCaseArray() builds a list with the lowercased version of
# every term it is given
def lowerCaseArray(wordrow):
    """Return a new list holding ``term.lower()`` for each term of ``wordrow``."""
    result = []
    for term in wordrow:
        result.append(term.lower())
    return result


### First data preparation function and execution

In [None]:
def data_processing(sql):
    """Run ``sql`` on the module-level cursor ``c`` and tokenize the rows.

    Column 0 (title) and column 1 (topic text) of every row are joined
    into one text, tokenized, lowercased and stripped of stop words.

    Returns a dict with two keys:
        'wordMatrix': one list of terms per fetched row.
        'all_words': every term of every row in a single flat list
            (used for data exploration).
    """
    c.execute(sql)
    word_matrix = []
    all_words = []
    # iter() with a sentinel keeps calling c.fetchone() until it yields None
    for record in iter(c.fetchone, None):
        text_blob = record[0] + " " + record[1]
        tokens = nltk.tokenize.word_tokenize(text_blob)
        kept_terms = wordFilter(stopwords, lowerCaseArray(tokens))
        all_words.extend(kept_terms)
        word_matrix.append(kept_terms)
    return {'wordMatrix': word_matrix, 'all_words': all_words}

# Our subreddits as defined earlier
subreddits = ['datascience', 'gameofthrones']
data = {}

# Call data processing function for every subreddit.
# The query is built by string concatenation, which is acceptable only
# because `subject` comes from the hard-coded list above
for subject in subreddits:
    data[subject] = data_processing(sql='''SELECT topicTitle, topicText, topicCategory 
                                             FROM topics 
                                            WHERE topicCategory = ''' + "'" + subject + "'")
    

## Step 4 - Data Exploration

In [None]:
# Let's look at the frequency distribution of our terms.
# The counts are copied into a list because a dict view (what .values()
# returns on Python 3) is not reliably accepted by plt.hist
wordfreqs_cat1 = nltk.FreqDist(data['datascience']['all_words'])
plt.hist(list(wordfreqs_cat1.values()), bins=range(10))
plt.title('Term frequency distribution: datascience')
plt.xlabel('Occurrences of a term')
plt.ylabel('Number of terms')
plt.show()
wordfreqs_cat2 = nltk.FreqDist(data['gameofthrones']['all_words'])
plt.hist(list(wordfreqs_cat2.values()), bins=range(20))
plt.title('Term frequency distribution: gameofthrones')
plt.xlabel('Occurrences of a term')
plt.ylabel('Number of terms')
plt.show()

In [None]:
# Plots show that a lot of our terms occur only once.
# Single-occurrence terms such as these are called hapaxes. They
# add little value to our model and can be removed.
# print(...) with one argument behaves the same on Python 2 and 3
print(wordfreqs_cat1.hapaxes())
print(wordfreqs_cat2.hapaxes())

In [None]:
# Let's look at the 20 most frequent words of each subreddit
print(wordfreqs_cat1.most_common(20))
print(wordfreqs_cat2.most_common(20))

## Step 3 revisited - Data preparation adapted

In [None]:
# Initializes the English Snowball stemmer from the NLTK library
stemmer = nltk.SnowballStemmer("english")
def wordStemmer(wordrow):
    """Return a list with ``stemmer.stem(term)`` for each term of ``wordrow``."""
    stems = []
    for term in wordrow:
        stems.append(stemmer.stem(term))
    return stems

# Manually chosen stop words: punctuation, contraction fragments and
# digits that should be removed/ignored in addition to the NLTK list
manual_stopwords = [',','.',')','(','m',"'m","n't",'e.g',"'ve",'s',
                    '#','/','``',"'s","''",'!','r',']','=','[','&',
                    '%','...','1','2','3','4','5','6','7','8','9',
                    '10','--',';','-',':']

def data_processing(sql, manual_stopwords):
    """Run ``sql`` on the module-level cursor ``c`` and prepare the rows.

    Column 0 (title) and column 1 (topic text) of every row are joined
    into one text, tokenized, lowercased, stripped of the NLTK stop words
    and of ``manual_stopwords``, and stemmed. Terms that occur only once
    over all fetched rows (hapaxes) are then removed.

    Parameters:
        sql: query whose first two columns are title and topic text.
        manual_stopwords: additional terms to remove.

    Returns:
        A dict with two keys:
            'wordMatrix': one list of prepared terms per fetched row.
            'all_words': all prepared terms in a single flat list.
    """
    # Create pointer to SQLite data
    c.execute(sql)
    data = {'wordMatrix': [], 'all_words': []}
    # Temporary word list used to find hapaxes
    interWordList = []
    # Temporary word matrix; becomes the final word matrix after hapaxes removal
    interWordMatrix = []
    # Fetch data row by row
    row = c.fetchone()
    while row is not None:
        # row[0] is title, row[1] is topic text; we turn them into
        # a single text blob
        wordrow = nltk.tokenize.word_tokenize(row[0] + " " + row[1])
        wordrow_lowercased = lowerCaseArray(wordrow)
        wordrow_nostopwords = wordFilter(stopwords, wordrow_lowercased)

        # Remove manually added stopwords from text blob
        wordrow_nostopwords = wordFilter(manual_stopwords, wordrow_nostopwords)
        wordrow_stemmed = wordStemmer(wordrow_nostopwords)

        interWordList.extend(wordrow_stemmed)
        interWordMatrix.append(wordrow_stemmed)

        # Get new topic
        row = c.fetchone()

    # Make frequency distribution of all terms
    wordfreqs = nltk.FreqDist(interWordList)
    # A set keeps the per-term membership test cheap; there can be many hapaxes
    hapaxes = set(wordfreqs.hapaxes())
    # Loop through temporary word matrix
    for wordvector in interWordMatrix:
        # Remove hapaxes in each word vector
        wordvector_nohapaxes = wordFilter(hapaxes, wordvector)
        # Append corrected word vector to final word matrix
        data['wordMatrix'].append(wordvector_nohapaxes)
        # Extend list of all terms with corrected word vector
        data['all_words'].extend(wordvector_nohapaxes)

    return data

# Our subreddits as defined earlier
subreddits = ['datascience', 'gameofthrones']
data = {}

# Run new data processing function for both subreddits.
# The query is built by string concatenation, which is acceptable only
# because `subject` comes from the hard-coded list above
for subject in subreddits:
    data[subject] = data_processing(sql='''SELECT topicTitle, topicText, topicCategory 
                                             FROM topics 
                                            WHERE topicCategory = ''' + "'" + subject + "'",
                                    manual_stopwords=manual_stopwords)
    

### Final data transformation and data splitting before modeling

In [None]:
# Holdout sample will be used to determine the model's flaws by
# constructing a confusion matrix
holdoutLength = 100


def _binary_bag_of_words(wordvector, vocabulary):
    """Map every vocabulary term to True/False: does it occur in wordvector?"""
    present = set(wordvector)
    return {word: (word in present) for word in vocabulary}


# We create a single data set with every word vector tagged as being
# either 'datascience' or 'gameofthrones'. We keep part of the data
# aside for holdout sample.
labeled_data1 = [(wordvector, 'datascience') for wordvector in
                 data['datascience']['wordMatrix'][holdoutLength:]]
labeled_data2 = [(wordvector, 'gameofthrones') for wordvector in
                 data['gameofthrones']['wordMatrix'][holdoutLength:]]
labeled_data = []
labeled_data.extend(labeled_data1)
labeled_data.extend(labeled_data2)

# Holdout sample is comprised of unlabeled data from two subreddits:
# up to 100 observations from each data set. The labels are kept in a
# separate data set; they are sized from the actual slices so both lists
# stay aligned even when a subreddit has fewer documents than holdoutLength.
holdout_datascience = data['datascience']['wordMatrix'][:holdoutLength]
holdout_gameofthrones = data['gameofthrones']['wordMatrix'][:holdoutLength]
holdout_data = holdout_datascience + holdout_gameofthrones
holdout_data_labels = (['datascience'] * len(holdout_datascience) +
                       ['gameofthrones'] * len(holdout_gameofthrones))

# A list of all unique terms is created to build the bag of words
# data we need for training or scoring a model
data['datascience']['all_words_dedup'] = list(OrderedDict.fromkeys(data['datascience']['all_words']))
data['gameofthrones']['all_words_dedup'] = list(OrderedDict.fromkeys(data['gameofthrones']['all_words']))
all_words = []
all_words.extend(data['datascience']['all_words_dedup'])
all_words.extend(data['gameofthrones']['all_words_dedup'])
all_words_dedup = list(OrderedDict.fromkeys(all_words))

# Data is turned into a binary bag of words format. Labeled entries are
# (word vector, label) tuples; holdout entries are bare word vectors, so
# the whole vector (not its first element) is what gets tested.
prepared_data = [(_binary_bag_of_words(wordvector, all_words_dedup), label)
                 for wordvector, label in labeled_data]
prepared_holdout_data = [_binary_bag_of_words(wordvector, all_words_dedup)
                         for wordvector in holdout_data]

# Data for model training and testing to be shuffled first
random.shuffle(prepared_data)
# Size of training data will be 75% of total and remaining 25% will
# be used for testing model performance
train_size = int(len(prepared_data) * 0.75)
train = prepared_data[:train_size]
test = prepared_data[train_size:]


## Step 5 - Data Analysis

### Naive Bayes classifier

In [None]:
# Let's first test the performance of our Naive Bayes classifier. NLTK
# comes with a classifier
classifier = nltk.NaiveBayesClassifier.train(train)
# With the classifier trained we can use the test data to get a measure
# of overall accuracy (shown as the cell's output)
nltk.classify.accuracy(classifier, test)

In [None]:
# Let's test it again on the holdout sample and this
# time create a confusion matrix
classified_data = classifier.classify_many(prepared_holdout_data)
cm = nltk.ConfusionMatrix(holdout_data_labels, classified_data)
print(cm)

In [None]:
# Let's look at what it uses to determine the categories by digging
# into the most informative features. The method prints the table itself
# and returns nothing, so wrapping it in print() would add a stray "None"
classifier.show_most_informative_features(20)

### Decision tree model

In [None]:
# Train decision tree classifier
classifier2 = nltk.DecisionTreeClassifier.train(train)
# Test classifier accuracy; printed explicitly because only the last
# expression of a cell is displayed automatically
print(nltk.classify.accuracy(classifier2, test))
# Attempt to classify holdout data (scoring)
classified_data2 = classifier2.classify_many(prepared_holdout_data)
# Create confusion matrix based on classification results and actual labels
cm = nltk.ConfusionMatrix(holdout_data_labels, classified_data2)
# Show confusion matrix
print(cm)


On these 200 observations of the holdout sample the decision tree model tends to classify well when the post is about Game of Thrones but fails miserably when confronted with the data science posts. It seems the model has a preference for Game of Thrones, and can you blame it? Let’s have a look at the actual model, even though in this case we’ll use the Naïve Bayes as our final model.

In [None]:
# Show the decision tree's pseudocode, requested with depth=4
tree_pseudocode = classifier2.pseudocode(depth=4)
print(tree_pseudocode)
