## Naive Bayes Text Classification ##

The following program applies the Naive Bayes classifier provided by NLTK to input data files to determine the Twitter data set's relevancy to the Boston marathon bombing of 2013.

In [None]:
# Author: Elizabeth Brooks
# Date Modified: 07/08/2015

# PreProcessor Directives
import os
import sys
sys.path.append(os.path.realpath('../'))
import csv
import yaml
import re
from nltk.classify import apply_features
import random
# Directives for twc yaml
# twittercriteria is a project-local module; presumably it is resolved
# through the '../' entry appended to sys.path above -- TODO confirm
import twittercriteria as twc
# NOTE(review): presumably loads the yaml criteria into twc's module state -- confirm in twittercriteria
twc.loadCriteria()
# Keep the keyword matcher; tweetHasAKeyword() calls keyword.search() on it
keyword = twc.getKeywordRegex()
# NOTE(review): presumably discards the loaded criteria once the regex has been taken -- confirm
twc.clearCriteria()

# Global field declarations
current_dir = os.getcwd()
# Set the output file path
resultsPath = current_dir + '/relevantTweetResults.txt'
# Initialize the training and dev data sets
# Each name gets its own list: unpacking one empty list into four names
# raises ValueError, and a chained assignment would alias all four names
# to the same list object
trainSet, devSet, labeledTweets, featureSets = [], [], [], []

# Function to clean up tweet strings
# by manually removing irrelevant data (not words)
def cleanUpTweet(tweet_text):
    """Return tweet_text lower-cased with known Twitter markup removed.

    tweet_text: the raw text of one tweet.

    The markup entries are matched literally, so only the exact sequences
    '&amp;' and 'http://t.co/' are deleted; the rest of a shortened link
    is left in place.
    """
    # Irrelevant character sequences
    twitterMarkup = ['&amp;', 'http://t.co/']
    # Escape every entry before joining: unescaped, the '.' in 't.co'
    # is a regex wildcard and would also delete e.g. 'http://txco/'
    markupPattern = '|'.join(re.escape(markup) for markup in twitterMarkup)
    return re.sub(markupPattern, r"", tweet_text.lower())
# End cleanUpTweet
## The markup and the cleanUpTweet function will eventually 
## be moved to twc

# Function reporting whether a tweet mentions one of the pre-determined key words
def tweetHasAKeyword(tweet_text):
    """Return True when the module-level keyword pattern finds a hit in tweet_text."""
    hit = keyword.search(tweet_text)
    if hit is None:
        return False
    return True
# End tweetHasAKeyword


The following function creates a dictionary of relevant features

In [None]:
# Function to initialize the feature sets
def initDictSet(class1_path, class2_path):
    """Fill the global labeledTweets list with (term, label) pairs.

    class1_path: txt file whose whitespace-separated terms are labelled
        'relevant'.
    class2_path: txt file whose whitespace-separated terms are labelled
        'irrelevant'.

    Both paths are resolved against current_dir; a leading path separator
    is accepted and ignored. After both files are read the combined list
    is shuffled in place.

    Raises IOError/OSError when either file cannot be opened.
    """
    # Two classes, relevant and irrelevant to the marathon
    classFiles = ((class1_path, 'relevant'), (class2_path, 'irrelevant'))
    for classPath, label in classFiles:
        # Drop a leading separator, otherwise os.path.join would treat the
        # name as absolute and discard current_dir
        fullPath = os.path.join(current_dir, classPath.lstrip('/\\'))
        # The with-statement closes the file; no explicit close() is needed
        with open(fullPath, "r") as classFile:
            for line in classFile:
                # Each entry is one (term, label) tuple
                labeledTweets.extend((word, label) for word in line.split())
    # Randomize the data
    random.shuffle(labeledTweets)
# End initDictSet


The extractFeatures(train_file, test_file) function builds the labeled feature sets indicating marathon relevance and passes them to the trainClassifyData() function. There the feature sets are split into a training set and a dev set, and the training set is then used to train the Naive Bayes classifier provided by NLTK.

In [None]:
# Function to extract features from tweets
def extractFeatures(train_file, test_file):
    """Build the labelled feature sets and hand them to trainClassifyData().

    train_file: base name (no '.csv') of the train/dev csv file located in
        the parent directory of current_dir; it must have a 'tweet_text'
        column.
    test_file: base name of the csv file to classify, passed through
        unchanged to trainClassifyData().

    Raises IOError/OSError when the training csv cannot be opened and
    KeyError when a row has no 'tweet_text' column.
    """
    # Determine the feature sets once: they are derived only from the
    # global labeledTweets and do not change from one csv row to the next.
    # Building them before the loop also keeps the name defined when the
    # csv holds no data rows.
    featureSets = [(term, relevance) for (term, relevance) in labeledTweets]
    # Iterate through the Twitter data csv file by tweet text
    with open(current_dir + '/../' + train_file + '.csv') as csvfile:
        for twitterData in csv.DictReader(csvfile):
            # NOTE(review): the cleaned training text is not used as a
            # feature yet; the call is kept so that a row without a
            # 'tweet_text' column still fails here, as before
            cleanUpTweet(twitterData['tweet_text'])
        # End for
    # End with
    # Train the determined feature set
    trainClassifyData(featureSets, test_file)
# End extractFeatures

# Helper turning one sample into the feature dict NLTK classifiers expect
def _toFeatureDict(features):
    """Return features unchanged if it is a dict, else a bag-of-words dict.

    A string is split on whitespace and every term becomes a key mapped
    to True.
    """
    if isinstance(features, dict):
        return features
    return {word: True for word in features.split()}
# End _toFeatureDict

# Function for training the classifier
def trainClassifyData(feature_sets, test_file):
    """Train an NLTK Naive Bayes classifier and label the tweets of test_file.

    feature_sets: list of (features, label) pairs. features may be a
        feature dict or a plain string; strings are converted with
        _toFeatureDict(). The first 500 pairs are held out as the dev set
        and the remaining pairs train the classifier.
    test_file: base name (no '.csv') of the csv file in the parent
        directory of current_dir whose 'tweet_text' column is classified.

    The predicted label of every tweet is written to resultsPath, one
    label per line.

    Raises ValueError when no pairs remain for training, IOError/OSError
    when a file cannot be opened and KeyError when a row has no
    'tweet_text' column.
    """
    # Imported here because the file header only imports apply_features
    # from nltk.classify, which does not bind the name nltk
    import nltk

    # Establish the training set: everything after the first 500 pairs,
    # which are reserved as the dev set
    devSetSize = 500
    trainSet = [(_toFeatureDict(features), label)
                for (features, label) in feature_sets[devSetSize:]]
    if not trainSet:
        raise ValueError("need more than %d labeled feature sets to train, got %d"
                         % (devSetSize, len(feature_sets)))

    # Train the Naive Bayes (NB) classifier
    classifierNB = nltk.NaiveBayesClassifier.train(trainSet)

    # Classify input test data
    # The results file is opened first, as before, and both files are
    # closed by the with-statements even when classification fails
    with open(resultsPath, "w") as tweetResultsFile:
        # Iterate through the Twitter data csv file by tweet text
        with open(current_dir + '/../' + test_file + '.csv') as csvfile:
            for twitterData in csv.DictReader(csvfile):
                # Send the tweet text to the function for removing unnecessary characters
                tweetText = cleanUpTweet(twitterData['tweet_text'])
                # One label per line keeps the results separable
                predictedLabel = classifierNB.classify(_toFeatureDict(tweetText))
                tweetResultsFile.write(predictedLabel + '\n')
            # End for
        # End with
    # End with
# End trainClassifyData

The main method requests user input of class feature sets for Naive Bayes classification of tweets, as well as training and test data sets of csv Twitter data.

In [None]:
# The main method
def main():
    """Run the pipeline: load the labelled terms, then train and classify.

    The two class files have fixed names; the base names (no '.csv') of
    the train/dev csv file and of the csv file to classify are read from
    standard input.
    """
    # Text class files, written relative to current_dir with a leading '/'
    # in the same way resultsPath is built
    inputClassFile1 = '/relevantTraining.txt'
    inputClassFile2 = '/irrelevantTraining.txt'

    # Initialize the classifier dictionary based on relevant features
    initDictSet(inputClassFile1, inputClassFile2)

    # raw_input only exists on Python 2; Python 3 renamed it to input.
    # raw_input is tried first because Python 2's input() evaluates what
    # the user types.
    try:
        readLine = raw_input
    except NameError:
        readLine = input

    # Request user input of the file name of train/dev data to be processed
    # Surrounding whitespace is stripped so it cannot end up in the file path
    inputTrainFile = readLine("Enter train/dev data set csv file name...\nEx: cleaned_geo_tweets_Apr_12_to_22").strip()
    # Request file name of data to be classified
    inputTestFile = readLine("Enter test data set csv file name...\nEx: cleaned_geo_tweets_Apr_12_to_22").strip()

    # Train the NB classifier using input tweet terms
    extractFeatures(inputTrainFile, inputTestFile)
# End main

# Run the script via the main method
if __name__ == "__main__":
    main()

# End script