## Naive Bayes Text Classification ##

The following program applies the Naive Bayes classifier provided by NLTK to input data files.

In [None]:
# Author: Elizabeth Brooks
# Date Modified: 07/06/2015

# PreProcessor Directives
import csv
import os
import random
import re
import sys

# Extend the module search path before any project-local import below
sys.path.append(os.path.realpath('../'))

import nltk
import yaml
from nltk.classify import apply_features
# Directives for twc yaml
# twittercriteria is a project-local module (found through the sys.path entry
# appended above); it is the source of the keyword regex used for labeling.
import twittercriteria as twc
twc.loadCriteria()
# NOTE(review): the criteria are cleared before getKeywordRegex() is called
# below -- confirm the regex is still available after clearCriteria().
twc.clearCriteria()

# Global field declarations
# Pattern object whose .search() decides tweet relevance (see containsKeyword)
keyword = twc.getKeywordRegex()
# Working directory at start-up; every input/output path is built from it
current_dir = os.getcwd()
# Set the output file path
# Text files that receive the lines matching / not matching the keyword regex
relevantPath = current_dir + '/RelevantTweets.txt'
irrelevantPath = current_dir + '/IrrelevantTweets.txt'
# Initialize the training and dev data sets as two independent empty lists
# (unpacking a single empty list into two names raises ValueError)
trainSet, devSet = [], []

# Function to clean up tweet strings
# by manually removing irrelevant data (not words)
def cleanUpTweet(tweet_text):
    """Return tweet_text lower-cased with Twitter markup fragments removed.

    Only the literal fragments listed in twitterMarkup are deleted; whatever
    follows a 'http://t.co/' prefix is left in place.

    Args:
        tweet_text: the raw tweet string.

    Returns:
        The lower-cased string without the markup fragments.
    """
    # Irrelevant character sequences, matched literally
    twitterMarkup = ['&amp;', 'http://t.co/']
    temp = tweet_text.lower()
    # Escape each fragment so regex metacharacters (the '.' in 't.co') only
    # match themselves instead of any character
    pattern = '|'.join(re.escape(fragment) for fragment in twitterMarkup)
    return re.sub(pattern, "", temp)
# End cleanUpTweet

# Function to test a tweet against the pre-determined key words
def containsKeyword(tweet_text):
    """Report whether the module-level keyword regex matches anywhere in tweet_text."""
    match = keyword.search(tweet_text)
    if match is None:
        return False
    return True
# End containsKeyword

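The cleanUpTweet function above lower-cases the tweet strings contained in the Twitter data csv files and removes irrelevant characters from them; containsKeyword reports whether a tweet matches the pre-determined keywords.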

In [None]:
# Function for organizing data for labeling
def createRelevanceDict(txt_data):
    """Sort the lines of a tweet file into the relevant/irrelevant text files.

    Every line for which containsKeyword() is true is written to relevantPath,
    every other line to irrelevantPath; initDictSet() is called afterwards.

    Args:
        txt_data: either the complete path of an existing file, or a base
            name to which ".csv" is appended (the historical behaviour).

    Raises:
        IOError: if the source file cannot be opened. The source is opened
            before the output files so a bad name does not truncate them.
    """
    # Callers may pass a full path that already carries its extension; only
    # fall back to appending ".csv" when no such file exists.
    if os.path.isfile(txt_data):
        sourcePath = txt_data
    else:
        sourcePath = txt_data + ".csv"
    # Write the tweets sorted to their respective files
    with open(sourcePath) as fileData:
        with open(relevantPath, "w") as relevantTxtFile:
            with open(irrelevantPath, "w") as irrelevantTxtFile:
                for line in fileData:
                    # Lines read from a file keep their line ending; normalize
                    # to exactly one newline instead of appending a second one
                    record = line.rstrip("\r\n") + "\n"
                    # Determine the label to assign a tweet based on keyword
                    if containsKeyword(line):
                        relevantTxtFile.write(record)
                    else:
                        irrelevantTxtFile.write(record)
    # Initialize dictionary classes
    initDictSet()
# End createRelevanceDict

# Function to initialize the feature sets
def initDictSet():
    """Build the shuffled list of labeled words from the relevance text files.

    Each whitespace-separated word found in relevantPath is paired with the
    label 'relevant' and each word found in irrelevantPath with 'irrelevant'.
    The combined list is shuffled in place, stored in the module-level name
    labeledTweets and also returned.

    Returns:
        A list of (word, label) tuples in random order.

    Raises:
        IOError: if either relevance file cannot be opened for reading.
    """
    global labeledTweets
    # Open for reading: mode "w" would truncate the files that were just
    # written and make read() fail
    with open(relevantPath, "r") as relevantTxtFile:
        relevantWords = relevantTxtFile.read().split()
    with open(irrelevantPath, "r") as irrelevantTxtFile:
        irrelevantWords = irrelevantTxtFile.read().split()
    # Assign labels
    # Two classes, relevant and irrelevant to the marathon
    labeledTweets = ([(word, 'relevant') for word in relevantWords] +
        [(word, 'irrelevant') for word in irrelevantWords])
    # Randomize the data
    random.shuffle(labeledTweets)
    return labeledTweets
# End initDictSet


The above functions first organize the data into text files by relevance according to the pre-determined list of keywords; a second function then loads these tweets word by word into a labeled, shuffled list to be used to build classes and identify tweets for classification.

In [None]:
# Function to turn a cleaned tweet into a classifier feature set
def _tweetFeatures(tweet_text):
    """Return a bag-of-words feature dict for an already cleaned tweet.

    Shared by training and classification so both use the same feature names.
    """
    return dict(('contains(%s)' % word, True) for word in tweet_text.split())
# End _tweetFeatures

# Function to extract features from tweets
def extractFeatures(train_file, test_file):
    """Build labeled feature sets from the training csv and train the classifier.

    Every row's 'tweet_text' column is cleaned with cleanUpTweet(), labeled
    'relevant' or 'irrelevant' with containsKeyword(), and converted to a
    feature dict. The shuffled feature sets are handed to trainClassifyData().

    NOTE(review): labels come from the keyword regex applied to each training
    tweet -- confirm this is the intended labeling source.

    Args:
        train_file: base name (no extension) of the train/dev csv located in
            the parent of the working directory.
        test_file: base name (no extension) of the csv to classify; passed
            through to trainClassifyData().

    Raises:
        IOError: if the training csv cannot be opened.
        KeyError: if a row has no 'tweet_text' column.
    """
    featureSets = []
    # Iterate through the Twitter data csv file by tweet text
    with open(current_dir + '/../' + train_file + '.csv') as csvfile:
        for twitterData in csv.DictReader(csvfile):
            # Send the tweet text to the function for removing unnecessary characters
            tweetText = cleanUpTweet(twitterData['tweet_text'])
            if containsKeyword(tweetText):
                relevance = 'relevant'
            else:
                relevance = 'irrelevant'
            featureSets.append((_tweetFeatures(tweetText), relevance))
    # Randomize the data so the train/dev split does not follow file order
    random.shuffle(featureSets)
    # Train the determined feature set
    trainClassifyData(featureSets, test_file)
# End extractFeatures

# Function for training the classifier
def trainClassifyData(feature_sets, test_file, dev_size=500):
    """Train a Naive Bayes classifier and classify the tweets of a csv file.

    The first dev_size feature sets are held out as the dev set and the rest
    are used for training. One predicted label per tweet is written, one per
    line, to 'ClassifierResults_NB.txt' in the working directory.

    Args:
        feature_sets: list of (feature dict, label) pairs.
        test_file: base name (no extension) of the csv to classify, located
            in the parent of the working directory.
        dev_size: number of leading feature sets to hold out (default 500).

    Returns:
        The trained classifier.

    Raises:
        ValueError: if feature_sets is empty.
        IOError: if the test csv cannot be opened.
    """
    # Publish the split through the module-level names instead of shadowing them
    global trainSet, devSet
    if not feature_sets:
        raise ValueError("cannot train the classifier without any feature sets")
    # With too few samples a fixed hold-out would leave nothing to train on;
    # hold out half of the data in that case
    if len(feature_sets) > dev_size:
        devCount = dev_size
    else:
        devCount = len(feature_sets) // 2
    # Establish the training and dev data sets
    trainSet, devSet = feature_sets[devCount:], feature_sets[:devCount]

    # Train the Naive Bayes (NB) classifier
    classifierNB = nltk.NaiveBayesClassifier.train(trainSet)

    # Classify input test data
    # Set the results file path
    resultsFilePath = current_dir + '/ClassifierResults_NB.txt'
    # Open the input first so a bad file name does not truncate old results
    with open(current_dir + '/../' + test_file + '.csv') as csvfile:
        with open(resultsFilePath, "w") as tweetResultsFile:
            for twitterData in csv.DictReader(csvfile):
                # Send the tweet text to the function for removing unnecessary characters
                tweetText = cleanUpTweet(twitterData['tweet_text'])
                # The classifier expects a feature dict, not a raw string
                label = classifierNB.classify(_tweetFeatures(tweetText))
                # Send the results of the classifier to a txt file, one per line
                tweetResultsFile.write(label + "\n")
    return classifierNB
# End trainClassifyData

The extractFeatures(train_file, test_file) function builds the feature sets that indicate marathon relevance and passes them to the trainClassifyData() function. There the feature sets are split into a training set and a dev set, and the training set is then used to train the Naive Bayes classifier provided by NLTK, which classifies the tweets of the test file.

In [None]:
# The main method
def main():
    """Run the interactive pipeline: clean, sort by relevance, train, classify.

    Prompts for a csv base name, writes its cleaned 'tweet_text' values to
    'OutputTweets.txt' in the working directory, hands that file to
    createRelevanceDict(), then prompts for the train/dev and test csv base
    names and calls extractFeatures() with them.

    Raises:
        IOError: if an input csv cannot be opened.
        KeyError: if a row has no 'tweet_text' column.
    """
    # raw_input only exists on Python 2; fall back to input on Python 3
    try:
        prompt = raw_input
    except NameError:
        prompt = input

    # Set the output file path
    txtFilePath = current_dir + '/OutputTweets.txt'

    # Request user input of the file name of test data to be processed
    inputFile = prompt("Enter csv file name...\nEx: cleaned_geo_tweets_Apr_12_to_22")

    # Open the input first so a bad file name does not truncate the output;
    # the with blocks close both files even when a row fails
    with open(current_dir + '/../' + inputFile + '.csv') as csvfile:
        with open(txtFilePath, "w") as tweetTxtFile:
            # Retrieve the strings of tweets
            for twitterData in csv.DictReader(csvfile):
                # Lower-case the tweet and remove unnecessary characters
                tweetText = cleanUpTweet(twitterData['tweet_text'])
                # Write the selected Twitter data, tweets, to the txt file
                tweetTxtFile.write(tweetText + "\n")
    # Organize the data by relevance according to keyword dictionary
    # NOTE(review): this passes the complete '.txt' path -- confirm that
    # createRelevanceDict opens it as given rather than adding an extension.
    createRelevanceDict(txtFilePath)

    # Request user input of the file name of train/dev data to be processed
    inputTrainFile = prompt("Enter train/dev data set csv file name...\nEx: cleaned_geo_tweets_Apr_12_to_22")
    # Request file name of data to be classified
    inputTestFile = prompt("Enter test data set csv file name...\nEx: cleaned_geo_tweets_Apr_12_to_22")

    # Train the NB classifier using input tweet terms
    extractFeatures(inputTrainFile, inputTestFile)
# End main

# Run the script via the main method
if __name__ == "__main__":
    main()
    
# End script