In [1]:
import re
import os
import sys
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import accuracy, precision, recall, f_measure
from nltk.corpus import subjectivity

### Step -1: Subjectivity Lexicons Features

In [18]:
# function to read tweet training file, train and test a classifier 
def processtweets(dirPath,limitStr):
  # convert the limit argument from a string to an int
  limit = int(limitStr)
  # initialize NLTK built-in tweet tokenizer
  twtokenizer = TweetTokenizer()
  
  os.chdir(dirPath)
  
  f = open('./downloaded-tweeti-b-dist.tsv', 'r')
  # loop over lines in the file and use the first limit of them
  #    assuming that the tweets are sufficiently randomized
  tweetdata = []
  for line in f:
    if (len(tweetdata) < limit):
      # remove final end of line character
      line = line.strip()
      # each line has 4 items separated by tabs
      # ignore the tweet and user ids, and keep the sentiment and tweet text
      tweetdata.append(line.split('\t')[2:4])
  
  #for tweet in tweetdata[:10]:
    #print (tweet)
  
  # create list of tweet documents as (list of words, label)
  # where the labels are condensed to just 3:  'pos', 'neg', 'neu'
  tweetdocs = []
  # add all the tweets except the ones whose text is Not Available
  for tweet in tweetdata:
    if (tweet[1] != 'Not Available'):
        
      # run the tweet tokenizer on the text string - returns unicode tokens, so convert to utf8
      tokens = twtokenizer.tokenize(tweet[1])
      
      #Handling Emoticons
      tokens = [re.sub(':\)','happy', token) for token in tokens]
      tokens = [re.sub(':\(','sad', token) for token in tokens]
        
      #Removing URLS
      tokens = [re.sub(r"http\S+",'',token) for token in tokens]
      
      #Removing mentions
      tokens = [re.sub(r"@\S+", "", token) for token in tokens]
    
      # Convert to lowercase
      tokens = [token.lower() for token in tokens]
      
      # Remove punctuation
      tokens = [token for token in tokens if token not in string.punctuation]
      
      #Remove words with numbers (eg 11th)
      tokens = [re.sub(r'\S*\d\S*', '', token).strip() for token in tokens]
      tokens = [token for token in tokens if token]
    
      def decontracted(phrase):
    
         # specific
        phrase = re.sub(r"won\'t", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)

        # general
        phrase = re.sub(r"n\'t", " not", phrase)
        phrase = re.sub(r"\'re", " are", phrase)
        phrase = re.sub(r"\'s", " is", phrase)
        phrase = re.sub(r"\'d", " would", phrase)
        phrase = re.sub(r"\'ll", " will", phrase)
        phrase = re.sub(r"\'t", " not", phrase)
        phrase = re.sub(r"\'ve", " have", phrase)
        phrase = re.sub(r"\'m", " am", phrase)
        return phrase
      
      tokens = [decontracted(token) for token in tokens]
    
      # Remove stopwords
      with open('stopwords_twitter.txt', 'r') as file:
        stop_words = set(file.read().splitlines())
      tokens = [token for token in tokens if token not in stop_words]
      
      # Lemmatization
      lemmatizer = WordNetLemmatizer()
      tokens = [lemmatizer.lemmatize(token) for token in tokens]
      
      
        
      if tweet[0] == '"positive"':
        label = 'pos'
      else:
        if tweet[0] == '"negative"':
          label = 'neg'
        else:
          if (tweet[0] == '"neutral"') or (tweet[0] == '"objective"') or (tweet[0] == '"objective-OR-neutral"'):
            label = 'neu'
          else:
            label = ''
      tweetdocs.append((tokens, label))
  
  # print a few
 # for tweet in tweetdocs[:10]:
    #print (tweet)
  #Saving the cleaned data for EDA (one time procedure)
  df = pd.DataFrame(tweetdocs, columns=['tokens','labels'])
  df.to_csv('tweetdocs1.csv',index=False)
    
    
#----------------------------------------------------------------------------------------------------------------------#  
  # continue as usual to get all words and create word features
    

  # feature sets from a feature definition function

  # Function to load subjectivity lexicon from file
  def load_subjectivity_lexicon(file_path):
    lexicon = {}
    
    with open(file_path, 'r') as file:
        for line in file:
            if line.startswith('type=weaksubj') or line.startswith('type=strongsubj'):
                fields = line.strip().split()
                word = fields[2].split('=')[1]
                polarity = fields[0].split('=')[1]
                
                lexicon[word] = polarity
    
    return lexicon

  # Load subjectivity lexicon from file
  lexicon_file = './subjclueslen1-HLTEMNLP05.tff'
  subjectivity_lexicon = load_subjectivity_lexicon(lexicon_file)

  # Function to generate subjectivity lexicon features
  def subjectivity_lexicon_features(tokens):
    # Create a dictionary to store the subjectivity labels
    subjectivity_labels = {}
    
    # Iterate over the tokens
    for token in tokens:
        # Check if the token is present in the subjectivity lexicon
        if token in subjectivity_lexicon:
            # Assign the polarity label from the lexicon
            polarity = subjectivity_lexicon[token]
            
            # Assign a subjectivity label based on polarity
            subjectivity_labels[token] = True if polarity == 'positive' else False
        else:
            # Assign a subjectivity label of False for tokens not in the lexicon
            subjectivity_labels[token] = False
    
    return subjectivity_labels

  # List to store feature sets
  feature_sets = []

  # Iterate over each entry
  for entry in tweetdocs:
    tokens = entry[0]  # Extract tokens from the entry
    label = entry[1]  # Extract label from the entry
    
    # Generate subjectivity lexicon features for the tokens
    features = subjectivity_lexicon_features(tokens)
    
    # Create a tuple of feature dictionary and label
    feature_set = (features, label)
    
    # Add the feature set to the list
    feature_sets.append(feature_set)


  print('length of feature set: ',len(feature_sets))
#-----------------------------------------------------------------------------------------------------------------------#
    # train and test a classifier
  train_set = feature_sets[:7000]
  test_set = feature_sets[7000:]
    
  nb_classifier = NaiveBayesClassifier.train(train_set)
  print('Accuracy: ',nltk.classify.accuracy(nb_classifier,test_set))
  
  # Precision, Recall, and F1-score
  refsets = collections.defaultdict(set)
  testsets = collections.defaultdict(set)

  for i, (features, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = nb_classifier.classify(features)
    testsets[observed].add(i)

  prec = precision(refsets['pos'], testsets['pos'])
  rec = recall(refsets['pos'], testsets['pos'])
  f1 = f_measure(refsets['pos'], testsets['pos'])

  print("Precision:", prec)
  print("Recall:", rec)
  print("F1 Score:", f1)
    
  # show most informative features'''
  print(nb_classifier.most_informative_features(25))

In [19]:
tweetdocs = processtweets('.',10000)

length of feature set:  8208
Accuracy:  0.5587748344370861
Precision: 0.682
Recall: 0.6361940298507462
F1 Score: 0.6583011583011583
[('fuck', False), ('excited', False), ('sorry', False), ('amazing', False), ('fun', False), ('luck', False), ('sad', False), ('great', False), ('injury', False), ('awesome', False), ('thank', False), ('fucking', False), ('cancelled', False), ('thanks', False), ('anymore', False), ('bitch', False), ('cant', False), ('happy', False), ('cry', False), ('suck', False), ('could not', False), ('matter', False), ('exciting', False), ('alone', False), ('missing', False)]
