In [14]:
import re
import os
import sys
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier
from nltk.metrics import accuracy, precision, recall, f_measure
from nltk import pos_tag
from afinn import Afinn

### Step 1: AFINN Sentiment Lexicon

In [26]:
# Helpers plus the entry point that reads the tweet training file, then trains
# and tests a Naive Bayes classifier on AFINN lexicon features.

# Contraction rewrites, applied in order: specific forms first, then general suffixes.
_CONTRACTION_RULES = [
  (r"won\'t", "will not"),
  (r"can\'t", "can not"),
  (r"n\'t", " not"),
  (r"\'re", " are"),
  (r"\'s", " is"),
  (r"\'d", " would"),
  (r"\'ll", " will"),
  (r"\'t", " not"),
  (r"\'ve", " have"),
  (r"\'m", " am"),
]

# Raw sentiment strings from the file (quotes included) condensed to 3 labels.
_LABEL_MAP = {
  '"positive"': 'pos',
  '"negative"': 'neg',
  '"neutral"': 'neu',
  '"objective"': 'neu',
  '"objective-OR-neutral"': 'neu',
}


def _decontracted(phrase):
  """Expand common English contractions in `phrase` (e.g. "won't" -> "will not")."""
  for pattern, replacement in _CONTRACTION_RULES:
    phrase = re.sub(pattern, replacement, phrase)
  return phrase


def _sentiment_label(raw_label):
  """Map a raw sentiment field to 'pos', 'neg' or 'neu'; unknown values map to ''."""
  return _LABEL_MAP.get(raw_label, '')


def _clean_tweet_tokens(text, tokenizer, lemmatizer, stop_words):
  """Tokenize one tweet text and return its normalized list of tokens."""
  tokens = tokenizer.tokenize(text)

  # Handling emoticons (raw strings avoid invalid-escape warnings)
  tokens = [re.sub(r':\)', 'happy', token) for token in tokens]
  tokens = [re.sub(r':\(', 'sad', token) for token in tokens]

  # Removing URLs and mentions (leaves empty strings behind, dropped below)
  tokens = [re.sub(r"http\S+", '', token) for token in tokens]
  tokens = [re.sub(r"@\S+", "", token) for token in tokens]

  # Convert to lowercase
  tokens = [token.lower() for token in tokens]

  # Remove punctuation. This is a substring test against string.punctuation,
  # so it also drops the empty strings produced by the URL/mention removal.
  tokens = [token for token in tokens if token not in string.punctuation]

  # Remove words with numbers (eg 11th), then discard anything left empty
  tokens = [re.sub(r'\S*\d\S*', '', token).strip() for token in tokens]
  tokens = [token for token in tokens if token]

  # NOTE(review): expanded contractions stay as one multi-word token
  # (e.g. "could not"), so stop-word removal and lemmatization see the phrase,
  # not its words. Kept as-is to leave the feature set unchanged.
  tokens = [_decontracted(token) for token in tokens]

  # Remove stopwords
  tokens = [token for token in tokens if token not in stop_words]

  # Lemmatization
  return [lemmatizer.lemmatize(token) for token in tokens]


def processtweets(dirPath, limitStr, trainSize=8000):
  """Read the tweet file, build AFINN features, then train and test a classifier.

  Args:
    dirPath: directory holding 'downloaded-tweeti-b-dist.tsv' and
      'stopwords_twitter.txt'. The process working directory is changed to it.
    limitStr: maximum number of tweet rows to use; converted with int().
    trainSize: number of leading feature sets used for training; the rest
      form the test set. Defaults to 8000.

  Returns:
    The list of (tokens, label) documents built from the tweets whose text
    is not 'Not Available'.

  Side effects:
    Calls os.chdir(dirPath), writes 'tweetdocs1.csv' there, and prints
    accuracy, precision/recall/F1 for the 'pos' label, and the 25 most
    informative features.
  """
  # convert the limit argument from a string to an int
  limit = int(limitStr)
  # initialize NLTK built-in tweet tokenizer
  twtokenizer = TweetTokenizer()

  # NOTE: this changes the working directory for the whole process; kept
  # because later relative paths (and possibly other cells) rely on it.
  os.chdir(dirPath)

  # use the first `limit` usable lines of the file,
  #    assuming that the tweets are sufficiently randomized
  tweetdata = []
  with open('./downloaded-tweeti-b-dist.tsv', 'r') as f:
    for line in f:
      if len(tweetdata) >= limit:
        break  # stop reading once enough rows are collected
      # each line has 4 items separated by tabs
      # ignore the tweet and user ids, and keep the sentiment and tweet text
      fields = line.strip().split('\t')[2:4]
      # skip blank or truncated lines that lack a sentiment/text pair
      if len(fields) == 2:
        tweetdata.append(fields)

  # Loop-invariant resources: load the stop words and build the lemmatizer once
  with open('stopwords_twitter.txt', 'r') as file:
    stop_words = set(file.read().splitlines())
  lemmatizer = WordNetLemmatizer()

  # create list of tweet documents as (list of words, label)
  # where the labels are condensed to just 3:  'pos', 'neg', 'neu'
  # add all the tweets except the ones whose text is Not Available
  tweetdocs = []
  for sentiment, text in tweetdata:
    if text != 'Not Available':
      tokens = _clean_tweet_tokens(text, twtokenizer, lemmatizer, stop_words)
      tweetdocs.append((tokens, _sentiment_label(sentiment)))

  # Saving the cleaned data for EDA (one time procedure)
  df = pd.DataFrame(tweetdocs, columns=['tokens','labels'])
  df.to_csv('tweetdocs1.csv',index=False)

  # Feature sets: one {token: AFINN score} dictionary per tweet
  afinn = Afinn()
  feature_sets = [
    ({token: afinn.score(token) for token in tokens}, label)
    for tokens, label in tweetdocs
  ]

  # train and test a classifier
  train_set = feature_sets[:trainSize]
  test_set = feature_sets[trainSize:]
  if not train_set or not test_set:
    # An empty split would yield meaningless metrics, so report and stop here.
    print('Not enough documents (%d) for a train/test split at %d; '
          'skipping training and evaluation.' % (len(feature_sets), trainSize))
    return tweetdocs

  nb_classifier = NaiveBayesClassifier.train(train_set)
  print('Accuracy: ',nltk.classify.accuracy(nb_classifier,test_set))

  # Precision, Recall, and F1-score for the 'pos' label:
  # collect test-item indices by gold label and by predicted label
  refsets = collections.defaultdict(set)
  testsets = collections.defaultdict(set)

  for i, (features, label) in enumerate(test_set):
    refsets[label].add(i)
    observed = nb_classifier.classify(features)
    testsets[observed].add(i)

  prec = precision(refsets['pos'], testsets['pos'])
  rec = recall(refsets['pos'], testsets['pos'])
  f1 = f_measure(refsets['pos'], testsets['pos'])

  print("Precision:", prec)
  print("Recall:", rec)
  print("F1 Score:", f1)

  # show most informative features
  print(nb_classifier.most_informative_features(25))

  return tweetdocs

In [27]:
# Run the pipeline on at most the first 10000 rows of the tweet file in the
# current directory; the function prints the evaluation metrics shown below.
tweetdocs = processtweets('.',10000)

Accuracy:  0.5721153846153846
Precision: 0.782608695652174
Recall: 0.6206896551724138
F1 Score: 0.6923076923076923
[('fuck', -4.0), ('excited', 3.0), ('sorry', -1.0), ('amazing', 4.0), ('luck', 3.0), ('fun', 4.0), ('injury', -2.0), ('great', 3.0), ('thank', 2.0), ('sad', -2.0), ('awesome', 4.0), ('thanks', 2.0), ('missing', -2.0), ('anymore', 0.0), ('happy', 3.0), ('cant', 0.0), ('cry', -1.0), ('fucking', -4.0), ('bitch', -5.0), ('could not', 0.0), ('brilliant', 4.0), ('exciting', 3.0), ('suck', -3.0), ('interesting', 2.0), ('hate', -3.0)]


In [23]:
#!pip install afinn