### Importing libraries and prerequisite tools

In [1]:
import nltk
nltk.download('twitter_samples')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
from nltk.corpus import twitter_samples, stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from nltk import FreqDist, classify, NaiveBayesClassifier

[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
import re,string,random

## Preprocessing functions


*Remove noise*

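Noise is any part of the text that doesn't add meaning or information to the data.
In the Twitter dataset, we will search for and remove the following items (hyperlinks and handles with regular expressions, punctuation with a membership test against `string.punctuation`):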
1. Hyperlinks
2. Twitter handles
3. Punctuation and special characters

In [3]:
# Patterns are compiled once. Raw strings keep regex escapes such as `\(` from
# being parsed as (invalid) Python string escapes.
_URL_PATTERN=re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_HANDLE_PATTERN=re.compile(r"(@[A-Za-z0-9_]+)")

def remove_noise(tweet_tokens,stop_words=()):
  """Clean and normalise the tokens of a single tweet.

  For every token: strip hyperlinks and @handles, lemmatize it using a
  WordNet part of speech derived from its POS tag, and keep the lower-cased
  result unless it is empty, punctuation, or a stop word.

  Args:
    tweet_tokens: sequence of string tokens for one tweet.
    stop_words: iterable of lower-case words to drop (default: none).

  Returns:
    A list of cleaned, lower-cased tokens in their original order.
  """
  # Loop-invariant setup: one lemmatizer and one hashed stop-word lookup per
  # call instead of one per token.
  lemmatizer=WordNetLemmatizer()
  stop_word_set=frozenset(stop_words)
  tokens=[]
  for token,tag in pos_tag(tweet_tokens):
    token=_URL_PATTERN.sub('',token)
    token=_HANDLE_PATTERN.sub('',token)
    # Map the tagger's tag prefix onto the POS codes the lemmatizer is given:
    # nouns -> 'n', verbs -> 'v', everything else is treated as an adjective.
    if tag.startswith("NN"):
      pos='n'
    elif tag.startswith("VB"):
      pos='v'
    else:
      pos='a'
    token=lemmatizer.lemmatize(token,pos)
    # `in string.punctuation` is a substring test: it drops tokens that occur
    # contiguously inside the punctuation string (e.g. "!" or "()"), while
    # tokens such as ":)" or "..." are kept.
    if len(token)>0 and token not in string.punctuation and token.lower() not in stop_word_set:
      tokens.append(token.lower())
  return tokens


Determining word density

In [4]:
def get_all_words(tokens_list):
  """Lazily yield every token from a list of token lists, in order."""
  for tweet in tokens_list:
    yield from tweet

Converting Tokens to a dictionary 

In [5]:
def get_tweets_for_model(tokens_list):
  """Lazily yield one feature dict per tweet, mapping each token to True."""
  for tweet_tokens in tokens_list:
    features={token:True for token in tweet_tokens}
    yield features

### Data Preparation

In [6]:
# Raw tweet strings from the NLTK twitter_samples corpus.
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

# Tokens of the first positive tweet (not referenced by the later cells shown here).
tweet_tokens = twitter_samples.tokenized('positive_tweets.json')[0]

In [7]:
# English stop words, passed to remove_noise when cleaning the corpus.
stop_words = stopwords.words('english')

In [8]:
# Pre-tokenized versions of the positive and negative tweet files.
positive_tweets_tokens = twitter_samples.tokenized('positive_tweets.json')
negative_tweets_tokens = twitter_samples.tokenized('negative_tweets.json')

In [9]:
# Containers for the cleaned token lists of each class.
positive_cleaned_tokens_list, negative_cleaned_tokens_list = [], []

In [10]:
# Build both cleaned lists from scratch rather than appending to existing
# ones, so re-running this cell does not duplicate every tweet.
positive_cleaned_tokens_list=[remove_noise(tokens,stop_words) for tokens in positive_tweets_tokens]
negative_cleaned_tokens_list=[remove_noise(tokens,stop_words) for tokens in negative_tweets_tokens]

In [11]:
# Count token frequencies across all cleaned positive tweets and show the
# ten most common ones.
all_pos_words = get_all_words(positive_cleaned_tokens_list)
freq_dist_pos = FreqDist(all_pos_words)
print(freq_dist_pos.most_common(10))

[(':)', 3691), (':-)', 701), (':d', 658), ('thanks', 388), ('follow', 357), ('love', 333), ('...', 290), ('good', 283), ('get', 263), ('thank', 253)]


In [12]:
# Feature-dict generators for each class. get_tweets_for_model yields lazily,
# so each of these can be iterated only once.
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

In [13]:
# Create the feature dicts here instead of consuming generators built in
# another cell: those are exhausted after one pass, so re-running this cell
# alone would otherwise produce two empty datasets.
positive_dataset=[(tweet_dict,"Positive") for tweet_dict in get_tweets_for_model(positive_cleaned_tokens_list)]
negative_dataset=[(tweet_dict,"Negative") for tweet_dict in get_tweets_for_model(negative_cleaned_tokens_list)]

In [14]:
# Single labelled dataset: all positive examples followed by all negative ones.
dataset = [*positive_dataset, *negative_dataset]

In [15]:
# Shuffle with a dedicated fixed-seed generator so the train/test split (and
# therefore the reported accuracy) is the same on every Restart & Run All,
# without touching the global random state.
random.Random(42).shuffle(dataset)

### Split dataset into training and test set

In [16]:
# First 7000 shuffled examples for training, the remainder for testing.
train_data, test_data = dataset[:7000], dataset[7000:]

### Training model

In [17]:
# Fit a Naive Bayes classifier on the (feature dict, label) training pairs.
classifier = NaiveBayesClassifier.train(train_data)

In [18]:
# Accuracy on the held-out test set, scaled by 100 and shown with two decimals.
print("Accuracy is: {:.2f}".format(100 * classify.accuracy(classifier, test_data)))

Accuracy is: 99.37


In [19]:
# show_most_informative_features prints the table itself and returns None;
# wrapping it in print() only adds a stray "None" line to the output.
classifier.show_most_informative_features(10)

Most Informative Features
                      :) = True           Positi : Negati =   1649.7 : 1.0
                follower = True           Positi : Negati =     37.7 : 1.0
                  arrive = True           Positi : Negati =     31.7 : 1.0
                     sad = True           Negati : Positi =     23.3 : 1.0
                     bam = True           Positi : Negati =     21.7 : 1.0
                    glad = True           Positi : Negati =     21.7 : 1.0
                     x15 = True           Negati : Positi =     16.3 : 1.0
                 welcome = True           Positi : Negati =     16.1 : 1.0
              appreciate = True           Positi : Negati =     15.7 : 1.0
                    damn = True           Negati : Positi =     15.7 : 1.0
None


### Validating on custom tweet

In [20]:
custom_tweet="I ordered from Zomato, they screwed up, never used the app again."
# Clean the custom tweet the same way as the training tweets (including
# stop-word removal) so its features match what the classifier was trained on.
# NOTE(review): the corpus was tokenized via twitter_samples.tokenized while
# this uses word_tokenize; the two may split emoticons differently - confirm.
custom_tokens=remove_noise(word_tokenize(custom_tweet),stop_words)
print(custom_tweet,classifier.classify({token:True for token in custom_tokens}))

I ordered from Zomato, they screwed up, never used the app again. Negative
