Libraries

In [1]:
import pickle
import random

import nltk
from nltk import NaiveBayesClassifier
from nltk import classify
from nltk.corpus import twitter_samples,stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

# Fetch every NLTK resource the notebook relies on so that a fresh
# environment survives "Restart Kernel -> Run All":
#   twitter_samples            -> twitter_samples.tokenized(...)
#   averaged_perceptron_tagger -> pos_tag(...)
#   stopwords                  -> stopwords.words('english')
#   wordnet                    -> WordNetLemmatizer().lemmatize(...)
for resource in ('twitter_samples', 'averaged_perceptron_tagger', 'stopwords', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
def clean_data(token):
    """Drop mentions and links from a tokenized tweet.

    Args:
        token: iterable of string tokens.

    Returns:
        A new list with, in their original order, the tokens that start
        with neither '@' nor 'http'.
    """
    kept = []
    for word in token:
        if word.startswith('@') or word.startswith('http'):
            continue
        kept.append(word)
    return kept

In [3]:
def to_lower(token):
    """Return a new list holding the lower-cased form of every token.

    Args:
        token: iterable of string tokens.

    Returns:
        List of the same length and order as ``token``.
    """
    lowered = []
    for word in token:
        lowered.append(word.lower())
    return lowered

In [4]:
def lemmatize(token):
    """Lemmatize a tokenized tweet using each word's part of speech.

    The tokens are tagged with ``pos_tag`` and every word is passed to
    ``WordNetLemmatizer.lemmatize`` together with the WordNet part-of-speech
    code derived from the first letter of its tag.

    Args:
        token: list of string tokens.

    Returns:
        List of lemmas, one per input token, in the original order.
    """
    # First letter of the tagger's (Penn Treebank style) tag -> WordNet POS.
    # Adjective tags are JJ/JJR/JJS, so adjectives must be keyed on 'J';
    # testing for the letter 'a' never matches and silently lemmatizes
    # adjectives as nouns.
    wordnet_pos = {'N': 'n', 'V': 'v', 'J': 'a'}
    lemmatizer = WordNetLemmatizer()
    return [
        # Anything that is not a noun/verb/adjective falls back to 'n',
        # which is the lemmatizer's own default part of speech.
        lemmatizer.lemmatize(item, wordnet_pos.get(tag[:1].upper(), 'n'))
        for item, tag in pos_tag(token)
    ]
    
    

In [5]:
def remove_stop_words(token,stop_words):
    """Filter stop words out of a tokenized tweet.

    Args:
        token: iterable of string tokens.
        stop_words: container supporting ``in`` (list, set, ...) holding the
            words to discard.

    Returns:
        A new list with, in their original order, the tokens that are not
        contained in ``stop_words``.
    """
    kept = []
    for word in token:
        if word in stop_words:
            continue
        kept.append(word)
    return kept

In [6]:
def transform_features(token):
    """Turn a token list into a bag-of-words feature dictionary.

    Args:
        token: iterable of hashable tokens.

    Returns:
        Dict mapping each distinct token to the number of times it occurs,
        with keys in order of first appearance.
    """
    counts = {}
    for word in token:
        counts[word] = counts.get(word, 0) + 1
    return counts
    

In [7]:
def _preprocess_tweets(tweets, stop_words):
    """Lower-case, clean, lemmatize and stop-word-filter each tokenized tweet."""
    return [
        remove_stop_words(lemmatize(clean_data(to_lower(tweet))), stop_words)
        for tweet in tweets
    ]


def main(seed=None, train_size=7000, model_path="my_classifier.pickle"):
    """Train, evaluate and save a Naive Bayes tweet sentiment classifier.

    Args:
        seed: optional seed for the ``random`` module. When given, the
            shuffle (and therefore the train/test split and the reported
            accuracy) is reproducible. ``None`` leaves the generator state
            untouched.
        train_size: number of shuffled examples used for training; the
            remainder is used for testing.
        model_path: file the trained classifier is pickled to.

    Side effects:
        Prints sample tweets at each stage, the test accuracy and the most
        informative features, and writes the pickled classifier to
        ``model_path``.
    """
    #step 1:Gather the data
    positive_raw=twitter_samples.tokenized('positive_tweets.json')
    negative_raw=twitter_samples.tokenized('negative_tweets.json')
    print(positive_raw[0])
    print(negative_raw[0])
    #step 2:clean,lemmatize and remove stop words from data
    # A set gives constant-time membership tests; the filter runs once per
    # token of every tweet, so scanning a list each time is wasted work.
    stop_words=set(stopwords.words('english'))
    positive_clean=_preprocess_tweets(positive_raw,stop_words)
    negative_clean=_preprocess_tweets(negative_raw,stop_words)
    print(positive_clean[0])
    print(negative_clean[0])
    #step 3:Transform the data
    positive_features=[(transform_features(token),'Positive') for token in positive_clean]
    negative_features=[(transform_features(token),'Negative') for token in negative_clean]
    print(positive_features[0])
    print(negative_features[0])
    #step 4:create data set
    dataset=positive_features+negative_features
    if seed is not None:
        random.seed(seed)
    random.shuffle(dataset)
    training_data=dataset[:train_size]
    test_data=dataset[train_size:]
    #step 5:train the model
    classifier=NaiveBayesClassifier.train(training_data)
    #step 6:test accuracy
    # NOTE(review): the recorded run lists ':(' and ':)' as by far the most
    # informative features. If the corpus labels were derived from these
    # emoticons, the accuracy below is inflated by label leakage - confirm,
    # and consider stripping emoticons in the cleaning step.
    print('Accuracy:',classify.accuracy(classifier,test_data))
    # show_most_informative_features prints its own table and returns None,
    # so wrapping it in print() would emit a stray "None" line.
    classifier.show_most_informative_features(10)
    #step 7:save the model
    with open(model_path,"wb") as f:
        pickle.dump(classifier,f)

In [8]:
# Run the full train / evaluate / save pipeline when this file is executed
# directly (in a notebook kernel __name__ is "__main__" as well).
if __name__ == "__main__":
    main()

['#FollowFriday', '@France_Inte', '@PKuchly57', '@Milipol_Paris', 'for', 'being', 'top', 'engaged', 'members', 'in', 'my', 'community', 'this', 'week', ':)']
['hopeless', 'for', 'tmr', ':(']
['#followfriday', 'top', 'engage', 'member', 'community', 'week', ':)']
['hopeless', 'tmr', ':(']
({'#followfriday': 1, 'top': 1, 'engage': 1, 'member': 1, 'community': 1, 'week': 1, ':)': 1}, 'Positive')
({'hopeless': 1, 'tmr': 1, ':(': 1}, 'Negative')
Accuracy: 0.9966666666666667
Most Informative Features
                      :( = 1              Negati : Positi =   2012.3 : 1.0
                      :) = 1              Positi : Negati =   1619.3 : 1.0
                       ( = 2              Negati : Positi =     48.4 : 1.0
                follower = 1              Positi : Negati =     33.6 : 1.0
                     sad = 1              Negati : Positi =     25.1 : 1.0
                  arrive = 1              Positi : Negati =     17.8 : 1.0
                    love = 2              Positi :