In [73]:
import os
import nltk
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

pjoin = os.path.join

In [78]:
# Read pre-processed data from pickle file
pkl_dir = './data/tweet/sentiment_analysis/pkl'
processed_df = pd.read_pickle(pjoin(pkl_dir, 'processed_tweets_sentiments.pkl'))

# Swap tweet and sentiment columns
processed_df = processed_df.reindex(columns=['Tweet', 'Sentiment'])

print(processed_df['Sentiment'].unique())
processed_df.head()

[0 4]


Unnamed: 0,Tweet,Sentiment
0,"- awww, that's a bummer. you shoulda got da...",0
1,is upset that he can't update his facebook by ...,0
2,i dived many times for the ball. managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it's not behaving at all. i'm mad. why am...",0


In [62]:
# Divide 1.6M tweets into training (~80%) and testing (~20%) sets!
mask = np.random.rand(len(processed_df)) < 0.8
training_df = processed_df[mask]
test_df = processed_df[~mask]

num_train, num_test = len(training_df), len(test_df)
percent_train, percent_test = num_train/len(mask), num_test/len(mask)

print(f'Number of training samples: {len(training_df)}, ({percent_train*100:.1f} %)')
print(f'Number of testing samples: {len(test_df)} ({percent_test*100:.1f} %)')

Number of training samples: 1279922, (80.0 %)
Number of testing samples: 320078 (20.0 %)


In [9]:
def build_vocabulary(training_df):
    '''
    Given the pre-processed training data in a pandas dataframe, 
    construct the whole vocabulary set resident in the training data.
    '''
    all_words = []
    
    for (index, data) in tqdm(training_df.iterrows()):
        # data['Tweet'] contains the list of words in that tweet
        all_words.extend(data['Tweet'])
        
        
    wordlist = nltk.FreqDist(all_words)
    word_features = wordlist.keys()
    
    return word_features

# Get list of words (the whole vocabulary in the set)
word_features = build_vocabulary(training_df)

400525it [00:36, 11024.04it/s]


In [15]:
def extract_features(tweet):
    '''
    For every word in word_features, compare with the words in tweet and create a label:
    
    Label 1 (true): Word in vocabulary is resident in tweet
    Label 0 (false): Word in vocabulary is not resident in tweet
    '''
    tweet_words = set(tweet)
    features = {}
    for word in word_features:
        features[f'Contains {word}'] = word in tweet_words
    
    return features

# Transform training dataframe to a list of lists
# for compatibility with NLTK built-in function
training_list = training_df.iloc[:100].values.tolist()

training_features = nltk.classify.apply_features(extract_features, training_list)

In [17]:
# Train the Naive Bayes Classifier
import time
start_time = time.time()
print(f'Start: {start_time}')
bayes_classifier = nltk.NaiveBayesClassifier.train(training_features)
end_time = time.time()
print(f'End: {end_time}')
diff = end_time - start_time
print(f'Time taken: {diff}')


Start: 1585894743.1434581
End: 1585894767.7725508
Time taken: 24.629092693328857


In [63]:
X_train_tweets = training_df['Tweet'].values.tolist()
X_train_sentiments = training_df['Sentiment'].values.tolist()
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train_tweets)
X_train_counts.shape

(1279922, 237696)

In [64]:
# Divide the occurences with total number of words in each tweet (normalize)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1279922, 237696)

In [74]:
# Train the classifier with normalized frequency distributions
clf = MultinomialNB()
clf.fit(X_train_tfidf, X_train_sentiments)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [85]:
# Test the model
X_test_tweets = test_df['Tweet'].values.tolist()
X_test_sentiments = test_df['Sentiment'].values.tolist()
X_test_counts = count_vect.transform(X_test_tweets)
print(X_test_counts.shape)

clf.score(X_test_counts, X_test_sentiments)

(320078, 237696)


0.7823155605821082

In [76]:
clf.classes_

array([0, 4])

In [None]:
clf.score(X_)