In [2]:
import os
import nltk
import pandas as pd
import numpy as np
import pickle
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

pjoin = os.path.join

In [3]:
# Read pre-processed data from pickle file
pkl_dir = './data/tweet/sentiment_analysis/pkl'
processed_df = pd.read_pickle(pjoin(pkl_dir, 'processed_tweets_sentiments.pkl'))

# Swap tweet and sentiment columns
processed_df = processed_df.reindex(columns=['Tweet', 'Sentiment'])

print(processed_df['Sentiment'].unique())
processed_df.head()

[0 4]


Unnamed: 0,Tweet,Sentiment
0,"- awww, that's a bummer. you shoulda got da...",0
1,is upset that he can't update his facebook by ...,0
2,i dived many times for the ball. managed to s...,0
3,my whole body feels itchy and like its on fire,0
4,"no, it's not behaving at all. i'm mad. why am...",0


In [4]:
# Divide 1.6M tweets into training (~80%) and testing (~20%) sets!
mask = np.random.rand(len(processed_df)) < 0.8
training_df = processed_df[mask]
test_df = processed_df[~mask]

num_train, num_test = len(training_df), len(test_df)
percent_train, percent_test = num_train/len(mask), num_test/len(mask)

print(f'Number of training samples: {len(training_df)}, ({percent_train*100:.1f} %)')
print(f'Number of testing samples: {len(test_df)} ({percent_test*100:.1f} %)')

Number of training samples: 1278818, (79.9 %)
Number of testing samples: 321182 (20.1 %)


## Training & Testing the Classifier

Here, we train the sentiment classifier on 80% of the data, and test on the remaining 20%.

In [5]:
X_train_tweets = training_df['Tweet'].values.tolist()
X_train_sentiments = training_df['Sentiment'].values.tolist()
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train_tweets)
X_train_counts.shape

(1278818, 237222)

In [6]:
# Divide the occurences with total number of words in each tweet (normalize)
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(1278818, 237222)

In [7]:
# Train the classifier with normalized frequency distributions
clf = MultinomialNB()
clf.fit(X_train_tfidf, X_train_sentiments)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [8]:
# Test the model
X_test_tweets = test_df['Tweet'].values.tolist()
X_test_sentiments = test_df['Sentiment'].values.tolist()
X_test_counts = count_vect.transform(X_test_tweets)
print(X_test_counts.shape)

clf.score(X_test_counts, X_test_sentiments)

(321182, 237222)


0.7822916601802093

In [9]:
# Pickle the model for later use
pkl_dir = './data/tweet/sentiment_analysis/pkl'
pkl_file = pjoin(pkl_dir, 'classifier.pkl')

with open(pkl_file, 'wb+') as f:
    pickle.dump(clf, f)    

print(f'Model saved to file: {pkl_file}')

# Pickle the vectorizer as well for later use
pkl_file_vect = pjoin(pkl_dir, 'vectorizer.pkl')

with open(pkl_file_vect, 'wb+') as f:
    pickle.dump(count_vect, f)

print(f'Vectorizer saved to file: {pkl_file_vect}')

Model saved to file: ./data/tweet/sentiment_analysis/pkl/classifier.pkl
Vectorizer saved to file: ./data/tweet/sentiment_analysis/pkl/vectorizer.pkl
