In [1]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset path used below
# lives under this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import pandas as pd

In [8]:
# Location of the Sentiment140 training CSV inside the mounted Drive.
path = (
    "/content/drive/My Drive/Colab Notebooks/data/"
    "training.1600000.processed.noemoticon.csv"
)

In [10]:
# header=None: the first line of the file is data, so pandas labels the
# columns with integers (0, 1, 2, ...).
entire_sentiment140_df = pd.read_csv(
    path,
    encoding="latin",
    header=None,
)

In [20]:
import re

def remove_urls(tweet):
  """Return `tweet` with URLs deleted.

  Every occurrence of ``www.``, ``http://`` or ``https://`` is removed
  together with the run of non-whitespace characters that follows it.
  Surrounding whitespace is left untouched.
  """
  url_pattern = r'www\.\S+|https?://\S+'
  return re.sub(url_pattern, '', tweet)

def remove_html_character_entities(tweet):
  """Return `tweet` with HTML character references deleted.

  Handles all three forms of reference:
    * named      -- ``&amp;``, ``&quot;``, ``&frac12;``
    * decimal    -- ``&#39;``
    * hexadecimal -- ``&#x27;`` / ``&#X27;``

  A bare ``&`` that is not followed by a well-formed reference ending in
  ``;`` (e.g. ``AT&T``) is left alone.
  """
  entity_pattern = r'&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[a-zA-Z][a-zA-Z0-9]*);'
  return re.sub(entity_pattern, '', tweet)

def remove_at_mentions(tweet):
  """Return `tweet` with @-mentions deleted.

  A mention is an ``@`` followed by one or more non-whitespace characters;
  the whole run is removed. An ``@`` that is followed by whitespace (or
  ends the string) is kept.
  """
  mention_pattern = re.compile(r'@\S+')
  return mention_pattern.sub('', tweet)

def remove_non_alpha_or_space_characters(tweet):
  """Return `tweet` keeping only ASCII letters and whitespace.

  Every other character (digits, punctuation, symbols, non-ASCII) is
  deleted outright rather than replaced by a space, so words that were
  separated only by punctuation end up joined ("here...mom" -> "heremom").
  """
  disallowed_characters = re.compile(r'[^a-zA-Z\s]')
  return disallowed_characters.sub('', tweet)

def remove_short_words(tweet):
  """Return `tweet` with every one- or two-character word deleted.

  A "word" is a run of ``\\w`` characters delimited by word boundaries.
  Only the word itself is removed; the whitespace around it stays, so
  consecutive spaces may remain in the result.
  """
  short_word_pattern = re.compile(r'\b\w{1,2}\b')
  return short_word_pattern.sub('', tweet)

def preprocess_tweet(tweet):
  """Run the full cleaning chain on one tweet and return the result.

  The steps are applied in a fixed order: URLs, HTML character entities
  and @-mentions are stripped first (while their punctuation is still
  present to be matched), then all remaining non-letter characters, and
  finally words of one or two characters.
  """
  cleaning_steps = (
    remove_urls,
    remove_html_character_entities,
    remove_at_mentions,
    remove_non_alpha_or_space_characters,
    remove_short_words,
  )
  for apply_step in cleaning_steps:
    tweet = apply_step(tweet)
  return tweet

In [54]:
# Column 5 holds the tweet text; overwrite it with the cleaned version.
tweet_column = 5
cleaned_tweets = entire_sentiment140_df[tweet_column].map(preprocess_tweet)
entire_sentiment140_df[tweet_column] = cleaned_tweets
entire_sentiment140_df

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,Awww thats bummer You shoulda got David C...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,upset that cant update his Facebook texting...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,dived many times for the ball Managed save ...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,whole body feels itchy and like its fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,its not behaving all mad why here becaus...
...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke Having school the best feeling ever
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDBcom Very cool hear old Walt interviews
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover Ask for ...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy Birthday boo alll time Tupac Amaru S...


In [33]:
pip install --upgrade scikit-learn

Collecting scikit-learn
[?25l  Downloading https://files.pythonhosted.org/packages/5c/a1/273def87037a7fb010512bbc5901c31cfddfca8080bc63b42b26e3cc55b3/scikit_learn-0.23.2-cp36-cp36m-manylinux1_x86_64.whl (6.8MB)
[K     |████████████████████████████████| 6.8MB 2.7MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading https://files.pythonhosted.org/packages/f7/12/ec3f2e203afa394a149911729357aa48affc59c20e2c1c8297a60f33f133/threadpoolctl-2.1.0-py3-none-any.whl
Installing collected packages: threadpoolctl, scikit-learn
  Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.23.2 threadpoolctl-2.1.0


In [62]:
from sklearn.utils import shuffle

# Shuffle the rows with a fixed seed. The grid search below uses cv=5, which
# splits folds according to row order, so an unseeded shuffle would make the
# reported score differ from run to run.
shuffle_seed = 42
entire_sentiment140_df = shuffle(
  entire_sentiment140_df, random_state=shuffle_seed
).reset_index(drop=True)  # renumber rows 0..n-1 and discard the old index
entire_sentiment140_df

Unnamed: 0,0,1,2,3,4,5
0,4,1957039268,Thu May 28 23:20:55 PDT 2009,NO_QUERY,vallieclb,Sorry hear that you didnt get but congrats ...
1,0,2064649351,Sun Jun 07 06:49:52 PDT 2009,NO_QUERY,joa_19,Working again
2,4,2001778671,Tue Jun 02 02:02:33 PDT 2009,NO_QUERY,bill_archie,dont have Haagen Dazs heremom said too expen...
3,0,2241704001,Fri Jun 19 11:16:32 PDT 2009,NO_QUERY,xlossforwordsx,Everyone has iphones wtf All keep seein post...
4,4,1822828054,Sat May 16 20:28:54 PDT 2009,NO_QUERY,elysiabrooker,Well have talk more about Vegas love hea...
...,...,...,...,...,...,...
1599995,4,2014369825,Wed Jun 03 01:18:12 PDT 2009,NO_QUERY,funkymonk1,you should watching weeds much more entertai...
1599996,0,1753884456,Sun May 10 03:57:17 PDT 2009,NO_QUERY,SamanthaHopeB,crap finals have actually turned nocturnalor ...
1599997,0,2013640561,Tue Jun 02 23:10:59 PDT 2009,NO_QUERY,radiominnie,want one too
1599998,0,2052727083,Sat Jun 06 01:47:17 PDT 2009,NO_QUERY,Jodz101,going watch some supernatural there nothing ...


In [63]:
import nltk
from nltk.corpus import stopwords

# The stop-word lists are a separate download from the nltk package itself.
# Fetch them here so stopwords.words('english') does not raise LookupError on
# a fresh runtime; the call is a no-op when the corpus is already present.
nltk.download('stopwords', quiet=True)

In [64]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Pipeline: hash tokens into a fixed-size feature space, re-weight the
# hashed counts with TF-IDF, then classify with multinomial naive Bayes.
english_stop_words = stopwords.words('english')
hashing_vectorizer = HashingVectorizer(
  stop_words=english_stop_words,
  # Keep hashed feature values non-negative, as MultinomialNB requires.
  alternate_sign=False,
)
text_classifier = Pipeline(steps=[
  ('vectorizer', hashing_vectorizer),
  ('tfidf_transformer', TfidfTransformer()),
  ('classifier', MultinomialNB()),
])

In [65]:
from sklearn.model_selection import GridSearchCV

# Candidate values for the `alpha` parameter of the pipeline's 'classifier'
# step (the MultinomialNB estimator).
parameters = {'classifier__alpha': (1e-2, 1e-3)}

# 5-fold cross-validation; n_jobs=-2 runs on all CPU cores except one.
grid_search_classifier = GridSearchCV(
  estimator=text_classifier,
  param_grid=parameters,
  cv=5,
  n_jobs=-2,
)

In [66]:
import time

# The last column supplies the input text and the first column the labels.
training_samples = entire_sentiment140_df.iloc[:, -1]
training_labels = entire_sentiment140_df.iloc[:, 0]

# Time the whole grid search (every parameter candidate across every fold).
tic = time.perf_counter()
grid_search_classifier = grid_search_classifier.fit(
  training_samples,
  training_labels,
)
toc = time.perf_counter()
elapsed_seconds = toc - tic
print(f"Took {elapsed_seconds:0.4f} seconds to fit classifer on data")

Took 174.2224 seconds to fit classifer on data


In [67]:
# Mean cross-validated score achieved by the best parameter combination.
best_cv_score = grid_search_classifier.best_score_
print(f"Best score: {best_cv_score}")

Best score: 0.7454725000000001


In [68]:
import pickle

# Persist the entire fitted GridSearchCV object (not only the best estimator)
# to the current working directory.
model_filename = 'test_grid_search_NB_clf_sentiment140.pkl'
with open(model_filename, mode='wb') as model_file:
  pickle.dump(grid_search_classifier, model_file)