In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
train = pd.read_csv('./data/train.csv', encoding='latin-1')
test = pd.read_csv('./data/test.csv', encoding='latin-1')

# 2. Creating bag of words
# Bag of words: A list (vocabulary) with all the unique words in the whole corpus of tweets. 
tweets = [
    'This is amazing!',
    'ML is the best, yes it is',
    'I am not sure about how this is going to end...'
]
count = CountVectorizer()

"""
fit_transform: Learn the vocabulary dictionary and return document-term matrix
fit, followed by transform
"""
bag = count.fit_transform(tweets)

"""
vocabulary_: 
    Words learnt by the vectorizer
Output: 
    {'this': 13, 'is': 7, 'amazing': 2,...
"""
print(count.vocabulary_)

"""
toarray(): 
    Array of array of inputs provided, with 0s and 1s based on their vocabulary positions
Output: 
    [[0 0 1 0...], [], []], where every number is the presence/absence of a word in that input.
    Example: the word "about" from vocabulary occurs only in last sentence, hence 0 in others
    Also, word "is" occurs twice in 2nd statement, henc its count 0 in the output
    
    This is also called raw term frequencies, i.e. the number of times a term t occurs in a document d.
"""
print(bag.toarray())

# 3. Identifying how relevant words are in vocabulary
"""
  - Using term frequency-inverse document frequency, while calculating raw term frequencies, so that
    it reduces the score the more frequent the word is across all the tweets.
"""
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer # (term frequency-inverse document frequency)
tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True) # passing normalization parameter
np.set_printoptions(precision=2)

print(tfidf.fit_transform(bag).toarray())


# 4. Data clean up
from collections import Counter

vocab = Counter()
# creating a vocabulary of all words in the training data
for tweet in train["SentimentText"]:
    for word in tweet.split(' '):
        vocab[word] += 1
        
print(vocab.most_common(20))

from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

import math
def plot_distribution(vocabulary):
    
    hist, edges = np.histogram(list(map(lambda x:math.log(x[1]),vocabulary.most_common())), density=True, bins=500)

    p = figure(tools="pan,wheel_zoom,reset,save",
               toolbar_location="above",
               title="Word distribution accross all twits")
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555", )
    show(p)
    
plot_distribution(vocab)


import nltk
# nltk.download("stopwords")



from nltk.corpus import stopwords
stop = stopwords.words('english')

vocab_reduced = Counter()
for w, c in vocab.items():
    if not w in stop:
        vocab_reduced[w]=c

print(vocab_reduced.most_common(20))


# 5. Preprocessing data
import re
def preprocessor(text):
    """ Return a cleaned version of text
    """
    # Remove HTML markup
    text = re.sub('<[^>]*>', '', text)
    # Save emoticons for later appending
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Remove any non-word character and append the emoticons,
    # removing the nose character for standarization. Convert to lower case
    text = (re.sub('[\W]+', ' ', text.lower()) + ' ' + ' '.join(emoticons).replace('-', ''))
    
    return text

print(preprocessor('This!! tweet man :) is <b>nice</b>'))

"""
 -  Also performing stemming, the process of reducing a word to its root is.
    Example: love, loving etc all have same root love.
"""

from nltk.stem import PorterStemmer
porter = PorterStemmer()

# normal tokenizer
def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

print(tokenizer('Hi there, I am loving this, like with a lot of love'))
print(tokenizer_porter('Hi there, I am loving this, like with a lot of love'))


# 6. Starting with train test split of data
from sklearn.model_selection import train_test_split
X = train['SentimentText']
y = train['Sentiment']
"""
    stratify will create a train set with the same class balance than the original set
"""
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)


# 7. Summing it all up

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# writing param grid with properties like stop_words, preprocessor etc
param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__preprocessor': [None, preprocessor],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__preprocessor': [None, preprocessor],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

# using pipeline to create a sequential flow of items
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0, penalty='l1', solver='liblinear'))])


# using GridCV for training
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1)

# training the model
gs_lr_tfidf.fit(X_train, y_train)


{'this': 13, 'is': 7, 'amazing': 2, 'ml': 9, 'the': 12, 'best': 3, 'yes': 15, 'it': 8, 'am': 1, 'not': 10, 'sure': 11, 'about': 0, 'how': 6, 'going': 5, 'to': 14, 'end': 4}
[[0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0]
 [0 0 0 1 0 0 0 2 1 1 0 0 1 0 0 1]
 [1 1 0 0 1 1 1 1 0 0 1 1 0 1 1 0]]
[[0.   0.   0.72 0.   0.   0.   0.   0.43 0.   0.   0.   0.   0.   0.55
  0.   0.  ]
 [0.   0.   0.   0.4  0.   0.   0.   0.47 0.4  0.4  0.   0.   0.4  0.
  0.   0.4 ]
 [0.33 0.33 0.   0.   0.33 0.33 0.33 0.2  0.   0.   0.33 0.33 0.   0.25
  0.33 0.  ]]
[('', 123920), ('I', 32880), ('to', 28810), ('the', 28087), ('a', 21321), ('you', 21180), ('i', 15996), ('and', 14565), ('it', 12818), ('my', 12385), ('for', 12149), ('in', 11199), ('is', 11185), ('of', 10326), ('that', 9181), ('on', 9020), ('have', 8991), ('me', 8255), ('so', 7612), ('but', 7220)]


[('', 123920), ('I', 32880), ("I'm", 6416), ('like', 5086), ('-', 4922), ('get', 4864), ('u', 4194), ('good', 3953), ('love', 3494), ('know', 3472), ('go', 2990), ('see', 2868), ('one', 2787), ('got', 2774), ('think', 2613), ('&amp;', 2556), ('lol', 2419), ('going', 2396), ('really', 2287), ('im', 2200)]
this tweet man is nice :)
['Hi', 'there,', 'I', 'am', 'loving', 'this,', 'like', 'with', 'a', 'lot', 'of', 'love']
['hi', 'there,', 'i', 'am', 'love', 'this,', 'like', 'with', 'a', 'lot', 'of', 'love']
Fitting 5 folds for each of 96 candidates, totalling 480 fits


In [6]:
# getting best model
print('Best parameter set: ' + str(gs_lr_tfidf.best_params_))
print('Best accuracy: %.3f' % gs_lr_tfidf.best_score_)

Best parameter set: {'clf__C': 1.0, 'clf__penalty': 'l2', 'vect__ngram_range': (1, 1), 'vect__preprocessor': <function preprocessor at 0x00000184950FF3A0>, 'vect__stop_words': None, 'vect__tokenizer': <function tokenizer at 0x000001849A845550>}
Best accuracy: 0.772


In [7]:
clf = gs_lr_tfidf.best_estimator_
print('Accuracy in test: %.3f' % clf.score(X_test, y_test))

Accuracy in test: 0.770


In [9]:
tweets = [
    "This is really bad, I don't like it at all",
    "I love this!",
    ":)",
    "I'm sad... :("
]

# using model to predict
clf.predict(tweets)

array([0, 1, 1, 0], dtype=int64)

In [10]:
import pickle
import os

# saving model
pickle.dump(clf, open(os.path.join('data', 'logisticRegression.pkl'), 'wb'), protocol=4)