In [1]:
import time
import pandas as pd
import numpy as np
import os

import preprocessing as pre

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.externals import joblib
from sklearn import metrics

if __name__ == '__main__':
        # Access the csv file
    df = pd.read_csv('.\\dataset\\labelled_tweets.csv', skipinitialspace=True, usecols=['text','polarity'])
    # dropna() returns a new frame instead of modifying `df` in place, so the
    # result must be bound back for rows with missing values to be removed.
    df = df.dropna(how='any')

    # print(type(df))
    text = df['text']
    label = df['polarity']

    # Split the tweets in training and test set
    def split_data(data):
        """Shuffle ``data``, split it 70/30 and save both parts as CSV files.

        Parameters
        ----------
        data : pandas.DataFrame
            Frame to split; its index labels are permuted to shuffle the rows.

        The training part and the test part are written to ``train_data.csv``
        and ``test_data.csv`` in the ``dataset`` directory. Nothing is returned.
        """
        # reindex() returns a new frame; keep the result so the shuffle
        # actually takes effect before the split.
        shuffled = data.reindex(np.random.permutation(data.index))

        # 30% of the rows go to the test set, the rest to the training set.
        train, test = train_test_split(shuffled, test_size=0.3)

        train.to_csv(".\\dataset\\train_data.csv")
        test.to_csv(".\\dataset\\test_data.csv")

#     split_data(df)

    # Load the saved training split and pull out its text and label columns.
    train = pd.read_csv(
        ".\\dataset\\train_data.csv",
        skipinitialspace=True,
        encoding='latin1',
    )
    train_text, train_label = train['text'], train['polarity']

    # Same for the held-out test split.
    test = pd.read_csv(
        ".\\dataset\\test_data.csv",
        skipinitialspace=True,
        encoding='latin1',
    )
    test_text, test_label = test['text'], test['polarity']

    # Remove handles and hashtags
    # TODO: Review removal of hyperlinks
    def normalized_data(dataset):
        """Return the processed text of every string entry in ``dataset.text``.

        Each ``str`` item is passed through ``pre.process_text``. Items that
        are not ``str`` instances are skipped, so the returned list can be
        shorter than ``dataset.text``.
        """
        return [
            pre.process_text(entry)
            for entry in dataset.text
            if isinstance(entry, str)
        ]

    def polarity(dataset):
        """Return the processed label of every string entry in ``dataset.polarity``.

        Each ``str`` item is passed through ``pre.process_text``. Items that
        are not ``str`` instances are skipped, so the returned list can be
        shorter than ``dataset.polarity``.
        """
        return [
            pre.process_text(raw_label)
            for raw_label in dataset.polarity
            if isinstance(raw_label, str)
        ]
    
    def classifier_report(classifier,time_train,time_predict,predicted):
        """Print timings, a per-class report and the accuracy for ``predicted``.

        Parameters
        ----------
        classifier : str
            Name shown in the report header.
        time_train, time_predict : float
            Durations in seconds, printed as given.
        predicted : sequence
            Predicted labels; they are scored against the processed labels of
            the ``test`` frame from the enclosing scope.
        """
        print("Results for " + classifier + "\n")
        print("Training time: %fs; Prediction time: %fs" % (time_train, time_predict))
        # Process the reference labels once and reuse them for both metrics
        # instead of re-running the label preprocessing for each call.
        expected = polarity(test)
        print(classification_report(expected, predicted))
        score = metrics.accuracy_score(expected, predicted)
        print("Accuracy:   %0.3f" % score)

    def nb_classifier():
        """Fit and evaluate a naive Bayes text-classification pipeline.

        The pipeline chains ``CountVectorizer``, ``TfidfTransformer`` and
        ``BernoulliNB``. It is fitted on the processed training text and
        labels, used to predict the processed test text, and the timings and
        metrics are printed through ``classifier_report``.

        Returns
        -------
        sklearn.pipeline.Pipeline
            The fitted pipeline, mirroring what ``svm_classifer`` returns.
        """
        text_clf = Pipeline([
            ('vect', CountVectorizer()),
            ('tfidf', TfidfTransformer()),
            ('clf', BernoulliNB()),
        ])
        t0 = time.time()
        text_clf = text_clf.fit(normalized_data(train), polarity(train))
        t1 = time.time()
        prediction_nb = text_clf.predict(normalized_data(test))
        t2 = time.time()
        time_nb_train = t1 - t0
        time_nb_predict = t2 - t1
        # Take the header from the estimator actually in the pipeline; the
        # previous hard-coded "MultinomialNB" did not match the BernoulliNB step.
        clf_name = type(text_clf.named_steps['clf']).__name__
        classifier_report(clf_name, time_nb_train, time_nb_predict, prediction_nb)
        return text_clf

    def svm_classifer(model_path='text_clf_svm.pkl'):
        """Load or train the hinge-loss ``SGDClassifier`` pipeline and evaluate it.

        If ``model_path`` exists the fitted pipeline is loaded from it;
        otherwise a ``CountVectorizer`` -> ``TfidfTransformer`` ->
        ``SGDClassifier`` pipeline is fitted on the processed training data and
        dumped to ``model_path``. Either way the pipeline predicts the
        processed test text and the results are printed through
        ``classifier_report``.

        Parameters
        ----------
        model_path : str, optional
            Location of the cached model file. Defaults to the file name that
            was previously hard-coded, so existing calls behave the same.

        Returns
        -------
        sklearn.pipeline.Pipeline
            The loaded or freshly fitted pipeline.
        """
        t0 = time.time()
        # Check if a pickled file exists
        if os.path.exists(model_path):
            print("Pickle file exists. Loading pickle ... \n")
            # NOTE(review): joblib.load unpickles the file, which can execute
            # arbitrary code -- only point model_path at files this notebook
            # wrote. The cache is also reused as-is even if the training data
            # changed; delete the file to force a retrain.
            text_clf_svm = joblib.load(model_path)
        else:
            # Build the pipeline only when it is actually going to be trained.
            text_clf_svm = Pipeline([
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),
            ])
            text_clf_svm = text_clf_svm.fit(normalized_data(train), polarity(train))
            # Create a pickle
            joblib.dump(text_clf_svm, model_path)
        t1 = time.time()
        prediction_svm = text_clf_svm.predict(normalized_data(test))
        t2 = time.time()
        # On a cache hit this interval is the load time, not a training time,
        # although it is still reported under the "Training time" label.
        time_svm_train = t1 - t0
        time_svm_predict = t2 - t1
        classifier_report("SGDClassifier", time_svm_train, time_svm_predict, prediction_svm)
        return text_clf_svm
    
#     print("Training with Naive Bayes ...")
#     nb_classifier()
# TODO: Review the logging functionality
    # Run the SVM pipeline: svm_classifer() loads the pickled model if one
    # exists (otherwise it trains and pickles one) and prints the evaluation
    # report. Its return value is not used here.
    print("Training with Support Vector Machine ...\n")
    svm_classifer()

Training with Support Vector Machine ...

Pickle file exists. Loading pickle ... 

Results for SGDClassifier

Training time: 0.067883s; Prediction time: 0.084023s
             precision    recall  f1-score   support

   negative       0.79      0.82      0.80       218
    neutral       0.81      0.86      0.83       289
   positive       1.00      0.26      0.41        35

avg / total       0.81      0.80      0.79       542

Accuracy:   0.803


In [6]:
# We don't execute everything unless we're viewing in a notebook
IN_JUPYTER = 'get_ipython' in globals() and get_ipython().__class__.__name__ == "ZMQInteractiveShell"
if IN_JUPYTER:
    # Two fixes over the bare expression that used to be here:
    #  * the sample goes through pre.process_text, the same cleaning applied to
    #    the text the pipeline was fitted on;
    #  * the prediction is printed explicitly, because an expression nested in
    #    an `if` body is not auto-displayed as the cell's output.
    # Note that svm_classifer() also re-runs and re-prints the evaluation report.
    print(svm_classifer().predict([pre.process_text("I got it. Thanks")]))

Pickle file exists. Loading pickle ... 

Results for SGDClassifier

Training time: 0.061009s; Prediction time: 0.017000s
             precision    recall  f1-score   support

   negative       0.79      0.82      0.80       218
    neutral       0.81      0.86      0.83       289
   positive       1.00      0.26      0.41        35

avg / total       0.81      0.80      0.79       542

Accuracy:   0.803


In [7]:
# parameters = {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False),'clf__alpha': (1e-2, 1e-3),}

# gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
# gs_clf = gs_clf.fit(normalized_data(train), polarity(train))
# gs_clf.best_score_
# gs_clf.best_params_

True