In [1]:
import time
import pandas as pd
import numpy as np

import preprocessing as pre

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn import metrics

if __name__ == '__main__':
        # Access the csv file
    df = pd.read_csv('.\\dataset\\labelled_tweets.csv', skipinitialspace=True, usecols=['text','polarity'])
    df.dropna(how='any')

    # print(type(df))
    text = df['text']
    label = df['polarity']

    # Split the tweets in training and test set
    def split_data(data):
        # shuffle data
        data.reindex(np.random.permutation(data.index))

        # train, test data
        train, test =train_test_split(data, test_size = 0.3)

        train.to_csv(".\\dataset\\train_data.csv")
        test.to_csv(".\\dataset\\test_data.csv")

    split_data(df)

    train = pd.read_csv(".\\dataset\\train_data.csv", skipinitialspace=True, encoding='latin1')
    train_text = train['text']
    train_label = train['polarity']

    test = pd.read_csv(".\\dataset\\test_data.csv", skipinitialspace=True, encoding='latin1')
    test_text = test['text']
    test_label = test['polarity']

    # Remove handles and hashtags
    # TODO: Review removal of hyperlinks
    def normalized_data(dataset):
        cleaned_text = []
        for index, item in enumerate(dataset.text):
            if isinstance(item, str):
                cleaned_text.append(pre.process_text(item))
        return cleaned_text

    def polarity(var):
        polarity = []
        for index, item in enumerate(var.polarity):
            if isinstance(item, str):
                polarity.append(pre.process_text(item))
        return polarity


    text_clf=Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',MultinomialNB())])
    text_clf=text_clf.fit(normalized_data(train), polarity(train))

    text_clf_svm = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=5, random_state=42)),])
    t0 = time.time()
    text_clf_svm=text_clf_svm.fit(normalized_data(train), polarity(train))
    t1 = time.time()
    predicted = text_clf_svm.predict(normalized_data(test))
    t2 = time.time()
    time_svm_train = t1-t0
    time_svm_predict = t2-t1
    np.mean(predicted == polarity(test))
    
    print("Results for SVC(kernel=rbf)")
    print("Training time: %fs; Prediction time: %fs" % (time_svm_train, time_svm_predict))
    print(classification_report(polarity(test), predicted))
    score = metrics.accuracy_score(polarity(test), predicted)
    print("accuracy:   %0.3f" % score)
    # parameters = {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False),'clf__alpha': (1e-2, 1e-3),}

    # gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
    # gs_clf = gs_clf.fit(normalized_data(train), polarity(train))

Results for SVC(kernel=rbf)
Training time: 0.126516s; Prediction time: 0.014996s
             precision    recall  f1-score   support

   negative       0.75      0.77      0.76       218
    neutral       0.80      0.84      0.82       294
   positive       1.00      0.35      0.52        31

avg / total       0.79      0.78      0.78       543

accuracy:   0.781


### Model Persistence

In [15]:
from sklearn.externals import joblib
joblib.dump(text_clf_svm, 'text_clf_svm.pkl')

['text_clf_svm.pkl']

In [19]:
classifier = joblib.load('text_clf_svm.pkl')
classifier.predict(['thanks. i got it'])

array(['positive'],
      dtype='<U8')

In [3]:
gs_clf.best_score_
gs_clf.best_params_

NameError: name 'gs_clf' is not defined

In [14]:
text_clf_svm.predict(['thanks. i got it'])

array(['positive'],
      dtype='<U8')

In [None]:
items = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for index, item in enumerate(items):
    if not (item % 2):
        items[index] = None
print(items)

In [None]:
type(train)

# Code goes here