In [1]:
import os
import time
import pandas as pd
import numpy as np
import csv
import string
import matplotlib.pyplot as plt
import seaborn as sns
import random
import itertools
import collections
from collections import Counter

import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import * 

import warnings
warnings.filterwarnings("ignore")

from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

In [9]:
# Load the feature-engineered training data.
training = pd.read_csv('feat_eng_train_data.csv')

# Drop every row that contains at least one missing value.
# axis/how are passed by keyword: DataFrame.dropna is keyword-only since
# pandas 2.0, so the positional form dropna(0, 'any') raises TypeError there.
training = training.dropna(axis=0, how='any')

# Features TODO: correct feature names
features = ['tokens', 'neu_scores', 'neg_scores', 'compound_scores', 'pos_scores']
label = ['score']

# Saving features and label data in X and y for train-test split.
# The comprehension keeps the columns in the order they appear in the frame and
# silently skips any name in `features` that the frame does not contain.
X = training[[col for col in training.columns if col in features]]
# `label` is a list, so y is a one-column DataFrame rather than a Series.
y = training[label]

# Splitting data into training and validation set: 30% of the rows are held out,
# and random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=0)

training.head(3)

Unnamed: 0,score,tokens,neg_scores,neu_scores,pos_scores,compound_scores,NOUN,PRON,VERB,ADJ,ADV
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,thought sleep option tomorrow realiz evalu mor...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,0.0,life cool,0.0,0.303,0.697,0.3182,1.0,0.0,1.0,0.0,0.0


In [10]:
# Helper functions from gracecarrillo

class TextSelector(BaseEstimator, TransformerMixin):
    """
    Stateless pipeline step that extracts one column from the input by key.

    `transform` performs a single-bracket lookup, ``X[key]``; when X is a
    pandas DataFrame and `key` is a column label, the result is that column
    as a Series. Intended for the text column fed to a text vectorizer.

    Parameters
    ----------
    key : label of the column to select.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; fit exists to satisfy the
        # estimator interface and returns the transformer unchanged.
        return self

    def transform(self, X):
        # Single-bracket indexing: hand back the column itself.
        selected_column = X[self.key]
        return selected_column
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Stateless pipeline step that extracts one column from the input by key,
    keeping it two-dimensional.

    `transform` performs a list lookup, ``X[[key]]``; when X is a pandas
    DataFrame the result is a one-column DataFrame. Intended for numeric
    columns that are passed on to a scaler.

    Parameters
    ----------
    key : label of the column to select.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        # Nothing is learned from the data; fit exists to satisfy the
        # estimator interface and returns the transformer unchanged.
        return self

    def transform(self, X):
        # Wrap the key in a list so the selection stays a 2-D frame.
        wanted_columns = [self.key]
        return X[wanted_columns]
    
# Text feature pipelines: each selects the 'tokens' column and vectorizes it.

# TF-IDF weighted term matrix.
tfidf = Pipeline(steps=[
    ('selector', TextSelector(key='tokens')),
    ('tfidf', TfidfVectorizer()),
])

# Raw token-count matrix.
countvect = Pipeline(steps=[
    ('selector', TextSelector(key='tokens')),
    ('countvect', CountVectorizer()),
])


def _minmax_pipeline(key):
    """Return a pipeline that selects numeric column `key` and rescales it with MinMaxScaler."""
    return Pipeline(steps=[
        ('selector', NumberSelector(key=key)),
        ('minmax', MinMaxScaler()),
    ])


# Numeric feature pipelines: one per sentiment-score column, each a column
# selector followed by min-max scaling.
neu_scores = _minmax_pipeline('neu_scores')
neg_scores = _minmax_pipeline('neg_scores')
pos_scores = _minmax_pipeline('pos_scores')
compound_scores = _minmax_pipeline('compound_scores')

In [11]:
# Load the cleaned test data.
testing = pd.read_csv('cleaned_test_data.csv')

# Drop every row that contains at least one missing value.
# axis/how are passed by keyword: DataFrame.dropna is keyword-only since
# pandas 2.0, so the positional form dropna(0, 'any') raises TypeError there.
testing = testing.dropna(axis=0, how='any')

testing.head(3)

Unnamed: 0,tokens
0,republican district longer hour democrat distr...
1,rememb ratherg 60 minut got dan rather fire to...
2,oh republican presid republic…


In [12]:
# Restore the persisted naive-bayes model from disk, then score it on the
# held-out split (X_test / y_test).
# NOTE(review): presumably the pickle references TextSelector / NumberSelector,
# so those classes must be defined before this cell runs — confirm.
nb_model = joblib.load(filename='twitter_sentiment_naivebayes.pkl')
result_nb = nb_model.score(X=X_test, y=y_test)

In [13]:
# load the SVM model and score it on the held-out split
# NOTE: joblib.load unpickles arbitrary Python objects; only load model files
# that come from a trusted source.
svm_model = joblib.load('twitter_sentiment_svm.pkl')
result_svm = svm_model.score(X_test, y_test)

In [15]:
# load testing data
# NOTE(review): this repeats the earlier load of 'cleaned_test_data.csv'; the
# displayed columns differ between the two outputs, so the file was likely
# regenerated between runs — consider keeping a single load cell.
testing = pd.read_csv('cleaned_test_data.csv')

# Drop every row that contains at least one missing value.
# axis/how are passed by keyword: DataFrame.dropna is keyword-only since
# pandas 2.0, so the positional form dropna(0, 'any') raises TypeError there.
testing = testing.dropna(axis=0, how='any')

testing.head(3)

Unnamed: 0,tokens,neg_scores,neu_scores,pos_scores,compound_scores
0,republican district longer hour democrat distr...,0.0,1.0,0.0,0.0
1,rememb ratherg 60 minut got dan rather fire to...,0.167,0.833,0.0,-0.34
2,oh republican presid republic…,0.0,1.0,0.0,0.0


In [16]:
# Predictions on the test data

# Naive-bayes model: the whole `testing` frame is passed to predict.
y_nb_predictions = nb_model.predict(X=testing)

In [18]:
# SVM model: the whole `testing` frame is passed to predict.
y_svm_predictions = svm_model.predict(X=testing)