In [1]:
# Import libraries
import collections
import functools
import re

import nltk
import numpy as np
import pandas as pd
import sklearn
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.sparse import hstack
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

In [2]:
@functools.lru_cache(maxsize=None)
def _english_stop_words():
    """Load the NLTK English stop-word list once and reuse it for every tweet."""
    return frozenset(stopwords.words('english'))


def normalizer(tweet, skip_tokens=2):
    """Convert a raw tweet into a list of lower-cased, lemmatized content words.

    Steps: replace every non-letter character with a space, tokenize, drop the
    first ``skip_tokens`` tokens, lower-case, remove English stop words, and
    lemmatize what is left with WordNet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    skip_tokens : int, default 2
        Number of leading tokens to discard before any other filtering.
        The default reproduces the original hard-coded ``[2:]`` slice.

    Returns
    -------
    list of str
        Lemmas in their original order; empty if nothing survives filtering.

    Notes
    -----
    Relies on the NLTK tokenizer, stop-word and WordNet resources being
    available to NLTK at call time.
    """
    only_letters = re.sub("[^a-zA-Z]", " ", tweet)
    # NOTE(review): dropping two leading tokens presumably targets a leading
    # "@airline" mention, but "@" has already been replaced by a space above,
    # so the second dropped token is an ordinary word -- confirm the intent.
    tokens = nltk.word_tokenize(only_letters)[skip_tokens:]
    stop_words = _english_stop_words()
    wordnet_lemmatizer = WordNetLemmatizer()
    lower_case = (token.lower() for token in tokens)
    # Stop words are filtered on the lower-cased form, before lemmatization.
    return [
        wordnet_lemmatizer.lemmatize(token)
        for token in lower_case
        if token not in stop_words
    ]

In [3]:
def ngrams(input_list):
    """Return every bigram, then every trigram, of ``input_list`` as space-joined strings.

    Unigrams are not included. A sequence shorter than the window size simply
    contributes no n-grams of that size, so very short inputs yield ``[]``.

    Note: this definition replaces the ``ngrams`` name imported from ``nltk``
    at the top of the notebook.
    """
    def joined_windows(size):
        # Zipping the sequence against its own shifted copies yields each
        # run of `size` consecutive items exactly once, left to right.
        shifted = [input_list[offset:] for offset in range(size)]
        return [' '.join(window) for window in zip(*shifted)]

    return joined_windows(2) + joined_windows(3)

In [4]:
def count_words(input):
    """Tally how often each word occurs across an iterable of word sequences.

    Parameters
    ----------
    input : iterable of iterables
        Each inner iterable is one row of words.

    Returns
    -------
    collections.Counter
        Maps each word to its total number of occurrences over all rows.
    """
    # Flatten the rows lazily and let Counter do the tallying.
    return collections.Counter(word for row in input for word in row)

In [5]:
def sentiment2target(sentiment):
    """Map a sentiment label to its integer class id.

    'negative' -> 0, 'neutral' -> 1, 'positive' -> 2.
    Any other label raises ``KeyError``.
    """
    class_ids = dict(negative=0, neutral=1, positive=2)
    return class_ids[sentiment]

### Read in and Preprocess data

In [6]:
# Load the raw tweets; the path is resolved relative to the working directory.
data = pd.read_csv("Tweets.csv")

# Show the full content of text cells instead of truncating them.
# `None` is the supported "no limit" value; the old `-1` sentinel is
# deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)

# Token-level features: cleaned lemmas, then bigrams + trigrams built from them.
data['normalized_tweet'] = data.text.apply(normalizer)
data['grams'] = data.normalized_tweet.apply(ngrams)

# Keep the preview as the cell's last expression so the notebook renders it.
data[['text','normalized_tweet']].head()

### Data Preparation

In [7]:
# Build the feature matrix X and the target vector y.
# Features are unigram + bigram counts computed from the raw tweet text.
count_vectorizer = CountVectorizer(ngram_range=(1, 2))
vectorized_data = count_vectorizer.fit_transform(data['text'])

# Prepend each row's position as an extra first column so the row id travels
# with its features through the train/test split.
row_ids = np.arange(vectorized_data.shape[0]).reshape(-1, 1)
X = hstack((row_ids, vectorized_data))

# Integer class ids derived from the sentiment labels.
y = data['airline_sentiment'].apply(sentiment2target)

### Split the data with `train_test_split`

In [8]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=0,
)

# Column 0 carries the row id that was prepended when X was built; peel it off
# so that only the count features remain in X_train / X_test.
X_train_index, X_train = X_train[:, 0], X_train[:, 1:]
X_test_index, X_test = X_test[:, 0], X_test[:, 1:]

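OneVsRestClassifier (the one-vs-all method): trains one binary classifier per class, which allows us to get the probability distribution over all three classes.
Each of these classifiers determines the probability that the datapoint belongs to its corresponding class rather than to any of the other classes. (Note: `SVC` only exposes probability estimates when it is constructed with `probability=True`; otherwise the per-class decision scores are used.)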

### RBF kernel

In [9]:
# One-vs-rest RBF SVM with C=10, gamma=0.01; this is the model reused below as `clf_best`.
clf_best = OneVsRestClassifier(
    SVC(C=10.0, kernel='rbf', gamma=0.01, random_state=0)
).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf_best.score(X_test, y_test)

0.80225409836065575

In [10]:
# Predictions of the selected RBF model on the held-out split.
y_best = clf_best.predict(X_test)

# Macro averaging: unweighted mean of the per-class scores.
# Printed in order: precision, then recall.
for metric in (precision_score, recall_score):
    print(metric(y_test, y_best, average="macro"))

0.755260420193
0.727152918914


In [11]:
# RBF kernel, same gamma as clf_best but a 10x larger C (100).
clf1 = OneVsRestClassifier(
    SVC(C=100.0, kernel='rbf', gamma=0.01, random_state=0)
).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf1.score(X_test, y_test)

0.80157103825136611

In [12]:
# RBF kernel, gamma=0.01, C raised further to 1000.
clf2 = OneVsRestClassifier(
    SVC(C=1000.0, kernel='rbf', gamma=0.01, random_state=0)
).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf2.score(X_test, y_test)

0.79918032786885251

In [13]:
# RBF kernel with a smaller gamma (0.001) at C=100.
clf3 = OneVsRestClassifier(
    SVC(C=100.0, kernel='rbf', gamma=0.001, random_state=0)
).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf3.score(X_test, y_test)

0.79918032786885251

### Linear kernel

In [14]:
# Linear-kernel SVM, one-vs-rest, C=1000.
clf4 = OneVsRestClassifier(
    SVC(C=1000.0, kernel='linear')
).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf4.score(X_test, y_test)

0.7824453551912568

### Sigmoid kernel

In [15]:
# Sigmoid-kernel SVM, one-vs-rest, C=1000.
# NOTE(review): gamma is not set, so the installed scikit-learn default applies;
# the recorded score may depend on the library version -- confirm.
clf5 = OneVsRestClassifier(
    SVC(C=1000.0, kernel='sigmoid')
).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf5.score(X_test, y_test)

0.77527322404371579

### Polynomial kernel

In [16]:
# Degree-4 polynomial-kernel SVM, one-vs-rest, C=1000.
# NOTE(review): gamma is not set, so the installed scikit-learn default applies;
# the recorded score may depend on the library version -- confirm.
clf6 = OneVsRestClassifier(
    SVC(C=1000.0, kernel='poly', degree=4)
).fit(X_train, y_train)

# Mean accuracy on the held-out test split.
clf6.score(X_test, y_test)

0.63866120218579236

### Confusion Matrix for the model with rbf kernel

In [17]:
# `confusion_matrix` is imported in the notebook's top import cell.
# Rows are true labels and columns are predicted labels, in sorted label order
# (0 = negative, 1 = neutral, 2 = positive, per sentiment2target).
print("Confusion Matrix:\n", confusion_matrix(y_test, y_best))

Confusion Matrix:
 [[1673  151   46]
 [ 191  378   45]
 [  86   60  298]]
