In [47]:
import nltk
from sklearn.svm import SVC 
import numpy as np

In [2]:
import pandas as pd
train = pd.read_csv('training.csv')
test = pd.read_csv('test.csv')

In [35]:
train['text']
test['text']

0       @AmericanAir In car gng to DFW. Pulled over 1h...
1       @AmericanAir after all, the plane didn’t land ...
2       @SouthwestAir can't believe how many paying cu...
3       @USAirways I can legitimately say that I would...
4       @AmericanAir still no response from AA. great ...
5       @united we have developers flying down tmrw mo...
6                       @USAirways hello??? Anyone there?
7       @USAirways @husainhaqqani Mr. Husain u shld pr...
8       @USAirways not likely, flightaware says plane ...
9       @AmericanAir they don't even give an option to...
10      @united your announcement for pre boarding onl...
11      @USAirways it is really embarrassing when aski...
12      @SouthwestAir I will not have my passport in t...
13      @AmericanAir this delayed bag was for my frien...
14      @SouthwestAir Didn't see travel had to be comp...
15      @USAirways awesome... Doors close in 2 minutes...
16      @united I flew United last month and the exper...
17      @JetBl

In [3]:
documents = []
# for category in movie_reviews.categories():
#     for fileid in movie_reviews.fileids(category):
#         documents.append((movie_reviews.words(fileid), category))
        
for index, data in train.iterrows():
    documents.append((nltk.word_tokenize(data['text']), data['airline_sentiment']))

In [4]:
documents_test = []
for index, data in test.iterrows():
    documents_test.append(nltk.word_tokenize(data['text']))

In [5]:
import random
random.shuffle(documents)
documents[0:5]

[(['@',
   'AmericanAir',
   'thanks',
   'for',
   'the',
   'info',
   'Is',
   'there',
   'a',
   'number',
   'I',
   'can',
   'call',
   'to',
   'speak',
   'to',
   'a',
   'person',
   '?',
   'It',
   "'s",
   'going',
   'to',
   'take',
   'an',
   'hour',
   'to',
   'type',
   'it',
   'out'],
  'positive'),
 (['@',
   'USAirways',
   'Stop',
   'reposting',
   'same',
   'autoresponse',
   'That',
   'was',
   'return',
   'flight',
   'home',
   '#',
   'imateacher',
   '.',
   'Couldnt',
   'get',
   'to',
   'RSW',
   '#',
   'neptune',
   '#',
   'waivethefee',
   '#',
   'notmyfault'],
  'negative'),
 (['@',
   'AmericanAir',
   'Would',
   'love',
   'to',
   'DM',
   'you',
   ',',
   'but',
   'my',
   'Twitter',
   'app',
   'says',
   'you',
   "'re",
   'not',
   'following',
   'me',
   'and',
   'I',
   'ca',
   "n't",
   '.'],
  'negative'),
 (['@',
   'AmericanAir',
   'Oh',
   'they',
   'seem',
   'to',
   'have',
   'reappeared',
   'now',
   'lol'],
 

In [6]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [7]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [11]:
from nltk import pos_tag
from nltk.corpus import stopwords
import string
stops = set(stopwords.words('english'))
punctuations = list(string.punctuation)
stops.update(punctuations)

In [9]:
def clean_review(words):
    output_words = []
    for w in words:
        if w.lower() not in stops:
            pos = pos_tag([w])
            clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
            output_words.append(clean_word.lower())
    return output_words

In [12]:
documents = [(clean_review(document), category) for document, category in documents]

In [13]:
documents_test = [clean_review(document) for document in documents_test]

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
categories = [category for document, category in documents]
text_documents = [" ".join(document) for document, category in documents]
text_documents_test = [" ".join(document) for document in documents_test]

In [21]:
# from sklearn.model_selection import train_test_split
# x_train, x_test, y_train, y_test = train_test_split(text_documents, categories)

In [78]:
count_vec = CountVectorizer(max_features = 3000, ngram_range=(1,3))
x_train_features = count_vec.fit_transform(text_documents)
x_train_features.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ..., 
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [80]:
x_train_features

<10980x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 115681 stored elements in Compressed Sparse Row format>

In [81]:
text_documents_test_features = count_vec.transform(text_documents_test)

In [31]:
# x_test_features = count_vec.transform(x_test)

In [54]:
text_documents_test_features

<3660x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 37307 stored elements in Compressed Sparse Row format>

In [82]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [None]:
svc = SVC()
parameters = {'C':[1, 2, 3], 'gamma':[0.5, 0.1]}
# clf = GridSearchCV(svc, parameters)
clf = SVC(C=2, gamma=0.1)
clf.fit(x_train_features, categories)

In [75]:
clf.best_params_

{'C': 2, 'gamma': 0.1}

In [71]:
text_documents_test_features

<3660x3000 sparse matrix of type '<class 'numpy.int64'>'
	with 37307 stored elements in Compressed Sparse Row format>

In [76]:
Y_test = clf.predict(text_documents_test_features)

In [67]:
Y_test

array(['negative', 'neutral', 'negative', ..., 'neutral', 'positive',
       'negative'], 
      dtype='<U8')

In [77]:
np.savetxt('pred.txt', Y_test, delimiter=',', fmt="%s")