In [63]:
import pandas as pd
import numpy as np
import tensorflow as tf
from keras.layers import SimpleRNN, Dense, Dropout
from keras.utils import np_utils
from keras.models import Sequential 
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import NaiveBayesClassifier
from sklearn.ensemble import RandomForestClassifier

In [9]:
train_data = pd.read_csv("C:\\Users\\Archit\\Desktop\\MachineLearning\\ML_projects\\twitter\\0000000000002747_training_twitter_x_y_train.csv")
test_data = pd.read_csv("C:\\Users\\Archit\\Desktop\\MachineLearning\\ML_projects\\twitter\\0000000000002747_test_twitter_x_test.csv")

In [10]:
train_data.shape, test_data.shape

((10980, 12), (3660, 11))

In [11]:
x_train = train_data['text']
y_train = train_data['airline_sentiment']
x_test = test_data['text']
x_train.shape, y_train.shape, x_test.shape

((10980,), (10980,), (3660,))

In [13]:
stop = stopwords.words('english')
punct = list(string.punctuation)
stop_words = stop+punct
stop_words[:5], len(stop_words)

(['i', 'me', 'my', 'myself', 'we'], 211)

In [14]:
# creating document by appending input and output data.
# here we've 3 types of classes i.e negative, positive and neutral.
documents = []
for i in range(len(x_train)):
    documents.append((x_train[i], y_train[i]))
documents[:2]

[('@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled',
  'negative'),
 ('@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!',
  'positive')]

In [15]:
from nltk.corpus import wordnet
def get_pos_tag_value(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
lemma = WordNetLemmatizer()

In [20]:
word_tokenize(documents[0][0])

['@',
 'SouthwestAir',
 'I',
 'am',
 'scheduled',
 'for',
 'the',
 'morning',
 ',',
 '2',
 'days',
 'after',
 'the',
 'fact',
 ',',
 'yes..not',
 'sure',
 'why',
 'my',
 'evening',
 'flight',
 'was',
 'the',
 'only',
 'one',
 'Cancelled',
 'Flightled']

In [21]:
# removing stop_words addig posTag in documents.
def clean_text(text):
    clean_text = []
    text_lst = word_tokenize(text)
    #print(text_lst,end=" ")
    for t in text_lst:
        if t.lower() not in stop_words:
            postag = pos_tag([t])
            #print(type(t))
            clean_word = lemma.lemmatize(t, pos = get_pos_tag_value(postag[0][1]))
            clean_text.append(clean_word.lower())
    return clean_text

In [22]:
clean_documents = [(clean_text(text), response) for text, response in documents]

In [23]:
print(clean_documents[0][0], clean_documents[0][1], end=" ")

['southwestair', 'schedule', 'morning', '2', 'day', 'fact', 'yes..not', 'sure', 'even', 'flight', 'one', 'cancelled', 'flightled'] negative 

In [24]:
type(clean_documents), len(clean_documents)

(list, 10980)

In [25]:
x_test_clean = [clean_text(t) for t in x_test]
print(x_test_clean[0])
print(x_test[0])

['americanair', 'car', 'gng', 'dfw', 'pulled', '1hr', 'ago', 'icy', 'road', 'on-hold', 'aa', 'since', '1hr', 'ca', "n't", 'reach', 'arpt', 'aa2450', 'wat', '2']
@AmericanAir In car gng to DFW. Pulled over 1hr ago - very icy roads. On-hold with AA since 1hr. Can't reach arpt for AA2450. Wat 2 do?


In [44]:
all_words = []
for doc in clean_documents:
    all_words += doc[0]
print(all_words[:10],end=' ')

['southwestair', 'schedule', 'morning', '2', 'day', 'fact', 'yes..not', 'sure', 'even', 'flight'] 

In [45]:
len(all_words)

119926

In [46]:
import nltk
freq = nltk.FreqDist(all_words)
most_common = freq.most_common(3400)

In [47]:
l1 = ['...',  "''", '``',  "''", '``', ]
print(most_common[:10], end=" ")

[('flight', 3342), ('united', 3178), ('usairways', 2253), ('americanair', 2234), ('southwestair', 1841), ('jetblue', 1762), ('get', 1542), ("n't", 1535), ("'s", 1113), ('http', 906)] 

In [48]:
features = []
for f in most_common:
    if f not in l1:
        features.append(f[0])
# features = features[10:]
# features.remove('``')
# features.remove("''")
# features.remove('’')
# features.remove('..')

# features.remove('...')
# features.remove('u')
# features.remove('..')
# features.remove('..')
# features.remove('..')
len(features)

3400

In [49]:
print(features[:10], end="")

['flight', 'united', 'usairways', 'americanair', 'southwestair', 'jetblue', 'get', "n't", "'s", 'http']

In [51]:
def feature_dic(words, features):
    f_dictionary = {}
    word_set = set(words)
    for f in features:
        cnt = 0
        for w in word_set:
            if f == w:
                cnt+=1
        f_dictionary[f] = cnt
    return f_dictionary

In [55]:
training_data = [(feature_dic(doc, features), category) for doc, category in clean_documents]
#print(training_data[:1],end=" ")

In [53]:
#print(x_test_clean[0],end="")
x_test_tokenize = [word_tokenize(x) for x in x_test] 
print(x_test_tokenize[0], end=" ")

['@', 'AmericanAir', 'In', 'car', 'gng', 'to', 'DFW', '.', 'Pulled', 'over', '1hr', 'ago', '-', 'very', 'icy', 'roads', '.', 'On-hold', 'with', 'AA', 'since', '1hr', '.', 'Ca', "n't", 'reach', 'arpt', 'for', 'AA2450', '.', 'Wat', '2', 'do', '?'] 

In [57]:
testing_data = [(feature_dic(doc, features)) for doc in x_test_tokenize]
#print(testing_data[:2], end=" ")

In [59]:
clf = NaiveBayesClassifier.train(training_data) 
# module 'nltk.classify' has no attribute 'predict'

In [60]:
# Accuracy by selecting top_most 3400 words
nltk.classify.accuracy(clf, training_data)

0.8450819672131148

In [61]:
nltk_y_pred = clf.classify_many(testing_data)

In [62]:
len(nltk_y_pred)

3660

In [64]:
all_output_train = [response for text, response in clean_documents]
all_input_train = [" ".join(text) for text, response in clean_documents]
#all_input_test =  [" ".join(text) for text in x_test_clean]
all_input_test =  [" ".join(text) for text in x_test_tokenize]
print(all_input_train[0])
print()
print(all_input_test[0])

southwestair schedule morning 2 day fact yes..not sure even flight one cancelled flightled

@ AmericanAir In car gng to DFW . Pulled over 1hr ago - very icy roads . On-hold with AA since 1hr . Ca n't reach arpt for AA2450 . Wat 2 do ?


In [65]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [66]:
count_vec = CountVectorizer(max_features = 12500, ngram_range = (1, 2))
x_train_new = count_vec.fit_transform(all_input_train)
x_test_new = count_vec.transform(all_input_test)
y_train_new = all_output_train

In [67]:
rf = RandomForestClassifier()
rf.fit(x_train_new, y_train_new)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [68]:
rf.score(x_train_new, y_train_new)

0.9811475409836066

In [69]:
y_predrf = rf.predict(x_test_new)

In [70]:
svc = SVC(750, kernel = 'rbf')
svc.fit(x_train_new, y_train_new)

SVC(C=750, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [71]:
svc.score(x_train_new, y_train_new)

0.8937158469945355

In [72]:
y_predsvm = svc.predict(x_test_new)
len(y_predsvm)

3660

In [1]:
# import numpy
# numpy.savetxt('twitterResult.csv', y_predrf, delimiter = ",", fmt="%s")