# Twitter Sentiment Analysis

### 1) Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import sent_tokenize , word_tokenize
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
import nltk
from nltk.classify.scikitlearn import SklearnClassifier

### 2) Loading Data........

In [2]:
training_data = pd.read_csv("training_data.csv")
testing_data = pd.read_csv("testing_data.csv")

In [3]:
training_data.shape, testing_data.shape

((10980, 12), (3660, 11))

In [4]:
training_data

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...
10975,569934458364813313,neutral,American,,Cottopanama85,,0,@AmericanAir followback,,2015-02-23 10:58:58 -0800,"ohio,panama",
10976,568564006329434113,positive,United,,PaulBEsteves,,0,@united thanks for the help. Wish the phone re...,,2015-02-19 16:13:17 -0800,Brooklyn,Eastern Time (US & Canada)
10977,569643648910028801,negative,US Airways,,runfixsteve,,0,@usairways the. Worst. Ever. #dca #customerser...,,2015-02-22 15:43:24 -0800,"St. Augustine, Florida",
10978,568864981917110272,negative,US Airways,,CLChicosky,,0,@nrhodes85: look! Another apology. DO NOT FLY ...,,2015-02-20 12:09:15 -0800,,


In [5]:
training_data.isnull().sum()

tweet_id                      0
airline_sentiment             0
airline                       0
airline_sentiment_gold    10949
name                          0
negativereason_gold       10956
retweet_count                 0
text                          0
tweet_coord               10204
tweet_created                 0
tweet_location             3550
user_timezone              3577
dtype: int64

### 3) Cleaning Data

#### A) Remove Irrelevant Columns 

In [6]:
## ALL the Columns other than text are irrelevant so just pick that column
x_train = training_data["text"].to_numpy()
y_train = training_data["airline_sentiment"].to_numpy()
x_test = testing_data["text"].to_numpy()

type(x_train), type(x_test)

(numpy.ndarray, numpy.ndarray)

In [7]:
x_train.shape

(10980,)

In [8]:
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)
y_train = y_train.reshape(len(y_train), -1)
x_train.shape, y_train.shape, x_test.shape

((10980, 1), (10980, 1), (3660, 1))

In [9]:
x_train[0], y_train[0]

(array(['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled'],
       dtype=object),
 array(['negative'], dtype=object))

#### B) Tokkenize Everything

In [10]:
print(type(x_train[0]))
print(type(str(x_train[0])))

<class 'numpy.ndarray'>
<class 'str'>


In [11]:
training_data = []
for i in range(len(x_train)):
    training_data.append([word_tokenize(str(x_train[i])), y_train[i]]) 
    ## don't make ot tuple otherwise assignment isnot possible 
print(training_data[0][0])
print(training_data[0][1])

['[', "'", '@', 'SouthwestAir', 'I', 'am', 'scheduled', 'for', 'the', 'morning', ',', '2', 'days', 'after', 'the', 'fact', ',', 'yes..not', 'sure', 'why', 'my', 'evening', 'flight', 'was', 'the', 'only', 'one', 'Cancelled', 'Flightled', "'", ']']
['negative']


In [12]:
testing_data = []
for i in range(len(x_test)):
    testing_data.append(word_tokenize(str(x_test[i])))
print(testing_data[0])

['[', '``', '@', 'AmericanAir', 'In', 'car', 'gng', 'to', 'DFW', '.', 'Pulled', 'over', '1hr', 'ago', '-', 'very', 'icy', 'roads', '.', 'On-hold', 'with', 'AA', 'since', '1hr', '.', 'Ca', "n't", 'reach', 'arpt', 'for', 'AA2450', '.', 'Wat', '2', 'do', '?', "''", ']']


#### C) Remove Stop Words, alphanumeric word, word with len smaller than 2

In [13]:
stop = stopwords.words("english")
punctuations = list(string.punctuation)
stop += punctuations
print(stop)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [14]:
### From Training Data
for i in range(len(training_data)):
    output = []
    for word in training_data[i][0]:
        if not word.isalpha() or len(word) <= 2:
            continue
        if word not in stop:
            output.append(word)
    training_data[i][0] = output

print(training_data[0][0])

['SouthwestAir', 'scheduled', 'morning', 'days', 'fact', 'sure', 'evening', 'flight', 'one', 'Cancelled', 'Flightled']


In [15]:
### From Testing Data
for i in range(len(testing_data)):
    output = []
    for word in testing_data[i]:
        if not word.isalpha() or len(word) <= 2:
            continue
        if word not in stop:
            output.append(word)
    testing_data[i] = output
print(testing_data[0])

['AmericanAir', 'car', 'gng', 'DFW', 'Pulled', 'ago', 'icy', 'roads', 'since', 'reach', 'arpt', 'Wat']


#### D) DO Lemmatization

In [16]:
lemmatizer = WordNetLemmatizer()

In [17]:
def get_simple_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.ADV
    else :
        return wordnet.NOUN

In [18]:
def clean_review(words):
    output_words = []
    for w in words:
        pos = pos_tag([w])
        clean_word = lemmatizer.lemmatize(w, pos = get_simple_pos(pos[0][1]))
        output_words.append(clean_word.lower())
    return output_words

In [19]:
training_data = [ [clean_review(training_data[i][0]), training_data[i][1]] for i in range(len(training_data))]
testing_data = [ clean_review(testing_data[i]) for i in range(len(testing_data))]

In [20]:
print(training_data[0][0])
print(testing_data[0])

['southwestair', 'schedule', 'morning', 'days', 'fact', 'sure', 'even', 'flight', 'one', 'cancelled', 'flightled']
['americanair', 'car', 'gng', 'dfw', 'pulled', 'ago', 'icy', 'roads', 'since', 'reach', 'arpt', 'wat']


### 4) Create Feature Sets

In [21]:
training_text = [" ".join(training_data[i][0]) for i in range(len(training_data))]
testing_text = [" ".join(testing_data[i]) for i in range(len(testing_data))]
len(training_text), len(testing_text)

(10980, 3660)

In [22]:
count_vec = CountVectorizer(max_features = 2900)

In [23]:
x_train = count_vec.fit_transform(training_text)
x_test = count_vec.transform(testing_text)
y_train = np.array([training_data[i][1] for i in range(len(training_data))])
type(x_train), type(x_test), type(y_train) 

(scipy.sparse.csr.csr_matrix, scipy.sparse.csr.csr_matrix, numpy.ndarray)

In [24]:
x_train = x_train.todense()
x_test  = x_test.todense()
type(x_train), type(x_test), type(y_train) 

(numpy.matrix, numpy.matrix, numpy.ndarray)

In [25]:
x_train, x_test, y_train

(matrix([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 matrix([[0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         ...,
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0],
         [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 array([['negative'],
        ['positive'],
        ['positive'],
        ...,
        ['negative'],
        ['negative'],
        ['negative']], dtype=object))

### 5) Apply Model

#### a) SVC

In [26]:
svc = SVC()
svc.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [27]:
### Score on Training Data
svc.score(x_train, y_train)

0.9061930783242259

In [28]:
### Prediction on Testing data
y_test_pred = svc.predict(x_test)
df = pd.DataFrame(y_test_pred)
np.savetxt("Prediction_SVC.csv", y_test_pred, fmt = "%s")

In [29]:
### 77.98% accuracy on Coding Ninjas

#### B) MultiNomial Naive Bayes

In [30]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

clf = MultinomialNB()
clf.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [31]:
### Score on Training Data
y_train_pred = clf.predict(x_train)
print(classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print(clf.score(x_train, y_train))

              precision    recall  f1-score   support

    negative       0.87      0.91      0.89      6851
     neutral       0.74      0.62      0.67      2327
    positive       0.79      0.79      0.79      1802

    accuracy                           0.83     10980
   macro avg       0.80      0.77      0.78     10980
weighted avg       0.82      0.83      0.83     10980

[[6249  393  209]
 [ 712 1436  179]
 [ 261  124 1417]]
0.8289617486338798


In [32]:
### Score on Testing Data
y_test_pred = clf.predict(x_test)
np.savetxt("Prediction_MNB.csv", y_test_pred, fmt = "%s")
### 38.58% accuracy on Coding Ninjas

#### C) Gaussian Naive Bayes

In [33]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix


clf = GaussianNB()
clf.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


GaussianNB(priors=None, var_smoothing=1e-09)

In [34]:
### Score on Training Data
y_train_pred = clf.predict(x_train)
print(classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print(clf.score(x_train, y_train))

              precision    recall  f1-score   support

    negative       1.00      0.40      0.57      6851
     neutral       0.42      0.42      0.42      2327
    positive       0.31      1.00      0.47      1802

    accuracy                           0.50     10980
   macro avg       0.57      0.61      0.49     10980
weighted avg       0.76      0.50      0.52     10980

[[2758 1367 2726]
 [   0  975 1352]
 [   0    1 1801]]
0.5040072859744991


In [35]:
### Score on Testing data
y_test_pred = clf.predict(x_test)
np.savetxt("Prediction_GNB.csv", y_test_pred, fmt = "%s")
#### 38.58% accuracy on Coding Ninjas

#### D) Random Forest

In [36]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(x_train, y_train)

  This is separate from the ipykernel package so we can avoid doing imports until


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [37]:
### Score on Training Data
y_train_pred = clf.predict(x_train)
print(classification_report(y_train, y_train_pred))
print(confusion_matrix(y_train, y_train_pred))
print(clf.score(x_train, y_train))

              precision    recall  f1-score   support

    negative       1.00      1.00      1.00      6851
     neutral       0.98      0.99      0.98      2327
    positive       0.98      0.99      0.99      1802

    accuracy                           0.99     10980
   macro avg       0.99      0.99      0.99     10980
weighted avg       0.99      0.99      0.99     10980

[[6823   26    2]
 [   3 2297   27]
 [   4   16 1782]]
0.992896174863388


In [38]:
### Score on Testing data
y_test_pred = clf.predict(x_test)
np.savetxt("Prediction_RFC.csv", y_test_pred, fmt = "%s")
#### 74.64% accuracy on Coding Ninjas