### Importing libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

### Importing the Dataset

In [2]:
file = 'tweets_public.csv'

In [3]:
dataset = pd.read_csv(file)

In [4]:
dataset.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,569237160886276096,negative,1.0,Can't Tell,0.6543,Delta,,venkatesh_cr,,0,@JetBlue I've been in pricing for 8 years to k...,,2015-02-21 12:48:09 -0800,Austin Texas,Central Time (US & Canada)
1,569267194028298241,negative,1.0,Customer Service Issue,1.0,Southwest,,ChristineFlores,,0,"@SouthwestAir AH - did DM, no reply. On hold n...",,2015-02-21 14:47:30 -0800,,Central Time (US & Canada)
2,569506670189137920,negative,0.6473,Lost Luggage,0.6473,United,,szymanski_t,,0,@united if you lost my belongings then BE HONEST!,,2015-02-22 06:39:05 -0800,,Eastern Time (US & Canada)
3,570293957739081728,negative,1.0,Customer Service Issue,1.0,United,,nate2482,,0,@United the internet is a great thing. I am e...,,2015-02-24 10:47:29 -0800,"Parkersburg, WV",Eastern Time (US & Canada)
4,570212129313316864,neutral,1.0,,,Delta,,elias_rubin,,0,@JetBlue I believe that the website said I cou...,,2015-02-24 05:22:20 -0800,"New York, NY",Pacific Time (US & Canada)


### Text cleaning

In [5]:
# Removing Numeric and non Alphanumeric characters
tweet = re.sub('[^a-zA-Z]', ' ', dataset['text'][0])

tweet

' JetBlue I ve been in pricing for   years to know that    bucks a seat is criminal        I understand   pricing  flying  jetblue  pricewise'

In [6]:
# Lowercase every word

tweet = tweet.lower()

tweet

' jetblue i ve been in pricing for   years to know that    bucks a seat is criminal        i understand   pricing  flying  jetblue  pricewise'

In [7]:
# Transform text to list

tweet = tweet.split()

tweet

['jetblue',
 'i',
 've',
 'been',
 'in',
 'pricing',
 'for',
 'years',
 'to',
 'know',
 'that',
 'bucks',
 'a',
 'seat',
 'is',
 'criminal',
 'i',
 'understand',
 'pricing',
 'flying',
 'jetblue',
 'pricewise']

In [8]:
# Retutn just the root of words to avoid things like love,loved,loves...

ps = PorterStemmer()

tweet = [ps.stem(word) for word in tweet if not word in set(stopwords.words('english'))]

tweet

['jetblu',
 'price',
 'year',
 'know',
 'buck',
 'seat',
 'crimin',
 'understand',
 'price',
 'fli',
 'jetblu',
 'pricewis']

In [9]:
# Making a text from the list of words separated by a space

tweet = ' '.join(tweet)

tweet

'jetblu price year know buck seat crimin understand price fli jetblu pricewis'

In [10]:
# Looping thourgh all the dataset to clean all rows
corpus = []

for i in range(len(dataset.index)):
    tweet = re.sub('[^a-zA-Z]', ' ', dataset['text'][i])
    tweet = tweet.lower()
    tweet = tweet.split()
    
    ps = PorterStemmer()

    tweet = [ps.stem(word) for word in tweet if not word in set(stopwords.words('english'))]
    
    tweet = ' '.join(tweet)
    
    corpus.append(tweet)

In [11]:
corpus

['jetblu price year know buck seat crimin understand price fli jetblu pricewis',
 'southwestair ah dm repli hold hr spent k get unit flight tmrw get home lame',
 'unit lost belong honest',
 'unit internet great thing email execut compani mayb respond time manner',
 'jetblu believ websit said could receiv credit upcom flight sinc cancel flight last one true',
 'littl late flight suck rt usairway mitchsunderland oh mitchel agent happi offer avail option',
 'usairway tri request miss mileag keep say flight say call number',
 'virginamerica achiev second year profit despit revenu pressur capa aviat http co zsuztnaijq',
 'jetblu departur time keep get late flightr lucki home',
 'americanair learn flight cancel flightl get phone see option assist onlin suggest',
 'unit sinc intern connect hope thing',
 'jetblu point delay take differ flight destin',
 'unit wors head laguardia delay tray tabl size mous pad overhead size pocket',
 'americanair usairway complaint visit custom servic desk see li

### Bag of words model

In [12]:
cv = CountVectorizer(max_features=100)

X = cv.fit_transform(corpus).toarray()

print (X,X.shape)

[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]] (8784, 100)


### Set dependent variable (reviews)

In [13]:
y = dataset.iloc[:,10].values

y

array([ "@JetBlue I've been in pricing for 8 years to know that 70 bucks a seat is criminal. 20-30 I understand. #pricing #flying #jetblue #pricewise",
       '@SouthwestAir AH - did DM, no reply. On hold now over 2hrs. Just spent over $1k to get a United flight tmrw to get home. #lame',
       '@united if you lost my belongings then BE HONEST!', ...,
       '@JetBlue flight 1183 to Orlando.',
       "@JetBlue Why not deal with that while the plane's on the ground instead of diverting the plane &amp; adding 2 hrs to the flight?",
       "See what you started now @nytimes RT @JetBlue: Our fleet's on fleek. http://t.co/atd2Sm8HF4"], dtype=object)

### Classification model: Naive bayes

In [14]:
# Split to train/test sets 

from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2,random_state = 0)



In [15]:
# Fitting Naive Bayes to the training set

from sklearn.naive_bayes import GaussianNB

classifier = GaussianNB()

classifier.fit(X_train, y_train)

GaussianNB(priors=None)

In [16]:
# Predicting the Test set results

y_pred = classifier.predict(X_test)