# Preparing Training Data

## Importing Training Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
df_train=pd.read_csv('train.csv')
df_train

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...
10975,569934458364813313,neutral,American,,Cottopanama85,,0,@AmericanAir followback,,2015-02-23 10:58:58 -0800,"ohio,panama",
10976,568564006329434113,positive,United,,PaulBEsteves,,0,@united thanks for the help. Wish the phone re...,,2015-02-19 16:13:17 -0800,Brooklyn,Eastern Time (US & Canada)
10977,569643648910028801,negative,US Airways,,runfixsteve,,0,@usairways the. Worst. Ever. #dca #customerser...,,2015-02-22 15:43:24 -0800,"St. Augustine, Florida",
10978,568864981917110272,negative,US Airways,,CLChicosky,,0,@nrhodes85: look! Another apology. DO NOT FLY ...,,2015-02-20 12:09:15 -0800,,


In [3]:
df_train=df_train[['text','airline_sentiment']]
df_train

Unnamed: 0,text,airline_sentiment
0,"@SouthwestAir I am scheduled for the morning, ...",negative
1,@SouthwestAir seeing your workers time in and ...,positive
2,@united Flew ORD to Miami and back and had gr...,positive
3,@SouthwestAir @dultch97 that's horse radish 😤🐴,negative
4,@united so our flight into ORD was delayed bec...,negative
...,...,...
10975,@AmericanAir followback,neutral
10976,@united thanks for the help. Wish the phone re...,positive
10977,@usairways the. Worst. Ever. #dca #customerser...,negative
10978,@nrhodes85: look! Another apology. DO NOT FLY ...,negative


In [4]:
training_data=df_train.values
training_data

array([['@SouthwestAir I am scheduled for the morning, 2 days after the fact, yes..not sure why my evening flight was the only one Cancelled Flightled',
        'negative'],
       ['@SouthwestAir seeing your workers time in and time out going above and beyond is why I love flying with you guys. Thank you!',
        'positive'],
       ['@united Flew ORD to Miami and back and  had great crew, service on both legs. THANKS',
        'positive'],
       ...,
       ['@usairways the. Worst. Ever. #dca #customerservice', 'negative'],
       ['@nrhodes85: look! Another apology. DO NOT FLY @USAirways',
        'negative'],
       ['@united you are by far the worst airline. 4 plane delays on 1 round trip flight. How is that possible.',
        'negative']], dtype=object)

### Splitting text into words

In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [6]:
tweets_train=[]
for i in range(len(training_data)):
    tweets_train.append([word_tokenize(training_data[i][0]) ,training_data[i][1]])


## Cleaning the Words using WordNetLemmatizer available in NLTK

In [7]:
import string
from nltk.corpus import stopwords

In [8]:
stop=(stopwords.words('english'))
punc=list(string.punctuation)
stop=stop+punc


In [9]:
from nltk.corpus import wordnet
def get_simple_pos(tag):
    
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [10]:
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

In [11]:
lemmatizer=WordNetLemmatizer()
def clean_tweets(words):
    output_words=[]
    for w in words:
        if w.isalpha():
             if w.lower() not in stop:
                    pos=pos_tag([w])
                    clean_words=lemmatizer.lemmatize(w,pos=get_simple_pos(pos[0][1]))
                    output_words.append(clean_words.lower())
    return output_words
            
            

In [12]:
for i in range(len(tweets_train)):
    tweets_train[i]=(clean_tweets(tweets_train[i][0]),tweets_train[i][1])

In [15]:
y_train=[]
tweets=[]
for tweet,sentiment in tweets_train:
    y_train.append(sentiment)
    tweets.append(" ".join(tweet))

## Using Count Vectorizer to get the X Train

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
count_vec=CountVectorizer(max_features=2000)
x_train_features=count_vec.fit_transform(tweets)

In [23]:
x_train_features

<10980x2000 sparse matrix of type '<class 'numpy.int64'>'
	with 91538 stored elements in Compressed Sparse Row format>

# Preparing Testing Data

In [25]:
df_test=pd.read_csv("train.csv")
df_test

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...
10975,569934458364813313,neutral,American,,Cottopanama85,,0,@AmericanAir followback,,2015-02-23 10:58:58 -0800,"ohio,panama",
10976,568564006329434113,positive,United,,PaulBEsteves,,0,@united thanks for the help. Wish the phone re...,,2015-02-19 16:13:17 -0800,Brooklyn,Eastern Time (US & Canada)
10977,569643648910028801,negative,US Airways,,runfixsteve,,0,@usairways the. Worst. Ever. #dca #customerser...,,2015-02-22 15:43:24 -0800,"St. Augustine, Florida",
10978,568864981917110272,negative,US Airways,,CLChicosky,,0,@nrhodes85: look! Another apology. DO NOT FLY ...,,2015-02-20 12:09:15 -0800,,


In [26]:
testing_data=np.array(df_test['text'])

In [27]:
tweets_test=[]
for i in testing_data:
    i=clean_tweets(word_tokenize(i))
    tweets_test.append(" ".join(i))

In [28]:
x_test_features=count_vec.transform(tweets_test)

# Performing Classification

### Support Vector Machine

In [31]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

In [32]:
svc=SVC()
svc.fit(x_train_features,y_train)

In [33]:
y_pred_svm=svc.predict(x_test_features)

In [34]:
df=pd.DataFrame(y_pred_svm)
df.to_csv('prediction_svm.csv', index=False, header=False)

### Random Forest Classifier

In [36]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
rf=RandomForestClassifier()
rf.fit(x_train_features,y_train)

In [39]:
y_pred_rf=rf.predict(x_test_features)

In [40]:
df=pd.DataFrame(y_pred_rf)
df.to_csv('prediction_rf.csv', index=False, header=False)

### Multinomial Naive Bayes

In [52]:
from  sklearn.naive_bayes import MultinomialNB 

In [53]:
mnv=MultinomialNB()
mnv.fit(x_train_features,y_train)

In [54]:
y_pred_mnv=mnv.predict(x_test_features)

In [55]:
df=pd.DataFrame(y_pred_mnv)
df.to_csv('prediction_mnv.csv', index=False, header=False)

### Descision Tree

In [56]:
from sklearn import tree


In [57]:
dt=tree.DecisionTreeClassifier()
dt.fit(x_train_features,y_train)

In [58]:
y_pred_dt=dt.predict(x_test_features)

In [59]:
df=pd.DataFrame(y_pred_dt)
df.to_csv('prediction_dt.csv', index=False, header=False)