In [1]:
import numpy as np
import pandas as pd

import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [2]:
eng_stopwords = stopwords.words('english')
porter = PorterStemmer()
tokenizer = TweetTokenizer()


In [3]:
train=pd.read_csv("training_twitter_x_y_train.csv").fillna(" ")
test=pd.read_csv("test_twitter_x_test.csv").fillna(" ")

In [4]:
train

Unnamed: 0,tweet_id,airline_sentiment,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,567900433542488064,negative,Southwest,,ColeyGirouard,,0,"@SouthwestAir I am scheduled for the morning, ...",,2015-02-17 20:16:29 -0800,Washington D.C.,Atlantic Time (Canada)
1,569989168903819264,positive,Southwest,,WalterFaddoul,,0,@SouthwestAir seeing your workers time in and ...,,2015-02-23 14:36:22 -0800,"Indianapolis, Indiana; USA",Central Time (US & Canada)
2,568089179520954368,positive,United,,LocalKyle,,0,@united Flew ORD to Miami and back and had gr...,,2015-02-18 08:46:29 -0800,Illinois,Central Time (US & Canada)
3,568928195581513728,negative,Southwest,,amccarthy19,,0,@SouthwestAir @dultch97 that's horse radish 😤🐴,,2015-02-20 16:20:26 -0800,,Atlantic Time (Canada)
4,568594180014014464,negative,United,,J_Okayy,,0,@united so our flight into ORD was delayed bec...,,2015-02-19 18:13:11 -0800,,Eastern Time (US & Canada)
...,...,...,...,...,...,...,...,...,...,...,...,...
10975,569934458364813313,neutral,American,,Cottopanama85,,0,@AmericanAir followback,,2015-02-23 10:58:58 -0800,"ohio,panama",
10976,568564006329434113,positive,United,,PaulBEsteves,,0,@united thanks for the help. Wish the phone re...,,2015-02-19 16:13:17 -0800,Brooklyn,Eastern Time (US & Canada)
10977,569643648910028801,negative,US Airways,,runfixsteve,,0,@usairways the. Worst. Ever. #dca #customerser...,,2015-02-22 15:43:24 -0800,"St. Augustine, Florida",
10978,568864981917110272,negative,US Airways,,CLChicosky,,0,@nrhodes85: look! Another apology. DO NOT FLY ...,,2015-02-20 12:09:15 -0800,,


In [5]:
y_train=train["airline_sentiment"]
y_train1=[]
for i in range(len(y_train)):
    if y_train[i]=="positive":
        y_train1.append(1)
    elif y_train[i]=="negative":
        y_train1.append(-1)
    else:
        y_train1.append(0)

In [6]:
train_text = train['text']
test_text = test['text']
all_text = pd.concat([train_text, test_text])

In [7]:
class_names=["airline_sentiment"]

In [8]:
def clean(text):
    """
    This function receives comments and returns clean word-list
    """
    text = text.lower()
    text = re.sub('[\\n]+', ' ', text)
    text = re.sub('[\W]+', ' ', text)

    words = tokenizer.tokenize(text)

    words = [porter.stem(word) for word in words]
    words = [w for w in words if w not in eng_stopwords]

    text_cleaned = " ".join(words)

    return(text_cleaned)

In [9]:
all_text.apply(clean)

0       southwestair schedul morn 2 day fact ye sure w...
1       southwestair see worker time time go abov beyo...
2       unit flew ord miami back great crew servic leg...
3                      southwestair dultch 97 hors radish
4       unit flight ord wa delay becaus air forc one l...
                              ...                        
3655    usairway stuck 40 minut due lavatori issu beve...
3656       usairway 4 hour 4 hour four hour like thi joke
3657    nice rt virginamerica man steel might faster w...
3658    americanair aww thank aa dfw wa gma thi unders...
3659    unit loung tell us pillow grandma one ladi ope...
Name: text, Length: 14640, dtype: object

In [10]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 2),
    max_features=3000)
word_vectorizer.fit(all_text)
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

In [11]:
char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    stop_words='english',
    ngram_range=(2, 6),
    max_features=1000)
char_vectorizer.fit(all_text)
train_char_features = char_vectorizer.transform(train_text)
test_char_features = char_vectorizer.transform(test_text)



In [12]:
from scipy.sparse import hstack
train_features = hstack([train_char_features, train_word_features])
test_features = hstack([test_char_features, test_word_features])


In [13]:
classifier = LogisticRegression(C=0.1, solver='sag')
classifier.fit(train_features,y_train1 )

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='sag', tol=0.0001, verbose=0,
                   warm_start=False)

In [14]:
predict=classifier.predict(test_features)

In [15]:
y_pred=[]
for i in range(len(predict)):
    if predict[i]==1:
        y_pred.append("positive")
    elif predict[i]==-1:
        y_pred.append("negative")
    else:
        y_pred.append("neutral")

In [16]:
y_pred

['negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'neutral',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'positive',
 'negative',
 'neutral',
 'negative',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'positive',
 'positive',
 'negative',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'nega

In [17]:
frame=pd.DataFrame(y_pred)
frame

Unnamed: 0,0
0,negative
1,negative
2,negative
3,negative
4,negative
...,...
3655,negative
3656,negative
3657,negative
3658,positive


In [19]:
frame.to_csv("pred.csv",index=False,header=None)