In [1]:
import pandas as pd
import nltk, re, pprint,random
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.probability import FreqDist
from nltk.classify import SklearnClassifier
from nltk import pos_tag, word_tokenize, NaiveBayesClassifier, classify, bigrams
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import string
import re, math
from collections import Counter



In [2]:
# read in airline reviews
airline_data = pd.read_csv('airline.csv', encoding = 'latin1')

In [3]:
airline_data.shape

(41396, 20)

In [4]:
#create a smaller dataframe of just the airlines name and the review for easier analysis.
airline = airline_data[['airline_name','content']]

In [5]:
# add the name of the airline to the review for calculating lift frequency 
#with other airlines mentioned in the review
content_air=airline['airline_name']+' '+ airline['content']
content_air
airline['content2']=content_air

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [6]:
airline[:3]

Unnamed: 0,airline_name,content,content2
0,adria-airways,Outbound flight FRA/PRN A319. 2 hours 10 min f...,adria-airways Outbound flight FRA/PRN A319. 2 ...
1,adria-airways,Two short hops ZRH-LJU and LJU-VIE. Very fast ...,adria-airways Two short hops ZRH-LJU and LJU-V...
2,adria-airways,Flew Zurich-Ljubljana on JP365 newish CRJ900. ...,adria-airways Flew Zurich-Ljubljana on JP365 n...


In [7]:
#find the top 10 airlines reviewed. all 41,396 reviews contain the name of the airline.
top_10_airlines_vc=airline['airline_name'].value_counts()[:10]
top_10_airlines=list(top_10_airlines_vc.index.values)
#count number of times top 10 airlines occur in content2
#just value counts for top 10
top_10_airlines_vc

spirit-airlines      990
british-airways      901
united-airlines      840
jet-airways          727
air-canada-rouge     715
emirates             691
ryanair              658
american-airlines    612
lufthansa            600
qantas-airways       580
Name: airline_name, dtype: int64

In [8]:
#lift of top 10 brands occurring together
def lift(a, b):
    num = len(airline)
    num_a = len(airline[airline['content2'].str.contains(a)])
    num_b = len(airline[airline['content2'].str.contains(b)])
    if_has_a = airline[airline['content2'].str.contains(a)]
    num_a_and_b = len(if_has_a['content2'][if_has_a['content2'].str.contains(b)])
    return num*float(num_a_and_b)/(num_a * num_b)

In [9]:
lift_df = pd.DataFrame(columns=top_10_airlines, index=top_10_airlines)

for brand1, series in list(lift_df.iterrows()):
    for brand2 in series.index:
        if brand1 != brand2:
            lift_df[brand1].loc[brand2] = lift(brand2, brand1)

lift_df
#here, we see that it is not common to compare airlines in reviews. 
#therefore, when dealing with sentiment, we do not need to worry about proximity

Unnamed: 0,spirit-airlines,british-airways,united-airlines,jet-airways,air-canada-rouge,emirates,ryanair,american-airlines,lufthansa,qantas-airways
spirit-airlines,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
british-airways,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
united-airlines,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0
jet-airways,0.0,0.0,0.0,,0.0,0.0813441,0.0,0.0,0.0,0.0
air-canada-rouge,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0
emirates,0.0,0.0,0.0,0.0813441,0.0,,0.0,0.0,0.0,0.203921
ryanair,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0
american-airlines,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0
lufthansa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0
qantas-airways,0.0,0.0,0.0,0.0,0.0,0.203921,0.0,0.0,0.0,


In [10]:
#remove stopwords and make all words lower case

stop = stopwords.words('english')
airline_data['content_clean'] = airline_data['content'].str.lower()
airline_data['content_clean'] = airline_data['content_clean'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
airline_data[:1]

Unnamed: 0,airline_name,link,title,author,author_country,date,content,aircraft,type_traveller,cabin_flown,...,overall_rating,seat_comfort_rating,cabin_staff_rating,food_beverages_rating,inflight_entertainment_rating,ground_service_rating,wifi_connectivity_rating,value_money_rating,recommended,content_clean
0,adria-airways,/airline-reviews/adria-airways,Adria Airways customer review,D Ito,Germany,2015-04-10,Outbound flight FRA/PRN A319. 2 hours 10 min f...,,,Economy,...,7.0,4.0,4.0,4.0,0.0,,,4.0,1,outbound flight fra/prn a319. 2 hours 10 min f...


In [11]:
small=airline_data[:100]

# Function to Find the top 100 words in a Set of Reviews

In [12]:
def content_clean_words(df):
    """
    tokenizes the words in the review
    returns a list of words
    """
    content_list=df.content_clean.values.tolist()
    words=[]
    for i in range(0,len(content_list)):
        words1=word_tokenize(content_list[i])
        words.append(words1)
    return words

In [13]:
def combine_content_words(words):
    """
    combines list of words into one single list
    returns a list of words
    """
    content_words = [item for sublist in words for item in sublist]
    return content_words

In [14]:
def lemmatizer(content_words):
    """
    removes the ending of words in a list
    returns a list of words with the lemmas removed
    """
    content_words_no_punc = [w for w in content_words if w.isalpha()]
    porter=nltk.PorterStemmer()
    filtered_words= [porter.stem(t) for t in content_words_no_punc] #lemmatize
    return filtered_words

In [15]:
def top_words(filtered_words):
    """
    finds the frequency of all of the words in a list
    returns a list of top 100 words by how often they appear in the list
    """
    freqdist = FreqDist(filtered_words)
    top_100=freqdist.most_common(100)
    return top_100

In [16]:
def super_function(df):
    """
    takes in a the airline dataframe and 
    calls content_clean_words, combine_content_words, lemmatizer, and top_words 
    returns a list of words and their frequency
    """
    top=top_words(lemmatizer(combine_content_words(content_clean_words(df))))
    return top

# What Do Reviewers From The Most Represented Countries Care About The Most?
Relative to other top countries

In [17]:
#finds the top nationalities that wrote reviews 
#39805 out of the 41396 reviews have a nationality represented, or about 96% of the reviews.
top_5_origin=airline_data['author_country'].value_counts()[:5]
top_5_origin

United Kingdom    9969
United States     8507
Australia         5062
Canada            3303
Germany           1117
Name: author_country, dtype: int64

In [18]:

def top_words_to_list(a):
    """
    takes in a list of lists of words
    returns a single combined list of words
    """
    top_words_list=[]
    for tup in range(len(a)):
        word=a[tup][0]
        top_words_list.append(word)
    return top_words_list

In [19]:
def top_word_counts(a):
    """
    takes in a list of word frequency
    returns a list of each words frequency
    """    
    count_words_list=[]
    for tup in range(len(a)):
        word=a[tup][1]
        count_words_list.append(word)
    return count_words_list

In [20]:
# create seperate dataframes for each of the top 5 countries.
df_uk=airline_data[airline_data['author_country']=='United Kingdom']
df_us=airline_data[airline_data['author_country']=='United States']
df_aus=airline_data[airline_data['author_country']=='Australia']
df_can=airline_data[airline_data['author_country']=='Canada']
df_ger=airline_data[airline_data['author_country']=='Germany']
#find the top 100 words for each of the top 5 countries
top_uk_words=super_function(df_uk)
top_us_words=super_function(df_us)
top_aus_words=super_function(df_aus)
top_can_words=super_function(df_can)
top_ger_words=super_function(df_ger)


In [21]:
#normalize the frequency of the top 100 words for each country
#divide each word by the total number of words in each country's reviews (len(content_words))
#build a df with rows=words, col=countries
content_words_us=len(combine_content_words(content_clean_words(df_us)))
content_words_uk=len(combine_content_words(content_clean_words(df_uk)))
content_words_aus=len(combine_content_words(content_clean_words(df_aus)))
content_words_can=len(combine_content_words(content_clean_words(df_can)))
content_words_ger=len(combine_content_words(content_clean_words(df_ger)))
#create lists of the top 100 words for each of the top 5 countries
list_uk_words = top_words_to_list(top_uk_words)
list_us_words = top_words_to_list(top_us_words)
list_aus_words = top_words_to_list(top_aus_words)
list_can_words = top_words_to_list(top_can_words)
list_ger_words = top_words_to_list(top_ger_words)
#create lists of the frequency of the top 100 words for each of the top 5 countries
list_uk_counts = top_word_counts(top_uk_words)
list_us_counts = top_word_counts(top_us_words)
list_aus_counts = top_word_counts(top_aus_words)
list_can_counts = top_word_counts(top_can_words)
list_ger_counts = top_word_counts(top_ger_words)
#put everything into pretty dataframes
df_uk_ = pd.DataFrame(index=list_uk_words,columns=['UK Word Counts'])
df_uk_['UK Word Counts']=list_uk_counts
df_us_ = pd.DataFrame(index=list_us_words,columns=['US Word Counts'])
df_us_['US Word Counts']=list_us_counts
df_aus_ = pd.DataFrame(index=list_aus_words,columns=['AUS Word Counts'])
df_aus_['AUS Word Counts']=list_aus_counts
df_can_ = pd.DataFrame(index=list_can_words,columns=['CAN Word Counts'])
df_can_['CAN Word Counts']=list_can_counts
df_ger_ = pd.DataFrame(index=list_ger_words,columns=['GER Word Counts'])
df_ger_['GER Word Counts']=list_ger_counts

In [22]:
#combines the normalized frequency of words by each country into a single dataframe
df2=df_uk_.join(df_us_,how='outer')
df3=df2.join(df_aus_, how='outer')
df4=df3.join(df_can_,how='outer')
df_full=df4.join(df_ger_,how='outer')
df_full=df_full.fillna(0)
df_full['US Word Counts']=df_full['US Word Counts']/content_words_us
df_full['UK Word Counts']=df_full['UK Word Counts']/content_words_uk
df_full['AUS Word Counts']=df_full['AUS Word Counts']/content_words_aus
df_full['CAN Word Counts']=df_full['CAN Word Counts']/content_words_can
df_full['GER Word Counts']=df_full['GER Word Counts']/content_words_ger
df_full

Unnamed: 0,UK Word Counts,US Word Counts,AUS Word Counts,CAN Word Counts,GER Word Counts
ac,0.000000,0.000000,0.000000,0.002348,0.000000
again,0.002053,0.002245,0.002084,0.002311,0.001536
air,0.002214,0.002071,0.002558,0.006748,0.002503
aircraft,0.003094,0.000000,0.002755,0.001636,0.002636
airlin,0.005825,0.008038,0.006628,0.006179,0.005457
airport,0.003196,0.003831,0.003003,0.003614,0.003258
also,0.001733,0.001680,0.001876,0.001502,0.002199
alway,0.000000,0.000000,0.000000,0.000000,0.001603
anoth,0.000000,0.002358,0.000000,0.001884,0.000000
arriv,0.003401,0.003509,0.003514,0.003460,0.003046


The fact that the dataframe has 158 words and not 400 words means that 
reviewers use similar language to describe their experience. Out of the 5 countries, we can see what factors or attributes of a flight each country cares about the most. 

In [23]:
#find what country mentions each word the most.
top_us_word_freq = df_full['US Word Counts'].where(df_full.T.idxmax().values == 'US Word Counts').dropna().sort_values(ascending = False)
top_uk_word_freq = df_full['UK Word Counts'].where(df_full.T.idxmax().values == 'UK Word Counts').dropna().sort_values(ascending = False)
top_aus_word_freq = df_full['AUS Word Counts'].where(df_full.T.idxmax().values == 'AUS Word Counts').dropna().sort_values(ascending = False)
top_can_word_freq = df_full['CAN Word Counts'].where(df_full.T.idxmax().values == 'CAN Word Counts').dropna().sort_values(ascending = False)
top_ger_word_freq = df_full['GER Word Counts'].where(df_full.T.idxmax().values == 'GER Word Counts').dropna().sort_values(ascending = False)

In [24]:
top_us_word_freq
top_uk_word_freq
top_aus_word_freq
top_can_word_freq
top_ger_word_freq

seat         0.014133
servic       0.009921
good         0.009060
time         0.009020
crew         0.006888
class        0.005881
friendli     0.005232
one          0.004676
drink        0.004623
economi      0.004450
new          0.003576
nice         0.003470
even         0.003470
offer        0.003378
serv         0.003351
like         0.002596
old          0.002543
better       0.002305
also         0.002199
lh           0.002185
clean        0.002185
lufthansa    0.002185
choic        0.002172
well         0.002172
quit         0.002119
long         0.002027
small        0.002000
frankfurt    0.001987
free         0.001960
due          0.001934
realli       0.001907
price        0.001868
qualiti      0.001815
full         0.001815
space        0.001801
ife          0.001775
ok           0.001762
berlin       0.001669
snack        0.001669
work         0.001669
alway        0.001603
short        0.001576
inform       0.001563
ground       0.001550
system       0.001550
row       

#### Words that stand out for the each country:
US - delay, board, bag, gate, connect, ticket, cancel, late <br>
(Not shockingly, Americans seem to have the least expansive vocabulary out of the top 5 countries. They also have the most usage of the most common words. And Americans seem to be most focus on the logistics of air travel) <br>
UK - cabin, leg <br>
AUS - food, meal, comfort, entertain  <br>
(australians are hedonist and also probably because they fly so much focus on the experience) <br>
CAN - luggage, movie <br>
(most of the words are canadian related lol. would be interested to see if canadians leave the most positive reviews) <br>
GER - service, crew, friendly, drink, serv, drink, economic, clean snack <br>
(Germans are mostly interested in service)

# Do passengers in Different Parts of the Cabin Care about Different Things?

In [25]:
#Out of the 41396 reviewers, 38520 indicated what cabin type they flew in
cabine_type_counts=airline_data['cabin_flown'].value_counts()
cabine_type_counts
#77% of reviewers flew economy

Economy            29784
Business Class      6347
Premium Economy     1510
First Class          879
Name: cabin_flown, dtype: int64

In [26]:
# create seperate dataframes for each of cabin types
df_econ=airline_data[airline_data['cabin_flown']=='Economy']
df_biz=airline_data[airline_data['cabin_flown']=='Business Class']
df_premecon=airline_data[airline_data['cabin_flown']=='Premium Economy']
df_fc=airline_data[airline_data['cabin_flown']=='First Class']


In [29]:
#find the top 100 words for each of the cabin types
top_econ_words=super_function(df_econ)
top_biz_words=super_function(df_biz)
top_premecon_words=super_function(df_premecon)
top_fc_words=super_function(df_fc)


In [30]:
#normalize the frequency of the top 100 words for each country
#divide each word by the total number of words in each country's reviews (len(content_words))
#build a df with rows=words, col=countries
content_words_econ=len(combine_content_words(content_clean_words(df_econ)))
content_words_biz=len(combine_content_words(content_clean_words(df_biz)))
content_words_premecon=len(combine_content_words(content_clean_words(df_premecon)))
content_words_fc=len(combine_content_words(content_clean_words(df_fc)))
#create lists of the top 100 words for each of the top 5 countries
list_econ_words = top_words_to_list(top_econ_words)
list_biz_words = top_words_to_list(top_biz_words)
list_premecon_words = top_words_to_list(top_premecon_words)
list_fc_words = top_words_to_list(top_fc_words)
#create lists of the frequency of the top 100 words for each of the top 5 countries
list_econ_counts = top_word_counts(top_econ_words)
list_biz_counts = top_word_counts(top_biz_words)
list_preemecon_counts = top_word_counts(top_premecon_words)
list_fc_counts = top_word_counts(top_fc_words)
#put everything into pretty dataframes
df_econ_ = pd.DataFrame(index=list_econ_words,columns=['Ecomony Word Counts'])
df_econ_['Ecomony Word Counts']=list_econ_counts
df_biz_ = pd.DataFrame(index=list_biz_words,columns=['Business Class Word Counts'])
df_biz_['Business Class Word Counts']=list_biz_counts
df_premecon_ = pd.DataFrame(index=list_premecon_words,columns=['Premium Economy Word Counts'])
df_premecon_['Premium Economy Word Counts']=list_preemecon_counts
df_fc_ = pd.DataFrame(index=list_fc_words,columns=['First Class Word Counts'])
df_fc_['First Class Word Counts']=list_fc_counts


In [32]:
#combines the normalized frequency of words by each cabin type into a single dataframe
df_temp1=df_biz_.join(df_econ_,how='outer')
df_temp2=df_temp1.join(df_premecon_, how='outer')
df_full_temp=df_temp2.join(df_fc_,how='outer')
df_full_temp=df_full_temp.fillna(0)
df_full_temp['Ecomony Word Counts']=df_full_temp['Ecomony Word Counts']/content_words_econ
df_full_temp['Business Class Word Counts']=df_full_temp['Business Class Word Counts']/content_words_biz
df_full_temp['Premium Economy Word Counts']=df_full_temp['Premium Economy Word Counts']/content_words_premecon
df_full_temp['First Class Word Counts']=df_full_temp['First Class Word Counts']/content_words_fc
top_econ_word_freq = df_full_temp['Ecomony Word Counts'].where(df_full_temp.T.idxmax().values == 'Ecomony Word Counts').dropna().sort_values(ascending = False)
top_biz_word_freq = df_full_temp['Business Class Word Counts'].where(df_full_temp.T.idxmax().values == 'Business Class Word Counts').dropna().sort_values(ascending = False)
top_premecon_word_freq = df_full_temp['Premium Economy Word Counts'].where(df_full_temp.T.idxmax().values == 'Premium Economy Word Counts').dropna().sort_values(ascending = False)
top_fc_word_freq = df_full_temp['First Class Word Counts'].where(df_full_temp.T.idxmax().values == 'First Class Word Counts').dropna().sort_values(ascending = False)



In [37]:
top_econ_word_freq
top_biz_word_freq
top_premecon_word_freq
top_fc_word_freq

first       0.019308
class       0.017340
servic      0.010984
great       0.003657
experi      0.003112
attend      0.002899
trip        0.002247
also        0.002221
like        0.002167
gate        0.002021
two         0.001822
unit        0.001769
need        0.001755
ba          0.001675
last        0.001662
aa          0.001622
best        0.001609
product     0.001609
although    0.001423
fa          0.001410
expect      0.001410
though      0.001343
look        0.001330
Name: First Class Word Counts, dtype: float64

**COSINE SIMILARITY- SPIRIT AIRLINES and BRITISH AIRWAYS**

In [58]:
df_spirit=airline_data[airline_data['airline_name']=='spirit-airlines']
df_spirit=df_spirit[['airline_name','content']] #limit to 15 rows to test


In [39]:
#list of texts for the content
text_list_spirit=list(df_spirit['content'])
text_list_spirit[0]

u"I was curious and nervous to try this airline due to other reviews I had read online but glad I did. While Spirit Airlines doesn't offer the inflight amenities that other airlines do, my flight was excellent. The flight boarded on time, went smoothly and arrived at my destination on top. Since I was tired, I appreciated the opportunity to rest and not be bothered. Others complain about the stringent rules Spirit Airlines has regarding carry-on luggage. My advice either pack light or follow the airline rules."

In [40]:


WORD = re.compile(r'\w+')

def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

cosine_total=0
count=len(text_list_spirit)
for text in text_list_spirit:
    text1 = "Ultra low fares Easy online booking and check-in Reliable, on-time service Clean, fuel efficient airplanes \
    Friendly staff Deluxe leather seating One personal item that fits under the seat Largest ULCC network in the U.S., Latin America and Caribbean\
    committed to offering the lowest total price to the places we fly, on average much lower than other airlines.\
    We help people save money and travel more often, create new jobs and stimulate business growth in the communities we serve"
    text2 = text

    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)

    cosine = get_cosine(vector1, vector2)
    cosine_total+=cosine
cosine_avg_spirit=float(cosine_total)/count    

print 'Cosine:', cosine_avg_spirit

Cosine: 0.339100343538


In [41]:
df_british=airline_data[airline_data['airline_name']=='british-airways']
df_british=df_british[['airline_name','content']]
#list of texts for the content
text_list_british=list(df_british['content'])

In [42]:
cosine_total=0
count=len(text_list_british)
for text in text_list_british:
    text1 = "Our customers will recognise that the service we offer is worth paying that little bit more \
    world’s leading global premium airline \
    forchoice for longhaul premium customers Deliver an outstanding service for customers \
    Grow our presence in key global cities leading position in London offer customers great value \
    bringing people together"
    text2 = text

    vector1 = text_to_vector(text1)
    vector2 = text_to_vector(text2)

    cosine = get_cosine(vector1, vector2)
    cosine_total+=cosine
cosine_avg_british=float(cosine_total)/count    

print 'Cosine:', cosine_avg_british

Cosine: 0.124313185049


**SENTIMENT ANALYSIS WORK**

In [91]:
#att dictionary
attr_dict = {'delayed':'scheduling','on time':'scheduling','late':'scheduling','early':'scheduling',
            'food':'service','flight attendant':'service','beverage':'service','drink':'service','breakfast':'service','lunch':'service','dinner':'service','meal':'service',
            'available':'booking','cancel':'booking','refund':'booking','ticket':'booking','customer service':'booking',
            'expensive':'price','cheap':'price','cost':'price','value':'price',
            'seat':'comfort','leg room':'comfort',
            'movie':'entertainment','tv':'entertainment','wifi':'entertainment','gogo':'entertainment'}
list_imp_words=attr_dict.keys()
for word in ['scheduling','service','booking','price','comfort','entertainment']:
    list_imp_words.append(word)

In [92]:
def build_review_list(airline_df):
    list_of_review_words=content_clean_words(airline_df) #list of list of words
    #remove punctuation later
    return list_of_review_words

In [93]:
def sentiment_of_nearby_words(list_of_review_words,list_imp_words): 
    word_sentiments=pd.DataFrame(columns=['imp words','nearby words', 'sent score'])
    nearby_words=[]
    df_words=[]
    for review in list_of_review_words:
        for word in list_imp_words:

            for i in range(len(review)):
                if word == review[i]:
                    if i==0:
                        nearby_word_list = review[i] + ' ' + review[i+1] + ' ' + review[i+2]
                        nearby_words.append(nearby_word_list)
                        df_words.append(word)
                    elif i==1:
                        nearby_word_list = review[i-1] + ' ' + review[i] + ' ' + review[i+1] + ' ' + review[i+2]
                        nearby_words.append(nearby_word_list)
                        df_words.append(word)
                    elif i==(len(review)-2):
                        nearby_word_list = review[i-2] + ' ' + review[i-1] + ' ' + review[i] + ' ' + review[i+1]
                        nearby_words.append(nearby_word_list)
                        df_words.append(word)
                    elif i==(len(review)-1):
                        nearby_word_list = review[i-2] + ' ' + review[i-1] + ' ' + review[i] 
                        nearby_words.append(nearby_word_list)
                        df_words.append(word)
                    else:
                        nearby_word_list = review[i-2] + ' ' + review[i-1] + ' ' + review[i] + ' ' + review[i+1] + ' ' + review[i+2] 
                        nearby_words.append(nearby_word_list)
                        df_words.append(word)
    word_sentiments['imp words']=df_words
    word_sentiments['nearby words']=nearby_words
    return word_sentiments

In [94]:
sid = SentimentIntensityAnalyzer()
def sent_int_analyze(nearby_words_df):
    for i in range(len(nearby_words_df)):

        temp_line = nearby_words_df['nearby words'][i]

        ss = SentimentIntensityAnalyzer().polarity_scores(temp_line)

        nearby_words_df['sent score'][i] = ss['compound']
    return word_sentiments

In [95]:
attr_dict = {'delayed':'scheduling','on time':'scheduling','late':'scheduling','early':'scheduling',
            'food':'service','flight attendant':'service','beverage':'service','drink':'service','breakfast':'service','lunch':'service','dinner':'service','meal':'service',
            'available':'booking','cancel':'booking','refund':'booking','ticket':'booking','customer service':'booking',
            'expensive':'price','cheap':'price','cost':'price','value':'price',
            'seat':'comfort','leg room':'comfort',
            'movie':'entertainment','tv':'entertainment','wifi':'entertainment','gogo':'entertainment'}

In [96]:
def replace_imp_words(s):
    attr=[]
    for word in s:
        if word in attr_dict:
            attr.append(attr_dict[word])
        else:
            attr.append(word)
    return attr

In [97]:
def find_avg_sent_score(word_sent_df):
    word_sent_df['attributes']=replace_imp_words(word_sent_df['imp words'])
    word_sent_df['sent score']=word_sent_df['sent score'].astype(float)
    return word_sent_df.groupby('attributes')[['sent score']].mean()

In [98]:
def super_sentiment_function(individual_airline_df):
    a = build_review_list(individual_airline_df)
    b = sentiment_of_nearby_words(a,list_imp_words)
    c = sent_int_analyze(b)
    d = find_avg_sent_score(c)
    return d

Unnamed: 0_level_0,sent score
attributes,Unnamed: 1_level_1
booking,0.005273
comfort,0.043779
entertainment,0.264377
price,0.043566
scheduling,-0.138305
service,-0.039023


In [152]:
df_acr = airline_data[airline_data['airline_name']=='air-canada-rouge']
sent_acr = super_sentiment_function(df_acr)
sent_acr

Unnamed: 0_level_0,sent score
attributes,Unnamed: 1_level_1
booking,0.005273
comfort,0.043779
entertainment,0.264377
price,0.043566
scheduling,-0.138305
service,-0.039023


In [154]:
df_acr

Unnamed: 0,airline_name,link,title,author,author_country,date,content,aircraft,type_traveller,cabin_flown,...,overall_rating,seat_comfort_rating,cabin_staff_rating,food_beverages_rating,inflight_entertainment_rating,ground_service_rating,wifi_connectivity_rating,value_money_rating,recommended,content_clean
1585,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Roger Duarte Neves,Canada,2015-08-01,"Flew to Lisbon from Toronto July 27 2015, plan...",Boeing 767-300,Couple Leisure,Economy,...,4.0,1.0,4.0,1.0,1.0,3.0,3.0,2.0,0,"flew lisbon toronto july 27 2015, plane older ..."
1592,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Mustapha Aissaoui,Canada,2015-07-31,We are a family of 5 who had booked a flight f...,Boeing 767-300,FamilyLeisure,Economy,...,2.0,1.0,4.0,2.0,1.0,3.0,1.0,1.0,0,family 5 booked flight montreal athens aegean ...
1593,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Colin Dinh,Canada,2015-07-27,I was hesitant to book this airline going to O...,,FamilyLeisure,Economy,...,8.0,3.0,5.0,5.0,3.0,4.0,,4.0,1,hesitant book airline going osaka due negative...
1600,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Fran Booth,Canada,2015-07-27,I checked our family of 6 in online before we ...,Boeing 767,FamilyLeisure,Economy,...,2.0,1.0,2.0,1.0,1.0,4.0,3.0,1.0,0,"checked family 6 online departed yvr. reason, ..."
1601,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Christopher Neep,Canada,2015-07-25,Air Canada rouge was a disgrace at least on th...,Airbus 319,Solo Leisure,Economy,...,1.0,1.0,1.0,1.0,,2.0,,2.0,0,air canada rouge disgrace least flight. aircra...
1608,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Joyce Zak,Canada,2015-07-24,"Fine going to Vancouver, but on return there w...",,Solo Leisure,Economy,...,3.0,3.0,2.0,,,1.0,,1.0,0,"fine going vancouver, return big delay! 3 hour..."
1609,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Natalia Tchernycheva,Canada,2015-07-24,Flying AC1753 from Varadero to Toronto (connec...,,Solo Leisure,Economy,...,1.0,2.0,2.0,1.0,,3.0,,1.0,0,flying ac1753 varadero toronto (connected fly)...
1610,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Chris Kenyon,Australia,2015-07-23,We were a bit apprehensive about flying with C...,Boeing 737,Couple Leisure,Economy,...,7.0,3.0,3.0,,,3.0,,4.0,1,bit apprehensive flying canada rouge given amo...
1620,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Tyler Christensen,United States,2015-07-22,It was a 12 hour flight from Vancouver to Osak...,,FamilyLeisure,Economy,...,1.0,1.0,2.0,2.0,1.0,2.0,1.0,1.0,0,12 hour flight vancouver osaka screen entertai...
1621,air-canada-rouge,/airline-reviews/air-canada-rouge,Air Canada rouge customer review,Chris Turcotte,Canada,2015-07-22,I would not recommend Air Canada Rouge to anyo...,,FamilyLeisure,Economy,...,1.0,3.0,5.0,,3.0,1.0,5.0,2.0,0,would recommend air canada rouge anyone. start...


In [159]:
a = build_review_list(df_acr)
b = sentiment_of_nearby_words(a,list_imp_words)
c = sent_int_analyze(b)
d = find_avg_sent_score(c)
d

Unnamed: 0_level_0,sent score
attributes,Unnamed: 1_level_1
booking,0.005273
comfort,0.043779
entertainment,0.264377
price,0.043566
scheduling,-0.138305
service,-0.039023


In [160]:
c

Unnamed: 0,imp words,nearby words,sent score,attributes
0,seat,"want random seat , drinks",0.0772,comfort
1,cost,trying low cost budget airline,-0.2732,price
2,cost,. combine cost fact cancel,-0.2500,price
3,cancel,cost fact cancel one connecting,-0.2500,booking
4,expensive,", makes expensive airline is",0.0000,price
5,late,"whole day late , top",0.2023,scheduling
6,seat,bag . seat requests anywhere,0.0000,comfort
7,ticket,hour wait ticket counter causing,0.0000,booking
8,ticket,hour wait ticket counter got,0.0000,booking
9,service,horrible customer service . first,-0.5423,service


In [100]:
df_spirit=airline_data[airline_data['airline_name']=='spirit-airlines']
df_ba=airline_data[airline_data['airline_name']=='british-airways']
df_ua=airline_data[airline_data['airline_name']=='united-airlines']
df_ja=airline_data[airline_data['airline_name']=='jet-airways']
df_acr=airline_data[airline_data['airline_name']=='air-canada-rouge']
df_ra=airline_data[airline_data['airline_name']=='ryanair']
df_aa=airline_data[airline_data['airline_name']=='american-airlines']
df_luf=airline_data[airline_data['airline_name']=='lufthansa']
df_qa=airline_data[airline_data['airline_name']=='qantas-airways']

In [101]:
# sent_spirit = super_sentiment_function(df_spirit)
# sent_ba = super_sentiment_function(df_ba)
# sent_ua = super_sentiment_function(df_ua)
# sent_ja = super_sentiment_function(df_ja)
# sent_acr = super_sentiment_function(df_acr)
# sent_ra = super_sentiment_function(df_ra)
# sent_aa = super_sentiment_function(df_aa)
# sent_luf = super_sentiment_function(df_luf)
sent_qa = super_sentiment_function(df_qa)


In [150]:
sent_qa

Unnamed: 0_level_0,sent score
attributes,Unnamed: 1_level_1
booking,0.005273
comfort,0.043779
entertainment,0.264377
price,0.043566
scheduling,-0.138305
service,-0.039023


In [149]:
# s1=sent_spirit.merge(sent_ba,left_index = True, right_index = True,suffixes=('_spirit', '_british'))
# s2 = sq.merge(sent_ua)
# df2=df_uk_.join(df_us_,how='outer')

sent_scores_df =sent_spirit.merge(sent_ba,right_index=True, left_index=True)\
.merge(sent_ua,right_index=True, left_index=True).merge(sent_ja,right_index=True, left_index=True)\
.merge(sent_acr,right_index=True, left_index=True).merge(sent_ra,right_index=True, left_index=True)\
.merge(sent_aa,right_index=True, left_index=True).merge(sent_luf,right_index=True, left_index=True)\
.merge(sent_qa,right_index=True, left_index=True)
# sent_scores_df.columns
sent_scores_df.columns = [strip_non_ascii(x) for x in sent_scores_df.columns]
# sent_scores_df.columns['spirit_airlines_score','british_airways_score','united_airlines_score','jet_airways_score','air_canada-rouge_score','emirates_score','ryanair_score','american_airlines_score','lufthansa_score','qantas_airways_score']
sent_scores_df

Unnamed: 0_level_0,sent score_x,sent score_y,sent score_x,sent score_y,sent score_x,sent score_y,sent score_x,sent score_y,sent score
attributes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
booking,0.005273,0.005273,0.005273,0.005273,0.005273,0.005273,0.005273,0.005273,0.005273
comfort,0.043779,0.043779,0.043779,0.043779,0.043779,0.043779,0.043779,0.043779,0.043779
entertainment,0.264377,0.264377,0.264377,0.264377,0.264377,0.264377,0.264377,0.264377,0.264377
price,0.043566,0.043566,0.043566,0.043566,0.043566,0.043566,0.043566,0.043566,0.043566
scheduling,-0.138305,-0.138305,-0.138305,-0.138305,-0.138305,-0.138305,-0.138305,-0.138305,-0.138305
service,-0.039023,-0.039023,-0.039023,-0.039023,-0.039023,-0.039023,-0.039023,-0.039023,-0.039023
