In [2]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from wordcloud import WordCloud

In [3]:
# Read the reviews table, using the CSV's first column as the row index.
# Later cells expect a `reviews` text column plus one numeric column per topic.
df = pd.read_csv(
    'data/BA_reviews_topics.csv',
    index_col=0,
)

In [4]:
# Names of the per-topic score columns in the reviews table.
topics = [
    'positive',
    'budget_airlines',
    'customer_service',
    'other_negative',
    'bags_crew_seats',
    'business_class',
]


def topic_split(df, verbose=0, threshold=0.4, topic_names=None):
    """Build one concatenated text document per topic.

    For every topic, the rows of ``df`` whose score in that topic's column is
    strictly greater than ``threshold`` are selected and their ``reviews``
    texts are joined into a single string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``reviews`` column and one numeric column per topic.
    verbose : int, default 0
        When greater than 0, print the number of selected rows per topic.
    threshold : float, default 0.4
        Minimum (exclusive) topic score for a review to count towards a topic.
    topic_names : iterable of str, optional
        Topic columns to process; defaults to the module-level ``topics``.

    Returns
    -------
    dict[str, str]
        Topic name -> joined review text, in the order of ``topic_names``.
        A topic with no matching reviews maps to the empty string.
    """
    if topic_names is None:
        topic_names = topics

    sub_strings = {}
    for topic in topic_names:
        topic_reviews = df.loc[df[topic] > threshold, 'reviews']
        if verbose > 0:
            print(f'Number for {topic}: ', topic_reviews.shape[0])
        # Join with a space: summing the Series glues the last word of one
        # review onto the first word of the next, which creates bogus tokens,
        # and it does not return a string when the selection is empty.
        sub_strings[topic] = ' '.join(topic_reviews.dropna().astype(str))
    return sub_strings

In [5]:
def docs_to_tfidf_matrix(docs, doc_names=topics):
    """Compute TF-IDF weights for a collection of documents.

    Parameters
    ----------
    docs : iterable of str
        Raw text documents, one per entry.
    doc_names : sequence of str, default ``topics``
        Labels for the documents, in the same order as ``docs``; they become
        the column names of the result.

    Returns
    -------
    pandas.DataFrame
        One row per term (1- to 4-grams) and one column per document,
        holding the TF-IDF weight of that term in that document.

    Raises
    ------
    ValueError
        If the number of labels does not match the number of documents.
    """
    # max_df / min_df are floats, so they are document *proportions*: terms in
    # more than 80% of the documents, or in fewer than 1%, are discarded.
    vectorizer = TfidfVectorizer(ngram_range=(1, 4), max_df=0.8, min_df=0.01)
    weights = vectorizer.fit_transform(docs)
    # toarray() gives a plain ndarray; todense() would return the discouraged
    # numpy.matrix type.
    tfidf_df = pd.DataFrame(
        weights.toarray(),
        index=doc_names,
        columns=vectorizer.get_feature_names_out(),
    )
    # Transpose so terms are rows and documents are columns.
    return tfidf_df.transpose()

def create_wordcloud(tfidf_matrix, doc_name, words_to_drop, show=True, save=False, filename='wordclouds/my_wordcloud.png'):
    """Render a word cloud from one document's column of term weights.

    Parameters
    ----------
    tfidf_matrix : pandas.DataFrame
        Terms as the index, documents as columns.
    doc_name : str
        Column of ``tfidf_matrix`` to visualise.
    words_to_drop : iterable of str
        Terms to exclude; entries that are not in the index are ignored.
    show : bool, default True
        Display the cloud in a matplotlib figure.
    save : bool, default False
        Write the cloud to ``filename``.
    filename : str, default 'wordclouds/my_wordcloud.png'
        Output path used when ``save`` is true; missing parent directories
        are created.
    """
    # errors='ignore' skips labels that are absent from the index, so no
    # manual membership filter is needed.
    words = tfidf_matrix[doc_name].drop(labels=list(words_to_drop), errors='ignore')

    my_wordcloud = WordCloud(
        background_color="white",
        width=3000,
        height=2000,
        colormap="copper",
        max_words=300,
    ).generate_from_frequencies(words)

    if show:
        plt.figure(figsize=[15, 10])
        plt.imshow(my_wordcloud)
        plt.axis("off")
        plt.show()
    if save:
        # Saving would otherwise fail when the output directory is missing.
        Path(filename).parent.mkdir(parents=True, exist_ok=True)
        my_wordcloud.to_file(filename)

In [6]:
# Build one concatenated document per topic (printing the per-topic counts),
# then compute term weights across those documents. The labels are taken
# from the dict keys so they always line up with the documents.
topic_strings = topic_split(df, verbose=1)
topic_tfidf = docs_to_tfidf_matrix(
    topic_strings.values(),
    doc_names=list(topic_strings),
)

Number for positive:  1000
Number for budget_airlines:  338
Number for customer_service:  651
Number for other_negative:  823
Number for bags_crew_seats:  190
Number for business_class:  560


In [7]:
# Show the 30 terms with the highest weight in the business_class column.
(
    topic_tfidf
    .sort_values(by='business_class', ascending=False)
    .head(30)
)

Unnamed: 0,positive,budget_airlines,customer_service,other_negative,bags_crew_seats,business_class
flat,0.026822,0.0,0.0,0.019723,0.011292,0.078147
ba club,0.017881,0.003613,0.0,0.003945,0.0,0.043281
fas,0.012914,0.003613,0.0,0.005917,0.0,0.042079
class lounge,0.007947,0.001807,0.004985,0.0,0.0,0.040877
flat bed,0.009934,0.0,0.0,0.003945,0.004517,0.039675
privacy,0.025828,0.0,0.002991,0.002958,0.0,0.039675
club class,0.006954,0.0,0.010968,0.001972,0.0,0.038472
fa,0.031789,0.00542,0.0,0.026626,0.0,0.038472
climb over,0.005796,0.0,0.0,0.011508,0.0,0.036478
new club,0.010927,0.0,0.000997,0.005917,0.0,0.034866


In [10]:
# Terms and n-grams to exclude from the word clouds. The hand-maintained
# literal contains repeated entries; dict.fromkeys removes the duplicates
# while keeping first-seen order.
drop_words = list(dict.fromkeys([
    'did','getting','quite','including','went','fairly','lot','worth',
    'airlines','possible','ve','san','come','thing','plus','make','probably','20','got','say','used',
    '30','ago','ll','end',
    'thank','couldn','don','wasn','tried fly','come','hong','10',
    '12','24','does','didn','10','pm','50',
    'say','san','look','wasn','kg','20','15','10',
    'ife','low','90','25','air','45','ve','check',
    'airline','heathrow',
    'traveller plus','world traveller plus','british airways premium', 'airways premium',
    'ba long haul','january','february','august','2020','customer service','ba customer service','relations',
    'customer relations','ba club','fas','fa','flat','class lounge','ba club world','new club','club class','yang',
    'club europe passengers','europe passengers', 'and were', 'that she', 'in opened', 'row of',
    'it the', 'upper','facing','business class seats','class seat','1st','lounge was','1st class',
    'in club europe','the club europe','class seat','the new club','of schedule',
    'the phone','world traveller', 'breakfast was', 'economy and', 'was served', 'haul flights',
    'screen was', 'cape', 've had',
    'buy on','for food','on short','quick and','with low cost','snack and',
    'lounge is','ba first','world seat','airways business class','ba business class','business class on',
    'europe seats','the club world','cancelled and','customer service is','entertainment was',
    'haul flight','cup of','ife was','pass was','slow and','the low','worth the', 'she was', 'and we were',
    'the british airways','was about','wish to','that he','and she','and we were','even though we',
    'food or','with low','they told','asked me','the premium economy','for premium economy',
    'small and', 'narrow and','was good and','good as','and well','the ife was','as good','landed on'
]))

In [11]:

# Write one word-cloud image per topic to disk without displaying it inline.
for topic in topics:
    create_wordcloud(
        topic_tfidf,
        topic,
        words_to_drop=drop_words,
        show=False,
        save=True,
        filename=f'wordclouds/{topic}.png',
    )