In [1]:
import pandas as pd  # A

tweet_df = pd.read_csv('../data/cleaned_airline_tweets.csv')  # A

tweet_df.head()  # A

# A import our tweets from chapter 5

Unnamed: 0,text,sentiment
0,@VirginAmerica What @dhepburn said.,neutral
1,"@VirginAmerica it was amazing, and arrived an ...",positive
2,@VirginAmerica I &lt;3 pretty graphics. so muc...,positive
3,@VirginAmerica So excited for my first cross c...,positive
4,I ❤️ flying @VirginAmerica. ☺️👍,positive


In [2]:
import emoji  # A

english_emojis = emoji.UNICODE_EMOJI['en']  # A

def extract_emojis(s):  # B
    return [english_emojis[c] for c in s if c in english_emojis]

tweet_df['emojis'] = tweet_df['text'].map(lambda x: extract_emojis(x))  # B

tweet_df['num_emojis'] = tweet_df['emojis'].map(len)  # C

# A use a package called emoji. To install run `pip3 install emoji`
# B convert emojis to english words
# C Count the number of emojis used in the tweet

In [4]:
tweet_df['mention_count'] = tweet_df['text'].map(lambda x: x.count('@'))  # A

tweet_df['retweet'] = tweet_df['text'].map(lambda x: x.startswith('RT '))  # B

tweet_df[['text', 'sentiment', 'num_emojis', 'mention_count', 'retweet']].head()

# A Count the number of mentions in the tweet
# B Boolean whether or not the tweet is a retweet

Unnamed: 0,text,sentiment,num_emojis,mention_count,retweet
0,@VirginAmerica What @dhepburn said.,neutral,0,2,False
1,"@VirginAmerica it was amazing, and arrived an ...",positive,0,1,False
2,@VirginAmerica I &lt;3 pretty graphics. so muc...,positive,0,1,False
3,@VirginAmerica So excited for my first cross c...,positive,0,1,False
4,I ❤️ flying @VirginAmerica. ☺️👍,positive,3,1,False


In [46]:
import preprocessor as tweet_preprocessor  # A

# remove urls and mentions
tweet_preprocessor.set_options(  # A
    tweet_preprocessor.OPT.URL, tweet_preprocessor.OPT.NUMBER
)

def combine_text(row):  # B
    return f'tweet: {tweet_preprocessor.clean(row.text)}. mention_count: {row.mention_count}. emojis: {" ".join(row.emojis)}. retweet: {row.retweet}'


tweet_df['combined_text'] = tweet_df.apply(combine_text, axis=1)  # C

print(tweet_df.iloc[4]['combined_text'])

tweet_df.head()

# A Use the same tweet preprocessor we used in chapter 5
# B a function that takes in a row of data and creates a single piece of text with all of our features in them
# C Vectorize this feature-rich text instead of the original text


tweet: I ❤️ flying @VirginAmerica. ☺️👍. mention_count: 1. emojis: :red_heart: :smiling_face: :thumbs_up:. retweet: False


Unnamed: 0,text,sentiment,length,emojis,num_emojis,combined_text,mention_count,retweet
0,@VirginAmerica What @dhepburn said.,neutral,35,[],0,tweet: @VirginAmerica What @dhepburn said.. me...,2,False
1,"@VirginAmerica it was amazing, and arrived an ...",positive,80,[],0,"tweet: @VirginAmerica it was amazing, and arri...",1,False
2,@VirginAmerica I &lt;3 pretty graphics. so muc...,positive,83,[],0,tweet: @VirginAmerica I &lt;3 pretty graphics....,1,False
3,@VirginAmerica So excited for my first cross c...,positive,140,[],0,tweet: @VirginAmerica So excited for my first ...,1,False
4,I ❤️ flying @VirginAmerica. ☺️👍,positive,31,"[:red_heart:, :smiling_face:, :thumbs_up:]",3,tweet: I ❤️ flying @VirginAmerica. ☺️👍. mentio...,1,False


In [66]:
import pandas as pd  # A
from sentence_transformers import SentenceTransformer, util  # B


tweet_df = pd.read_csv('../data/cleaned_airline_tweets.csv')  # A

model = SentenceTransformer('all-MiniLM-L6-v2')  # B

embeddings = model.encode(tweet_df['text'], convert_to_tensor=True)  # B

# A import our tweets from chapter 5
# B embed our text using a vectorizer specifically fine tuned to recognize semantic similiarty

torch.Size([3860, 384])

In [67]:
from torch import where

cosine_scores = util.pytorch_cos_sim(embeddings, embeddings)  # A
COSINE_THRESHOLD = 0.95

index1, index2 = where(cosine_scores >= COSINE_THRESHOLD)  # B
pairs_to_consider = list(zip(index1.tolist(), index2.tolist()))  # B
for i1, i2 in pairs_to_consider[:len(pairs_to_consider) // 2]:  # B
    if i1 != i2:  # C
        samples = tweet_df[tweet_df.index.isin([i1, i2])]  # C
        if samples['sentiment'].nunique() > 1:  # C
            print(i1, i2, samples[['text', 'sentiment']].values)  # C
            print('--------')
            
# A Calculate cosine similarity betwen all pairs of text
# B iterate over pairs of text that are higher than our threshold (cosine similiarty is max 1)
# C Print pairs of text that are semantically similar but have different labels


1458 1534 [['@SouthwestAir will do. Thank you.' 'neutral']
 ['@SouthwestAir thank you, will do' 'positive']]
--------
1534 1458 [['@SouthwestAir will do. Thank you.' 'neutral']
 ['@SouthwestAir thank you, will do' 'positive']]
--------
1864 1960 [["“@JetBlue: Our fleet's on fleek. http://t.co/clu5pdPrHP” :(" 'neutral']
 ["“@JetBlue: Our fleet's on fleek. http://t.co/3kVkd8yRxa” + lol wow"
  'positive']]
--------
1907 1960 [["“@JetBlue: Our fleet's on fleek. http://t.co/g97HAbyeP5”\n\nSMH"
  'neutral']
 ["“@JetBlue: Our fleet's on fleek. http://t.co/3kVkd8yRxa” + lol wow"
  'positive']]
--------
