In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the tweets table for a single entity from disk.
tweets_info = pd.read_csv('lks_rtwts.csv')
# Keep only the tweet text, as a plain Python list, for vectorisation.
tweets = tweets_info['text'].tolist()

In [3]:
# Preview the loaded table (columns shown below: tweet_id, text, retweets, likes).
tweets_info

Unnamed: 0,tweet_id,text,retweets,likes
0,205888692580126720,#radensaleh not myth learn life bring kids gal...,0,1
1,207430942028079104,new bmw 3 series awarded 5 stars euro ncap cra...,0,2
2,208204757779759105,bmw hand 200 electric vehicles olympics,0,1
3,208283774251831296,asked sauber info images split car said shows ...,1,3
4,208342777627549696,racky think im driving bmw something,0,0
...,...,...,...,...
450,209938458611941376,free black bmw choice curious visit,0,0
451,208968123179741184,1980 bmw r100rs cafe racer via,0,0
452,210234207379800064,agree money not buy happiness somehow comforta...,0,0
453,209609738441334784,bmw r100 7 4h10 paris via,0,0


In [4]:
def similarity_data(tweets, feature_extraction, tdm, tweets_tdm, top_n=3, threshold=0.3):
    """
    Match every tweet against a vectorised corpus and tabulate the close pairs.

    For each tweet in ``tweets`` the ``top_n`` highest-scoring corpus rows are
    inspected. A pair is recorded when its score is strictly greater than
    ``threshold`` and the two texts are not identical (which drops
    self-matches when a collection is compared with itself).

    Parameters
    ----------
    tweets : iterable of str
        Query tweets, processed in order.
    feature_extraction : object
        Vectorizer whose ``transform`` is called with a one-element list; it
        is expected to have been fitted on the corpus behind ``tdm``.
    tdm : matrix
        Vectorised corpus, passed as the second argument to ``linear_kernel``.
    tweets_tdm : sequence of str
        Corpus texts, indexed by the row position of the match in ``tdm``.
    top_n : int, optional
        Number of best-scoring corpus rows examined per tweet (default 3).
    threshold : float, optional
        Minimum score, exclusive, for a pair to be kept (default 0.3).

    Returns
    -------
    dict
        Column-oriented table with keys 'tweet_one' (query tweet),
        'tweet_two' (matched corpus text), 'cosine_relation' (score) and
        'index' (row position of the match in the corpus).

    Raises
    ------
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # saving all the values to "data" in the form of table
    data = {'tweet_one':[], 'tweet_two':[], 'cosine_relation':[], 'index':[]}

    for tweet in tweets:
        query = feature_extraction.transform([tweet])
        # linear_kernel is a plain dot product; it equals the cosine similarity
        # only when the rows are L2-normalised (TfidfVectorizer's default
        # setting -- confirm if a differently configured vectorizer is used).
        cosine_similarities = linear_kernel(query, tdm).flatten()
        # argsort is ascending, so walk it backwards to get the top_n best rows.
        related_docs_indices = cosine_similarities.argsort()[:-(top_n + 1):-1]
        for index in related_docs_indices:
            score = cosine_similarities[index]
            matched_text = tweets_tdm[index]
            # keep only pairs above the threshold whose texts actually differ
            if score > threshold and tweet != matched_text:
                data['tweet_one'].append(tweet)
                data['tweet_two'].append(matched_text)
                data['cosine_relation'].append(score)
                data['index'].append(index)

    return data

In [5]:
def avg_len(df):
    """
    Print and return the average cosine similarity and the number of pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a numeric 'cosine_relation' column, one row per matched
        pair of tweets. The index labels are irrelevant.

    Returns
    -------
    tuple
        ``(average, count)`` where ``count`` is the number of rows in ``df``
        and ``average`` is the mean of 'cosine_relation'. For an empty table
        the average is NaN.
    """
    num = len(df.index)

    if num:
        # Summing the column works for any index, unlike label lookups
        # with df.loc[0..num-1], which require a default RangeIndex.
        avg = df['cosine_relation'].sum() / num
    else:
        # No rows means no average; report NaN instead of dividing by zero.
        avg = float('nan')

    print("Average Cosine Similarity :" , avg)
    print("Number of similar tweets :", num)

    return avg, num

In [6]:
# Within-entity similarity: fit TF-IDF on the tweets of a single entity and
# compare every tweet against that same collection.
feature_extraction = TfidfVectorizer(analyzer='word')
tdm = feature_extraction.fit_transform(tweets)

# The same list serves as both the query tweets and the corpus texts;
# similarity_data drops pairs whose two texts are identical, which removes
# each tweet's match with itself.
data = similarity_data(tweets, feature_extraction, tdm, tweets)

# Tabulate the matched pairs and display them.
df = pd.DataFrame(data) 
df

Unnamed: 0,tweet_one,tweet_two,cosine_relation,index
0,#for sale oe replacement bmw driver side headl...,#on sale depo 344 1110l asd bmw 5 series drive...,0.492694,15
1,#for sale oe replacement bmw driver side headl...,#forsale tyc 20 6472 01 bmw driver side headli...,0.363815,8
2,#forsale tyc 20 6472 01 bmw driver side headli...,#for sale oe replacement bmw driver side headl...,0.363815,6
3,#forsale tyc 20 6472 01 bmw driver side headli...,#on sale depo 344 1110l asd bmw 5 series drive...,0.300724,15
4,bmw reveals new performance m3 m5 uk market #l...,bmw m3 limited edition uk #1,0.303762,12
...,...,...,...,...
157,dawn mac button siri comes bmw gm land rover a...,dawn mac button siri comes bmw gm land rover a...,0.854956,446
158,dawn mac button siri comes bmw gm land rover a...,bmw gm mercedes land rvoer jaguar audi toyota ...,0.382951,442
159,1980 bmw r100rs cafe racer via,bmw cafe racer,0.637374,243
160,agree money not buy happiness somehow comforta...,agree money not buy happiness somehow comforta...,0.929729,112


In [7]:
# Summarise the within-entity matches: prints and returns
# (average cosine similarity, number of matched pairs).
avg_len(df)

Average Cosine Similarity : 0.5048826980953106
Number of similar tweets : 162


(0.5048826980953106, 162)

In [8]:
# Load one tweet table per entity; the trailing comment on each line names
# the entity that file is labelled with.
tweets_info1 = pd.read_csv('mix_files/ptd_RL2013D01E001.csv')   # BMW
tweets_info2 = pd.read_csv('mix_files/ptd_RL2013D01E003.csv')   # Volvo
tweets_info3 = pd.read_csv('mix_files/ptd_RL2013D02E051.csv')   # Royal Bank of Scotland
tweets_info4 = pd.read_csv('mix_files/ptd_RL2013D02E054.csv')   # Barclays

In [9]:
# Extract the tweet text of each entity as a plain Python list.
tweets1 = tweets_info1['text'].tolist()
tweets2 = tweets_info2['text'].tolist()
tweets3 = tweets_info3['text'].tolist()
tweets4 = tweets_info4['text'].tolist()

# Record how many tweets each entity has.
len1, len2, len3, len4 = map(len, (tweets1, tweets2, tweets3, tweets4))

In [10]:
# Comparing 1 and 2, i.e. BMW vs Volvo.
# Both sides are trimmed to a common length on local copies, so the shared
# tweets1/tweets2 lists stay intact and this cell gives the same result no
# matter which other comparison cells have already run.
pair_len = min(len(tweets1), len(tweets2))
fit_tweets = tweets1[:pair_len]      # corpus the vectorizer is fitted on
query_tweets = tweets2[:pair_len]    # tweets matched against that corpus

feature_extraction = TfidfVectorizer(ngram_range=(1, 2))
tdm = feature_extraction.fit_transform(fit_tweets)
data1 = similarity_data(query_tweets, feature_extraction, tdm, fit_tweets)

df1 = pd.DataFrame(data1)

cosine_avg1, num_twts1 = avg_len(df1)

Average Cosine Similarity : 0.4739998089307112
Number of similar tweets : 52


In [11]:
# Comparing 1 and 3, i.e. BMW vs Royal Bank of Scotland.
# Both sides are trimmed to a common length on local copies, so the shared
# tweets1/tweets3 lists stay intact and this cell gives the same result no
# matter which other comparison cells have already run.
pair_len = min(len(tweets1), len(tweets3))
fit_tweets = tweets1[:pair_len]      # corpus the vectorizer is fitted on
query_tweets = tweets3[:pair_len]    # tweets matched against that corpus

feature_extraction = TfidfVectorizer(ngram_range=(1, 2))
tdm = feature_extraction.fit_transform(fit_tweets)
data2 = similarity_data(query_tweets, feature_extraction, tdm, fit_tweets)

df2 = pd.DataFrame(data2)

cosine_avg2, num_twts2 = avg_len(df2)

Average Cosine Similarity : 0.3659376328361584
Number of similar tweets : 58


In [12]:
# Comparing 1 and 4, i.e. BMW vs Barclays.
# Both sides are trimmed to a common length on local copies, so the shared
# tweets1/tweets4 lists stay intact and this cell gives the same result no
# matter which other comparison cells have already run.
pair_len = min(len(tweets1), len(tweets4))
fit_tweets = tweets1[:pair_len]      # corpus the vectorizer is fitted on
query_tweets = tweets4[:pair_len]    # tweets matched against that corpus

feature_extraction = TfidfVectorizer(ngram_range=(1, 2))
tdm = feature_extraction.fit_transform(fit_tweets)
data3 = similarity_data(query_tweets, feature_extraction, tdm, fit_tweets)

df3 = pd.DataFrame(data3)

cosine_avg3, num_twts3 = avg_len(df3)

Average Cosine Similarity : 0.3688259538551511
Number of similar tweets : 58


In [13]:
# Comparing 2 and 3, i.e. Volvo vs Royal Bank of Scotland.
# Both sides are trimmed to a common length on local copies, so the shared
# tweets2/tweets3 lists stay intact and this cell gives the same result no
# matter which other comparison cells have already run.
pair_len = min(len(tweets2), len(tweets3))
fit_tweets = tweets2[:pair_len]      # corpus the vectorizer is fitted on
query_tweets = tweets3[:pair_len]    # tweets matched against that corpus

feature_extraction = TfidfVectorizer(ngram_range=(1, 2))
tdm = feature_extraction.fit_transform(fit_tweets)
data4 = similarity_data(query_tweets, feature_extraction, tdm, fit_tweets)

df4 = pd.DataFrame(data4)

cosine_avg4, num_twts4 = avg_len(df4)

Average Cosine Similarity : 0.3652195980010636
Number of similar tweets : 32


In [14]:
# Comparing 2 and 4, i.e. Volvo vs Barclays.
# Both sides are trimmed to a common length on local copies, so the shared
# tweets2/tweets4 lists stay intact and this cell gives the same result no
# matter which other comparison cells have already run.
pair_len = min(len(tweets2), len(tweets4))
fit_tweets = tweets2[:pair_len]      # corpus the vectorizer is fitted on
query_tweets = tweets4[:pair_len]    # tweets matched against that corpus

feature_extraction = TfidfVectorizer(ngram_range=(1, 2))
tdm = feature_extraction.fit_transform(fit_tweets)
data5 = similarity_data(query_tweets, feature_extraction, tdm, fit_tweets)

df5 = pd.DataFrame(data5)

cosine_avg5, num_twts5 = avg_len(df5)

Average Cosine Similarity : 0.37671368616043943
Number of similar tweets : 37


In [15]:
# Comparing 3 and 4, i.e. Royal Bank of Scotland vs Barclays.
# Both sides are trimmed to a common length on local copies, so the shared
# tweets3/tweets4 lists stay intact and this cell gives the same result no
# matter which other comparison cells have already run.
pair_len = min(len(tweets3), len(tweets4))
fit_tweets = tweets3[:pair_len]      # corpus the vectorizer is fitted on
query_tweets = tweets4[:pair_len]    # tweets matched against that corpus

feature_extraction = TfidfVectorizer(ngram_range=(1, 2))
tdm = feature_extraction.fit_transform(fit_tweets)
data6 = similarity_data(query_tweets, feature_extraction, tdm, fit_tweets)

df6 = pd.DataFrame(data6)

cosine_avg6, num_twts6 = avg_len(df6)

Average Cosine Similarity : 0.3858679160835583
Number of similar tweets : 22
