In [1]:
# dependencies
import pandas as pd
import numpy as np
import nltk 
import random

In [2]:
# create the df
# Location of the raw tweet dump in the project's GitHub repository.
url = 'https://raw.githubusercontent.com/alexander-one/portfolio/main/Project%20-Tweet%20KNN/assets/nytimeshealth.txt'
# NOTE(review): on_bad_lines='skip' drops, without any warning, every row the
# parser cannot split into the expected number of fields — verify the loaded
# row count against the source file to confirm no tweets are being lost.
tweets_df = pd.read_csv(url, on_bad_lines = 'skip')

Here we see a sample of the dataframe, but because each tweet is stored in its entirety on a single line, the preview barely gets past the date stamp. 

In [3]:
# Preview the first rows; long tweet strings are truncated at the default column width.
tweets_df.head()

Unnamed: 0,tweets
0,548662191340421120|Sat Dec 27 02:10:34 +0000 2...
1,548579831169163265|Fri Dec 26 20:43:18 +0000 2...
2,548579045269852161|Fri Dec 26 20:40:11 +0000 2...
3,548444679529041920|Fri Dec 26 11:46:15 +0000 2...
4,548311901227474944|Fri Dec 26 02:58:39 +0000 2...


Setting the `display.max_colwidth` option to `None` will show the tweets in their entirety. 

We want to determine the similarity between tweets, which means that the ID, Date, and URL are useless to us, so we need a way to remove them. On top of that, the RT, any tags with @, and punctuation will also not be helpful for us. 

In [4]:
# Lift the column-width cap (for the rest of the session) so every tweet is
# rendered in full instead of being cut off with an ellipsis.
pd.options.display.max_colwidth = None
tweets_df.head()

Unnamed: 0,tweets
0,548662191340421120|Sat Dec 27 02:10:34 +0000 2014|Risks in Using Social Media to Spot Signs of Mental Distress http://nyti.ms/1rqi9I1
1,548579831169163265|Fri Dec 26 20:43:18 +0000 2014|RT @paula_span: The most effective nationwide diabetes prevention program you've probably never heard of: http://newoldage.blogs.nytimes.com/2014/12/26/diabetes-prevention-that-works/
2,548579045269852161|Fri Dec 26 20:40:11 +0000 2014|The New Old Age Blog: Diabetes Prevention That Works http://nyti.ms/1xm7fTi
3,548444679529041920|Fri Dec 26 11:46:15 +0000 2014|Well: Comfort Casseroles for Winter Dinners http://nyti.ms/1xTNoO0
4,548311901227474944|Fri Dec 26 02:58:39 +0000 2014|High-Level Knowledge Before Veterans Affairs Scandal http://nyti.ms/13yCpvS


Next comes a block of functions built on the pandas `.str.replace` string method. Again, my goal with this project was to have each function perform a single action, and this was the first place I applied that goal. 

In [5]:
    def remove_at():
        """Strip @-mentions from the 'tweets' column of the global ``df``.

        Every '@' together with the (possibly empty) run of ASCII letters,
        digits and underscores that follows it is replaced by the empty string.
        The column is overwritten in place; nothing is returned.
        """
        mention_pattern = r'@[a-zA-Z0-9_]*'
        without_mentions = df['tweets'].str.replace(mention_pattern, '', regex=True)
        df['tweets'] = without_mentions

    def remove_retweet():
        """Remove the retweet marker 'RT' from the 'tweets' column of the global ``df``.

        Only a standalone, upper-case 'RT' token is removed. The word
        boundaries keep words that merely contain those letters (e.g. 'SMART',
        'HEART') intact. The column is overwritten in place; nothing is
        returned.
        """
        # \b...\b restricts the match to the whole token rather than any "RT" substring.
        df['tweets'] = df['tweets'].str.replace(r'\bRT\b', '', regex=True)

    def remove_timestamp():
        """Remove the '|<timestamp>|' field from the 'tweets' column of the global ``df``.

        Each raw line has the form ``id|timestamp|text``. Only the first
        pipe-delimited field (both pipes included) is deleted, so any '|'
        characters inside the tweet text itself are preserved. Strings with
        fewer than two pipes are left unchanged. The column is overwritten in
        place; nothing is returned.
        """
        # [^|]* cannot run past the second pipe, and n=1 stops after that first
        # field, so later pipes in the text never get swallowed.
        df['tweets'] = df['tweets'].str.replace(r'\|[^|]*\|', '', n=1, regex=True)

    def remove_id():
        """Delete 18-digit tweet IDs from the 'tweets' column of the global ``df``.

        Any run of exactly 18 consecutive ASCII digits is replaced by the empty
        string (a longer run loses its first 18 digits). The column is
        overwritten in place; nothing is returned.
        """
        id_pattern = r'[0-9]{18}'
        without_ids = df['tweets'].str.replace(id_pattern, '', regex=True)
        df['tweets'] = without_ids

    def remove_url():
        """Strip URLs from the 'tweets' column of the global ``df``.

        Anything starting with 'http' and running up to the next whitespace
        character is replaced by the empty string. The column is overwritten
        in place; nothing is returned.
        """
        url_pattern = r'http\S+'
        without_urls = df['tweets'].str.replace(url_pattern, '', regex=True)
        df['tweets'] = without_urls

    def remove_punctuation():
        """Drop punctuation from the 'tweets' column of the global ``df``.

        Every character that is neither a word character (letters, digits,
        underscore) nor whitespace is removed. Hyphenated and apostrophised
        words are therefore fused ('High-Level' -> 'HighLevel',
        "you've" -> 'youve'). The column is overwritten in place; nothing is
        returned.
        """
        punctuation_pattern = r'[^\w\s]'
        without_punctuation = df['tweets'].str.replace(punctuation_pattern, '', regex=True)
        df['tweets'] = without_punctuation

In [6]:
# Work on an independent copy so the cleaning steps never modify tweets_df.
# (Plain assignment would only bind a second name to the same DataFrame, and
# re-running this cell would then start from already-cleaned data.)
df = tweets_df.copy()

# Call the functions to remove data. remove_punctuation must stay last: it
# strips the '@' and '|' characters that remove_at and remove_timestamp match on.
remove_at()
remove_retweet()
remove_timestamp()
remove_id()
remove_url()
remove_punctuation()

Below we see individual tweets with only words from the headlines.

In [7]:
# Preview the cleaned tweets: only the headline words should remain.
df.head()

Unnamed: 0,tweets
0,Risks in Using Social Media to Spot Signs of Mental Distress
1,The most effective nationwide diabetes prevention program youve probably never heard of
2,The New Old Age Blog Diabetes Prevention That Works
3,Well Comfort Casseroles for Winter Dinners
4,HighLevel Knowledge Before Veterans Affairs Scandal


The next step is to tokenize each of the tweets. This will create a list in each of the rows. 

In [8]:
    def tokenize():
        """Replace each tweet string in the global ``df`` with its list of tokens.

        Tokenization is delegated to ``nltk.word_tokenize``. The 'tweets'
        column is overwritten in place, so afterwards it holds lists rather
        than strings; nothing is returned.
        """
        # Map over the column itself rather than row-by-row over the whole
        # frame: same result, without building a Series object for every row.
        df['tweets'] = df['tweets'].apply(nltk.word_tokenize)

In [9]:
# Tokenize in place: after this call df['tweets'] holds lists of tokens, not strings.
tokenize()

In [10]:
# Each entry in 'tweets' should now be a list of word tokens.
df.head()

Unnamed: 0,tweets
0,"[Risks, in, Using, Social, Media, to, Spot, Signs, of, Mental, Distress]"
1,"[The, most, effective, nationwide, diabetes, prevention, program, youve, probably, never, heard, of]"
2,"[The, New, Old, Age, Blog, Diabetes, Prevention, That, Works]"
3,"[Well, Comfort, Casseroles, for, Winter, Dinners]"
4,"[HighLevel, Knowledge, Before, Veterans, Affairs, Scandal]"


Now that we have the data cleaned and ready to go, it's time for the Jaccard distance calculations. 
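The Jaccard distance between two tweets $A$ and $B$, each treated as a set of words, is $d_J(A, B) = 1 - \frac{|A \cap B|}{|A \cup B|}$, i.e. 1 minus the number of shared words divided by the number of unique words across both tweets. This means that if the tweets are very similar they will have a value closer to 0, and if they are completely dissimilar they will have a value of 1. 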

To manage this, we have two functions: the first will calculate the Jaccard distance, the second will fill a matrix with those values. 

In [13]:
    def calc_jaccard_distance(A, B):
        """Return the Jaccard distance between two collections of tokens.

        The distance is ``1 - |A ∩ B| / |A ∪ B|``, computed on the *sets* of
        tokens, so a word repeated within a tweet counts only once and the
        result always lies in [0, 1].

        Args:
            A: iterable of hashable tokens (e.g. a list of words).
            B: iterable of hashable tokens.

        Returns:
            0 when both inputs contain exactly the same distinct tokens, 1 when
            they share none, and a value in between otherwise. Two empty inputs
            are treated as completely dissimilar and yield 1.
        """
        tokens_a = set(A)
        tokens_b = set(B)
        all_tokens = tokens_a | tokens_b

        # With no tokens at all the ratio is undefined (0/0); report maximal distance.
        if not all_tokens:
            return 1

        shared_tokens = tokens_a & tokens_b
        return 1 - (len(shared_tokens) / len(all_tokens))

    def fill_jaccard_matrix():
        """Build the symmetric matrix of pairwise Jaccard distances between tweets.

        Reads the tokenized tweets from the 'tweets' column of the global
        ``df`` and compares every pair once with ``calc_jaccard_distance``,
        mirroring each value across the diagonal.

        Returns:
            A square ``numpy.ndarray`` with one row and one column per tweet,
            where entry ``[i, j]`` is the distance between the i-th and j-th
            tweet (by position in ``df``).
        """
        # Pull the column into a plain list once: access becomes positional (so
        # a non-default DataFrame index cannot raise KeyError) and the column
        # lookup is not repeated for every pair.
        tweets = df['tweets'].tolist()
        n_tweets = len(tweets)
        tweet_similarity = np.ones((n_tweets, n_tweets))

        for row in range(n_tweets):
            # Start at `row` so each unordered pair (and the diagonal) is computed once.
            for other_row in range(row, n_tweets):
                distance = calc_jaccard_distance(tweets[row], tweets[other_row])
                tweet_similarity[row, other_row] = distance
                tweet_similarity[other_row, row] = distance

        return tweet_similarity
    
   

In [14]:
# Compute the distance for every pair of tweets — the work grows quadratically with len(df).
tweet_similarity = fill_jaccard_matrix()

In [15]:
# Large arrays are abbreviated with '...' when printed, so only the corner entries appear.
print(tweet_similarity)

[[0.         0.95454545 1.         ... 1.         0.94117647 0.95238095]
 [0.95454545 0.         0.95       ... 1.         1.         0.95454545]
 [1.         0.95       0.         ... 1.         1.         0.94736842]
 ...
 [1.         1.         1.         ... 0.         1.         1.        ]
 [0.94117647 1.         1.         ... 1.         0.         1.        ]
 [0.95238095 0.95454545 0.94736842 ... 1.         1.         0.        ]]
