## Working with Language Processing Methods

In [1]:
import pandas as pd

In [15]:
# Read the tweet export into a DataFrame. header=None tells pandas not to
# use the first row as column names, so columns get integer labels for now.
tweet_data = pd.read_csv(
    filepath_or_buffer=r'./tweetCoords.csv',
    header=None,
)

In [17]:
# Labels for the twelve columns of the header-less CSV, listed in the order
# in which they are applied to the loaded frame.
columns = [
    'tweet_id',
    'timestamp',
    'tweet_text',
    'user_id',
    'tweet_coords',
    'tweet_coords_list',
    'tweet_long',
    'tweet_lat',
    'location',
    'enc_url',
    'tweet_lang',
    'hashtags',
]

In [18]:
# Replace the default integer column labels (the file was read with
# header=None) with the descriptive names from `columns`, in place.
tweet_data.columns = columns

Comparing how long it takes to convert the `timestamp` column with the pandas `to_datetime()` method: first letting pandas infer the format, then explicitly defining the format string.

In [23]:
# Benchmark 1: element-wise conversion through Series.apply, with no format
# given, so pd.to_datetime has to work out the format for each value.
# NOTE(review): this times one pd.to_datetime call per row; a single call on
# the whole Series is not measured here.
%timeit tweet_data['timestamp'].apply(pd.to_datetime)

31.9 s ± 1.12 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
# Benchmark 2: the same element-wise conversion, but apply forwards an
# explicit `format` keyword to pd.to_datetime for every value.
%timeit tweet_data['timestamp'].apply(pd.to_datetime,format='%Y-%m-%d %H:%M:%S')

37.9 s ± 210 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


Using the faster of the two approaches (letting pandas infer the format) to convert the `timestamp` column to datetime.

In [37]:
# Convert the whole `timestamp` column in one vectorised call. Passing the
# Series directly to pd.to_datetime avoids the per-row Python call that
# `.apply(pd.to_datetime)` makes (timed at roughly 32 s above), while still
# letting pandas infer the format. Re-running the cell on an already
# converted column is harmless.
tweet_data['timestamp'] = pd.to_datetime(tweet_data['timestamp'])

In [40]:
# Parameters of the time window to inspect.
delta_hours = 1                       # window length, in hours
time_string = "2017-09-01 00:00:00"   # window start, as text

# Parsed window start, comparable against the datetime `timestamp` column.
start_time = pd.to_datetime(time_string)

Selecting the tweets posted within a single hour, starting at `start_time`.

In [45]:
# Select the tweets inside the half-open window [start_time, end_time).
# The upper bound is exclusive so that a tweet stamped exactly on the closing
# boundary belongs to the next window only; with an inclusive upper bound,
# consecutive hourly windows would count such a tweet twice.
end_time = start_time + pd.Timedelta(hours=delta_hours)
in_window = (tweet_data['timestamp'] >= start_time) & (tweet_data['timestamp'] < end_time)
tweet_hour = tweet_data[in_window]

In [46]:
# Number of non-missing values in each column of the selected window; columns
# with a lower count than `tweet_id` contain missing entries.
tweet_hour.count()

tweet_id             1365
timestamp            1365
tweet_text           1365
user_id              1365
tweet_coords          579
tweet_coords_list    1365
tweet_long           1365
tweet_lat            1365
location             1365
enc_url               117
tweet_lang           1365
hashtags              343
dtype: int64

In [47]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [48]:
# Combined English + Spanish stop-word set taken from the NLTK corpus.
stopWords = {
    word
    for language in ('english', 'spanish')
    for word in stopwords.words(language)
}

In [56]:
# Build the TF-IDF document-term matrix for the tweets in the selected hour
# (one row per tweet, one column per vocabulary term).
# `stop_words` is documented as 'english', a list, or None; scikit-learn
# releases that validate constructor parameters reject a set, so the stop-word
# set is passed as a (sorted, hence deterministic) list. The resulting matrix
# is the same as with the set on versions that accepted it.
tweet_vector = TfidfVectorizer(
    analyzer='word',
    stop_words=sorted(stopWords),
).fit_transform(tweet_hour['tweet_text'])

In [57]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
tweet_vector

<1365x5488 sparse matrix of type '<class 'numpy.float64'>'
	with 11294 stored elements in Compressed Sparse Row format>

In [58]:
# Convert the sparse TF-IDF matrix to a dense NumPy array for inspection.
# NOTE(review): this materialises every cell of the matrix (1365 x 5488 per
# the summary above); slice a few rows first if only a preview is needed.
tweet_vector.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.31243485, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [59]:
from sklearn.metrics.pairwise import linear_kernel

# Pairwise dot products between every pair of tweet vectors, flattened into a
# single 1-D array. TfidfVectorizer was used with its default settings, which
# L2-normalise each row, so these dot products are cosine similarities.
cosine_similarities = (
    linear_kernel(X=tweet_vector, Y=tweet_vector)
    .flatten()
)

In [60]:
# Indices that sort the flattened similarity array in ascending order, so the
# most similar entries come last.
# NOTE(review): these are positions in the flattened pairwise matrix, not
# tweet numbers; assuming the default row-major flattening of an n x n matrix,
# index k corresponds to the tweet pair (k // n, k % n). Each tweet's
# similarity with itself is part of this array as well - confirm that is intended.
cosine_similarities.argsort()

array([ 931014, 1231258, 1710653, ..., 1590024,  396140,  549132],
      dtype=int64)