## Working with Language Processing Methods

In [1]:
import pandas as pd
import numpy as np

In [2]:
columns = ['tweet_id','timestamp','tweet_text','user_id',
           'tweet_coords','tweet_coords_list','tweet_long','tweet_lat','location',
           'enc_url','tweet_lang','hashtags']

`%timeit` has the following command at:
`3.02 s ± 29 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)`
This is much faster than attempting to explicitly define the string for date formatting, and using `pd.to_datetime()`.

In [3]:
tweet_data = pd.read_csv(r'./tweetCoords.csv',header=None,names=columns,parse_dates=[1],infer_datetime_format=True)

In [4]:
time_string = "2017-09-01 00:00:00"
delta_hours = 1
start_time = pd.to_datetime(time_string)

Getting information in a single hour.

In [5]:
tweet_hour = tweet_data[(tweet_data['timestamp'] >= start_time) &
                        (tweet_data['timestamp'] <= start_time + pd.Timedelta(hours=delta_hours))].copy()

removing extraneous columns for this analysis.

In [6]:
tweet_hour.drop(columns=tweet_hour.columns[3:],axis=1,inplace=True)

In [7]:
tweet_hour.count()

tweet_id      1365
timestamp     1365
tweet_text    1365
dtype: int64

In [8]:
pd.options.display.max_colwidth=100

Working on functionality to clean tweet text.
- Eliminate links.
- Drop user mentions.
- *amp* as ampersand (this may need to be removed later)
- remove non-word characters (ascii x21-x40, x5B-x60, x7B-x7F)


In [9]:
tweet_hour['tweet_text'] = tweet_hour['tweet_text'].str.replace("https?:\S+","").str.replace("@[\S]+","").str.replace('\s+amp\s+'," ").str.replace("[\x21-\x40,\x5B-\x60,\x7B-\x7F]","").str.replace("\s+"," ").str.lower()

In [26]:
tweet_hour['tweet_text']

0                                                                                           ocala pm sunset
1                                    wind mph ese barometer in steady temperature f rain today in humidity 
2                                                                             where words fallmusic speaks 
3                                                            first with my bride lovetampa bucs buccaneers 
4                             wow that was rough it s basically drinking a shot of whiskey beer minute ipa 
5       i can t even watch diana programmes because it s just so sad she was an incredible person and wo...
6                                                                                     gainesville pm sunset
7                                                   exactly hrs til my blessings the world famous original 
8                                       i m at louis pappas market cafe shoppes at citrus park in tampa fl 
9       don t try talk me wh

In [10]:
from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
stopWords = set(stopwords.words('english')) | set(stopwords.words('spanish'))

In [12]:
tweet_vector = CountVectorizer(analyzer='word',stop_words=stopWords).fit(tweet_hour['tweet_text'])

In [13]:
len(tweet_vector.get_feature_names())

4078

In [32]:
# !pip install gensim

In [28]:
import nltk
from gensim import models

In [55]:
tweet_hour['tweet_text'].str.split("\s")

0                                                                                       [ocala, pm, sunset]
1                      [wind, mph, ese, barometer, in, steady, temperature, f, rain, today, in, humidity, ]
2                                                                       [where, words, fallmusic, speaks, ]
3                                                   [first, with, my, bride, lovetampa, bucs, buccaneers, ]
4            [wow, that, was, rough, it, s, basically, drinking, a, shot, of, whiskey, beer, minute, ipa, ]
5       [i, can, t, even, watch, diana, programmes, because, it, s, just, so, sad, she, was, an, incredi...
6                                                                                 [gainesville, pm, sunset]
7                                        [exactly, hrs, til, my, blessings, the, world, famous, original, ]
8                       [i, m, at, louis, pappas, market, cafe, shoppes, at, citrus, park, in, tampa, fl, ]
9       [don, t, try, talk, 

In [41]:
tweet_vectors = models.Word2Vec(sentences=tweet_hour['tweet_text'].str.split("\s"))

In [56]:
tweet_vectors['humidity']

  """Entry point for launching an IPython kernel.


array([-8.56943429e-02,  1.71005219e-01,  1.25435174e-01,  9.41894054e-02,
        6.79508131e-03, -2.96751052e-01,  1.39484525e-01,  4.80349027e-02,
        1.37049041e-03, -4.05695587e-02, -1.61292851e-02,  9.13755503e-03,
       -5.69108836e-02,  9.36218947e-02,  8.45170170e-02,  1.22037619e-01,
        1.63309112e-01, -5.45028411e-02, -1.01546533e-01, -2.33544298e-02,
        1.95600122e-01, -1.64324120e-01,  1.76773712e-01, -7.42312334e-03,
       -1.08739361e-04,  6.40345886e-02, -6.88408017e-02,  6.69599771e-02,
       -1.01036660e-01,  9.44380164e-02,  1.24531679e-01,  5.00229709e-02,
       -1.48975685e-01,  1.16967045e-01, -1.13418333e-01, -1.19179241e-01,
        1.51960865e-01, -3.02866906e-01,  5.43619273e-03, -2.36533191e-02,
       -1.83139354e-01, -6.91374168e-02, -9.99659672e-02, -8.54811631e-03,
       -3.43231373e-02,  1.22006036e-01, -6.22599237e-02,  7.45435208e-02,
        3.14938091e-02,  3.07027735e-02,  2.78947145e-01,  8.08070879e-03,
       -6.00209236e-02, -

In [17]:
FreqDist(tweet_hour['tweet_text'])

