In [2]:
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
import pandas as pd
import numpy as np

In [4]:
tweet_text = pd.read_csv(r'./tweetsText.csv')

In [5]:
tweet_text.columns

Index(['tweet_id', 'text'], dtype='object')

In [6]:
tweet_text.head(10).text

0                                Ocala: 7:50pm: sunset
1    Wind 2.0 mph ESE. Barometer 30.013 in, Steady....
2                  Where words fall....music speaks   
3    First @TBBuccaneers with my bride @carrie_duna...
4    Wow. That was rough. It s basically drinking a...
5    I can t even watch #Diana20 programmes because...
6                          Gainesville: 7:51pm: sunset
7    Exactly 4hrs til  my blessings... @ The World ...
8    I m at Louis Pappas Market Cafe: Shoppes at Ci...
9    Don t try  amp  talk 2 me when it s convenient...
Name: text, dtype: object

In [13]:
tweet_text.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 784322 entries, 0 to 784321
Data columns (total 2 columns):
tweet_id    784322 non-null int64
text        784322 non-null object
dtypes: int64(1), object(1)
memory usage: 12.0+ MB


### Working with TfIdf - Term frequency/Inverse document frequency

In [8]:
stopWords = set(stopwords.words('english')) | set(stopwords.words('spanish'))

In [9]:
tweet_vector = TfidfVectorizer(analyzer='word',stop_words=stopWords).fit_transform(tweet_text['text'])

In [10]:
tweet_vector

<784322x862839 sparse matrix of type '<class 'numpy.float64'>'
	with 6706018 stored elements in Compressed Sparse Row format>

In [11]:
tweet_vector[0:10000].toarray()

array([[0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.2693774, 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       ...,
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ],
       [0.       , 0.       , 0.       , ..., 0.       , 0.       ,
        0.       ]])

In [14]:
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tweet_vector[0:5], tweet_vector).flatten()

In [15]:
cosine_similarities.argsort()

array([1960804, 1899055, 1899056, ..., 3137292, 1568646, 2352969])

### Tweet Language:

TextBlob will determine the language of text, but requires that the analyzed text be at least 3 characters. For example, tweet below is causing an error.

In [16]:
len(tweet_text.iloc[756,1])

1

In [17]:
tweet_text.head(31).apply(lambda x: TextBlob(x['text']).detect_language(),axis=1)

0     en
1     en
2     en
3     en
4     en
5     en
6     en
7     en
8     en
9     en
10    en
11    en
12    en
13    en
14    en
15    en
16    en
17    en
18    en
19    en
20    en
21    en
22    en
23    en
24    en
25    en
26    en
27    en
28    en
29    en
30    pt
dtype: object

In [18]:
tweet_text.iloc[30]

tweet_id                                   903407384160346113
text        @jaguairs Passei no lugar que Loren costuma qu...
Name: 30, dtype: object

defining a function to preserve the short tweets, and avoid the error due to string length.

In [19]:
def getLang(text_sample):
    if len(text_sample) < 3:
        return np.nan
    else:
        return TextBlob(text_sample).detect_language()

There seems to be a timeout issue when processing large amounts of tweets. May be caused by API limits? Testing with increasing numbers here.

In [None]:
tweet_text['Lang'] = tweet_text[:10].apply(lambda x: getLang(x['text']),axis=1)

Trying to circumvent the API limitations with an iterator. (Using the `stop_point` variable to check on progress, and start over later.)

In [95]:
tweet_text.to_csv(r'./tweets_with_lang.csv')

In [None]:
stop_point = 36605
for i in range(stop_point,tweet_text.shape[0]):
    tweet_text.iloc[i,2] = getLang(tweet_text.iloc[i,1])

In [96]:
tweet_text[tweet_text['Lang'].notnull()].iloc[-1]

tweet_id                     904056350472273921
text        @lolitathelionn come over today :- 
Lang                                         en
Name: 36604, dtype: object

In [94]:
tweet_text.iloc[36605]

tweet_id                                   904056357153902593
text        Taco City      awesome. Enjoyed celebrating th...
Lang                                                      NaN
Name: 36605, dtype: object

In [91]:
tweet_text[tweet_text['Lang'].notnull()]['Lang'].groupby(tweet_text['Lang']).count()

Lang
af          12
ar          57
az           3
bg           8
bs           6
ca          25
ceb          5
co           2
cs           3
cy          21
da          39
de          54
el          14
en       33170
eo           8
es        1912
et          14
eu           5
fi          27
fr          76
fy          15
ga           5
gd           6
gl          34
ha           1
haw         12
hi          23
hmn          5
hr           6
ht          33
         ...  
lv          12
mg          10
mi          71
ms          12
mt           9
nl          41
no          24
ny           1
pl          20
pt         288
ro          36
ru           9
sk          10
sl           2
sm           1
sn           1
so          22
sq          16
st           2
su           8
sv          55
sw           8
tl          66
tr          12
uz          10
vi          20
xh           2
yo           2
zh-CN        4
zu          13
Name: Lang, Length: 72, dtype: int64

Language processing seems to be inconsistent.

In [None]:
tweet_lang.groupby(tweet_lang).count()

In [None]:
tweet_words = tweet_text.text.str.lower().str.split(r'\s+',expand=True).stack().value_counts()

In [None]:
stop_words = set(stopwords.words('english')) | set(stopwords.words('spanish'))

In [None]:
stop_list = list(stop_words)

In [None]:
tweet_words[tweet_words.index.str.len() > 3][:200]

In [None]:
tweet_words[~(tweet_words.index.isin(stop_list))].head(20)

In [None]:
tweet_words[~(tweet_words.index.isin(stop_list)) & (tweet_words.index.str.len() > 3)].head(20)

#### Twitter Sentiment Analysis Testing

In [None]:
tweet_text.head(10).apply(lambda x: TextBlob(x['text']).sentiment.polarity,axis=1)

In [None]:
tweet_text.head(10).apply(lambda x: TextBlob(x['text']).sentiment.subjectivity,axis=1)

Experimenting with a large sentiment analysis dataset. Attempting to use the Twitter Sentiment Analysis Dataset Corpus obtained from http://thinknook.com/twitter-sentiment-analysis-training-corpus-dataset-2012-09-22/

Twitter Corpus has some extraneous quotation marks that affect parsing.

In [None]:
# import re

# new_file = []

# re_string = '^(\d+,\d+,\w+,)(.+)$'
# g = open('.\sentiment_corrected.csv','w')
# g.seek(0)
# with open('.\Sentiment Analysis Dataset.csv','r') as f:
#     lines = f.readlines()
       
# for line in lines:
#     line = line.replace('"',"'")
#     line = re.sub(re_string,r'\1"\2"',line)
#     g.writelines(line)

# f.close()
# g.close()


In [None]:
twitter_corpus = pd.read_csv(r'./sentiment_corrected.csv')

In [None]:
twitter_corpus.head()

Corpus text is in alphabetical order. Using this to experiment with 60/20/20 split for train/test/val

In [None]:
train_sample = np.split(twitter_corpus.sample(frac=1),[int(.6*len(twitter_corpus)),int(.8*len(twitter_corpus))])