# DS4DD training: text analysis

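This notebook walks through basic text analysis on a set of tweets about COVID-19 and 5G: counting words, removing stopwords, looking at bigrams, weighting terms per tweet with TF-IDF, and a short named entity recognition example.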

## Grab dataset

In [1]:
import pandas as pd
import json
from sklearn.feature_extraction.text import CountVectorizer

def sort_words_by_count(word_counts, feature_names):
    """Return a one-column DataFrame of term counts, most frequent first.

    Args:
        word_counts: Document-term count matrix with one column per entry in
            ``feature_names`` (for example the result of
            ``CountVectorizer.fit_transform``). May be a SciPy sparse matrix
            or a dense 2-D array. Counts are summed over all rows, so a
            matrix with one row per document works as well as a single-row
            matrix.
        feature_names: Term labels, in the same order as the matrix columns.

    Returns:
        pandas.DataFrame indexed by term with a single ``count`` column,
        sorted in descending order of count.
    """
    # Local import: numpy is not imported at the top of this notebook.
    import numpy as np

    # Sum over rows instead of densifying with the sparse `.A` shortcut:
    # `.A` is deprecated in recent SciPy releases, and the previous
    # "rename the single column" step failed for matrices with several rows.
    # np.asarray(...).ravel() flattens both the 1 x n np.matrix returned by
    # sparse matrices and the 1-D array returned by dense input.
    totals = np.asarray(word_counts.sum(axis=0)).ravel()
    df = pd.DataFrame({'count': totals}, index=feature_names)
    return df.sort_values(by='count', ascending=False)


# Read dataset into a Pandas dataframe.
# The JSON file is loaded as a dict whose keys become the 'url' column and
# whose values become the 'text' column.
infile = '../data/twitter/20200711140613_covid-5g/20200711140613_covid-5g_tweets.json'
# Explicit UTF-8: the tweets contain accented characters and emoji, and
# open() otherwise falls back to the platform's default encoding.
with open(infile, encoding='utf-8') as json_file:
    jdata = json.load(json_file)
# Chain reset_index() rather than mutating in place, so re-running the cell
# always rebuilds the frame from jdata.
df = pd.DataFrame.from_dict(jdata, orient='index').reset_index()
df.columns = ['url', 'text']
df

Unnamed: 0,url,text
0,https://twitter.com/quintamistico/status/12820...,RT @Loli99704518: @francis66675984 @rafapal Pu...
1,https://twitter.com/Sparta2427/status/12820584...,RT @KateShemirani: Oh really? Just like with r...
2,https://twitter.com/kayfabe25/status/128205832...,RT @KateShemirani: Oh really? Just like with r...
3,https://twitter.com/Fher69Rojas_/status/128205...,@AlbertoRodNews Recordemos #5Jul Día de la ind...
4,https://twitter.com/hearnlp/status/12820577962...,RT @KateShemirani: Oh really? Just like with r...
5,https://twitter.com/WilmaRubble3/status/128205...,RT @Walletwalking1: @Sterling2143 @AAureilus A...
6,https://twitter.com/phillyfangirl/status/12820...,RT @ADDiane: Let's tell the people who won't w...
7,https://twitter.com/tryst_me/status/1282056124...,Discourse in the UK re face coverings:\n\nThey...
8,https://twitter.com/booouille/status/128205603...,RT @ADDiane: Let's tell the people who won't w...
9,https://twitter.com/MegaloSoto/status/12820559...,La humanidad está más tonta que nunca. Ya tene...


## Analyse all the text as one big string

In [2]:
# Glue every tweet into one space-separated string, wrapped in a
# single-element list (one "document" for the vectorizer cells below).
alltext = [' '.join(df['text'])]
# Preview the first 1000 characters only.
alltext[0][:1000]

"RT @Loli99704518: @francis66675984 @rafapal Pues sí. Ya podríamos los españoles tomar ejemplo de los serbios. El covid forma parte de un en… RT @KateShemirani: Oh really? Just like with radiation poisoning then. Put enough symptoms down on the diagnosis sheet and you can just abo… RT @KateShemirani: Oh really? Just like with radiation poisoning then. Put enough symptoms down on the diagnosis sheet and you can just abo… @AlbertoRodNews Recordemos #5Jul Día de la independencia y brote del COVID-5G en un desfile sin tapabocas, pensaron que el uniforme era inmune 🧫 RT @KateShemirani: Oh really? Just like with radiation poisoning then. Put enough symptoms down on the diagnosis sheet and you can just abo… RT @Walletwalking1: @Sterling2143 @AAureilus Anyone noticed COVID symptoms are same as 5G exposure. What have they been rolling out in the… RT @ADDiane: Let's tell the people who won't wear masks that it's not for covid, it's for tricking the facial recognition software that dee… Discourse

In [17]:
# Learn a vocabulary from the combined text and count each term.
count_vect = CountVectorizer()
word_counts = count_vect.fit_transform(alltext)

# Printing the sparse result lists "(row, column)  count" entries.
print(word_counts)

  (0, 2849)	1
  (0, 9086)	1
  (0, 745)	1
  (0, 2300)	1
  (0, 12174)	1
  (0, 12113)	1
  (0, 7931)	1
  (0, 7223)	1
  (0, 2815)	1
  (0, 6676)	1
  (0, 9153)	1
  (0, 8663)	1
  (0, 12205)	1
  (0, 2386)	1
  (0, 10849)	1
  (0, 957)	2
  (0, 8143)	1
  (0, 4760)	1
  (0, 10591)	1
  (0, 6799)	1
  (0, 5609)	1
  (0, 7998)	1
  (0, 1957)	1
  (0, 5623)	1
  (0, 8480)	1
  :	:
  (0, 6644)	124
  (0, 6172)	141
  (0, 9484)	54
  (0, 8102)	31
  (0, 6234)	12
  (0, 3830)	223
  (0, 11759)	134
  (0, 8402)	10
  (0, 4528)	7
  (0, 2657)	3173
  (0, 3737)	512
  (0, 10225)	5
  (0, 2983)	1181
  (0, 3731)	5
  (0, 11432)	5
  (0, 3993)	4
  (0, 6768)	484
  (0, 8762)	4
  (0, 12558)	51
  (0, 11002)	13
  (0, 9172)	8
  (0, 9377)	5
  (0, 4567)	4
  (0, 6743)	3
  (0, 9881)	2108


In [9]:
# Show the fitted vocabulary_ attribute of the vectorizer.
print(count_vect.vocabulary_)



In [11]:
# The raw data is hard to read, so here's a bit of prettifying code.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement (returns an ndarray).
feature_names = count_vect.get_feature_names_out()
sort_words_by_count(word_counts, feature_names)

Unnamed: 0,count
covid,3173
5g,2953
rt,2108
the,1837
19,1710
de,1181
co,1175
https,1039
to,1009
and,935


## Take out stopwords

Common words such as 'to', 'and' and 'is' (stopwords) don't add much to the data, so we can remove them.

We can also use our own stopwords, e.g.:

```python
from sklearn.feature_extraction import text
my_stop_words = text.ENGLISH_STOP_WORDS.union(my_words)

vectorizer = TfidfVectorizer(analyzer=u'word',max_df=0.95,lowercase=True,stop_words=set(my_stop_words),max_features=15000)
X= vectorizer.fit_transform(text)
```

In [12]:
from sklearn.feature_extraction import text

# Alphabetically sorted copy of scikit-learn's built-in English stopword set.
xxx = sorted(text.ENGLISH_STOP_WORDS)
print(xxx)

['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de', 'describe', 'detail', 'do', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give

In [13]:
# Count words again, this time dropping scikit-learn's English stopwords.
count_vect = CountVectorizer(stop_words='english')
word_counts = count_vect.fit_transform(alltext)
# get_feature_names() was removed in scikit-learn 1.2; use the supported
# get_feature_names_out() instead.
feature_names = count_vect.get_feature_names_out()
# Keep only the 20 most frequent terms.
sort_words_by_count(word_counts, feature_names)[:20]

Unnamed: 0,count
covid,3173
5g,2953
rt,2108
19,1710
https,1039
que,678
la,560
el,512
del,496
los,484


## Look at bigrams

* Bigram = a pair of adjacent words
* Trigram = sometimes three adjacent words, but usually three adjacent letters (character trigrams are used to help with spelling differences, etc.)

In [14]:
# ngram_range=(2, 2) makes the vectorizer count word pairs only.
count_vect = CountVectorizer(ngram_range =(2, 2))
word_counts = count_vect.fit_transform(alltext)
# get_feature_names() was removed in scikit-learn 1.2; use the supported
# get_feature_names_out() instead.
sort_words_by_count(word_counts, count_vect.get_feature_names_out())

Unnamed: 0,count
covid 19,1690
https co,1028
con covid,582
bill gates,289
venezuela diosdado,193
los del,193
cabello con,193
añez con,193
19 los,193
diosdado cabello,193


## Look at text per tweet

In [15]:
# Count words per tweet: passing the 'text' column gives one matrix row
# per tweet instead of a single row for the whole corpus.
count_vect = CountVectorizer()
word_counts = count_vect.fit_transform(df['text'])

print(word_counts)

  (0, 3830)	1
  (0, 11759)	1
  (0, 8402)	1
  (0, 4528)	1
  (0, 2657)	1
  (0, 3737)	1
  (0, 10225)	1
  (0, 2983)	2
  (0, 3731)	1
  (0, 11432)	1
  (0, 3993)	1
  (0, 6768)	2
  (0, 8762)	1
  (0, 12558)	1
  (0, 11002)	1
  (0, 9172)	1
  (0, 9377)	1
  (0, 4567)	1
  (0, 6743)	1
  (0, 9881)	1
  (1, 427)	1
  (1, 1844)	1
  (1, 12611)	1
  (1, 815)	1
  (1, 10302)	1
  :	:
  (3832, 3988)	1
  (3832, 11007)	1
  (3832, 4050)	1
  (3832, 1960)	1
  (3832, 8504)	1
  (3832, 1798)	1
  (3832, 5517)	1
  (3832, 3800)	1
  (3832, 2307)	1
  (3832, 12049)	1
  (3832, 877)	1
  (3832, 7658)	1
  (3832, 8370)	2
  (3832, 8354)	1
  (3832, 7871)	1
  (3832, 7124)	1
  (3832, 1025)	1
  (3832, 3934)	1
  (3832, 9282)	2
  (3832, 244)	2
  (3832, 2657)	1
  (3832, 2983)	1
  (3833, 2849)	1
  (3833, 2261)	1
  (3833, 5363)	1


In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the per-tweet counts with TF-IDF (inverse document frequency on).
tf_transformer = TfidfTransformer(use_idf=True)
tf_features = tf_transformer.fit_transform(word_counts)
print(tf_features)

# Named Entity Recognition

You'll need to install spaCy (`pip install spacy`) and also download the English model:

```
python -m spacy download en_core_web_sm
```

In [18]:
import spacy

# Load the English pipeline downloaded with `python -m spacy download`.
nlp = spacy.load('en_core_web_sm')

sentence = "Bill Gates lives near Seattle"
doc = nlp(sentence)

# For each detected entity, print its text, character offsets and label.
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

Bill Gates 0 10 PERSON
Seattle 22 29 GPE
