# Scraping Twitter using Selenium

In [8]:
# Please make sure you have installed chromedriver and that it is on your PATH
import time
from urllib.parse import urlencode

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [9]:
# Search parameters for the Twitter query.
query = u'transgene'
language = u'en'
# urlencode escapes spaces, '#', '&' and non-ASCII characters, so any search
# phrase produces a valid URL (plain concatenation only worked for simple words).
url = u'https://twitter.com/search?' + urlencode(
    [('vertical', 'default'), ('q', query), ('l', language)])

browser = webdriver.Chrome()
try:
    browser.get(url)
    time.sleep(1)  # fixed pause to let the first results render

    # Page down repeatedly, pausing briefly between key presses
    # (presumably so the page has time to load further results).
    # find_element(By...) replaces the find_element_by_* helpers that were
    # removed in Selenium 4.3; it is available in older releases as well.
    body = browser.find_element(By.TAG_NAME, 'body')
    for _ in range(100):
        body.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.2)

    tweets = browser.find_elements(By.CLASS_NAME, 'tweet-text')

    # Copy the text out while the session is alive: the WebElements in
    # `tweets` are unusable once the browser has quit.
    texts = [tweet.text for tweet in tweets]
finally:
    # Always shut down Chrome and chromedriver, even if scraping failed,
    # so repeated runs do not leave orphaned browser processes behind.
    browser.quit()

In [10]:
# Number of tweet texts collected by the scraping cell
len(texts)

173

In [11]:
# Display the collected tweet texts to eyeball what was scraped
texts

['Landing pads for transgene insertion. A nice example here in tomato. ',
 'The DFR locus: A smart landing pad for targeted transgene insertion in tomato (coll IJPB, SPS) | @scoopit https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0208395&utm_medium=social&utm_source=twitter …',
 'The DFR locus: A smart landing pad for targeted transgene insertion in tomato (coll IJPB, SPS) | @scoopit',
 'Nucleotide sequence and pharmaceutical composition based thereon with prolonged VEGF transgene expression ',
 'Very happy to share our first paper on genome editing by CRISPR-Cas9. Deletion and reconstruction with targeted insertion of the DFR gene in tomato through HDR. The DFR locus: A smart landing pad for targeted transgene insertion in tomato http://dx.plos.org/10.1371/journal.pone.0208395 …',
 'weirdly no, I think that the transgene is slowing down growth.',
 'Fock state-based method for ultraluminous transgene',
 '#Transgene Announces Upcoming #InvestorMeetings $TNG http://www.

In [12]:
# One-column frame holding the raw tweet text, followed by a quick preview.
df = pd.DataFrame({'Contents': texts})
df.head()

Unnamed: 0,Contents
0,Landing pads for transgene insertion. A nice e...
1,The DFR locus: A smart landing pad for targete...
2,The DFR locus: A smart landing pad for targete...
3,Nucleotide sequence and pharmaceutical composi...
4,Very happy to share our first paper on genome ...


# Build the model based on SentiWordNet (NLTK): a word-list-based, unsupervised approach

In [13]:
from nltk.corpus import sentiwordnet as swn
from nltk import word_tokenize,pos_tag

def analyze_sentiment_sentiwordnet_lexicon(review, verbose=False):
    """Classify a text as 'positive' or 'negative' from SentiWordNet scores.

    The text is tokenized and POS-tagged. Every token tagged as a noun, verb,
    adjective or adverb that has at least one SentiWordNet entry contributes
    the positive, negative and objectivity scores of the first senti-synset
    returned for it. The summed scores are averaged over the number of
    contributing tokens.

    Parameters
    ----------
    review : str
        Text to classify.
    verbose : bool, default False
        When True, also print a one-row table with the predicted sentiment
        and the normalized objectivity, positive, negative and overall scores.

    Returns
    -------
    str
        'positive' when the normalized (positive - negative) score is >= 0,
        otherwise 'negative'. A text with no scorable token (empty string,
        only URLs, only unknown words) has an overall score of 0 and is
        therefore reported as 'positive' instead of raising
        ZeroDivisionError.
    """
    # POS-tag fragment -> WordNet part of speech, tried in this order.
    tag_to_wordnet_pos = (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r'))

    pos_score = neg_score = obj_score = 0.0
    token_count = 0

    for word, tag in pos_tag(word_tokenize(review)):
        ss_set = None
        for fragment, wordnet_pos in tag_to_wordnet_pos:
            if fragment not in tag:
                continue
            # Query SentiWordNet once per candidate part of speech and keep
            # the first entry when there is one.
            synsets = list(swn.senti_synsets(word, wordnet_pos))
            if synsets:
                ss_set = synsets[0]
                break
        if ss_set is None:
            continue
        pos_score += ss_set.pos_score()
        neg_score += ss_set.neg_score()
        obj_score += ss_set.obj_score()
        token_count += 1

    def _normalize(total):
        # Average over the scored tokens; 0.0 when nothing could be scored.
        if token_count == 0:
            return 0.0
        return round(float(total) / token_count, 2)

    # final scores
    norm_final_score = _normalize(pos_score - neg_score)
    final_sentiment = 'positive' if norm_final_score >= 0 else 'negative'

    if verbose:
        # MultiIndex.from_product replaces the `labels=` constructor keyword,
        # which was renamed to `codes` and is rejected by current pandas.
        header = pd.MultiIndex.from_product(
            [['SENTIMENT STATS:'],
             ['Predicted Sentiment', 'Objectivity',
              'Positive', 'Negative', 'Overall']])
        sentiment_frame = pd.DataFrame(
            [[final_sentiment, _normalize(obj_score), _normalize(pos_score),
              _normalize(neg_score), norm_final_score]],
            columns=header)
        # The table used to be built and then thrown away; show it so that
        # verbose=True has a visible effect.
        print(sentiment_frame)

    return final_sentiment

In [14]:
import nltk
# Fetch the NLTK resources used by analyze_sentiment_sentiwordnet_lexicon so
# the notebook also runs on a machine with an empty nltk_data directory:
#   punkt                       -> word_tokenize
#   averaged_perceptron_tagger  -> pos_tag
#   sentiwordnet                -> the `swn` corpus reader
# Packages that are already up to date are skipped by nltk.download().
# NOTE(review): recent NLTK releases appear to look for 'punkt_tab' and
# 'averaged_perceptron_tagger_eng' instead; add them here if a LookupError
# names those resources.
for resource in ('punkt', 'averaged_perceptron_tagger', 'sentiwordnet'):
    nltk.download(resource)
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Kenny\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [15]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory; the recorded
# output shows the download is skipped when the package is already up to date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Kenny\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
# Run the SentiWordNet classifier on every row of 'Contents' and store the
# returned 'positive' / 'negative' string in a new 'Results' column.
df['Results'] = df['Contents'].apply(analyze_sentiment_sentiwordnet_lexicon)

In [17]:
# Encode the sentiment labels as integers: 'negative' -> 1, 'positive' -> 0.
# Both masks are computed before any write and select by explicit label, so
# re-running this cell on already-encoded values leaves them unchanged.
msk = df['Results'] == 'positive'
is_negative = df['Results'] == 'negative'
# .loc assigns into df itself. The chained form df['Results'][mask] = value
# writes through an intermediate Series, which raises SettingWithCopyWarning
# and does not update df when pandas Copy-on-Write is enabled.
df.loc[is_negative, 'Results'] = 1
df.loc[msk, 'Results'] = 0

In [18]:
# Persist the first 100 labelled tweets, then preview the top ten rows.
first_hundred = df.iloc[:100]
first_hundred.to_csv('Q5_output.csv')
df.head(10)

Unnamed: 0,Contents,Results
0,Landing pads for transgene insertion. A nice e...,0
1,The DFR locus: A smart landing pad for targete...,0
2,The DFR locus: A smart landing pad for targete...,0
3,Nucleotide sequence and pharmaceutical composi...,0
4,Very happy to share our first paper on genome ...,0
5,"weirdly no, I think that the transgene is slow...",0
6,Fock state-based method for ultraluminous tran...,0
7,#Transgene Announces Upcoming #InvestorMeeting...,0
8,"Thank you for speaking up, Doctor. It is frust...",1
9,Research – bioRxiv Pre-print – Use of MYB as a...,1
