# TASAM: Sentiment Analysis, Data Acquisition & Understanding

## 1. Import libraries and load data

In [1]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')


import nltk
import string
import tweepy
import twitter
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from dotenv import load_dotenv
from pathlib import Path

# Location of the .env file that holds the Twitter API credentials.
# The path can be overridden with the TWITTER_KEYS_ENV environment variable so
# the notebook is not tied to a single machine; the original absolute path is
# kept as the default, so existing setups behave exactly as before.
env_path = Path(os.getenv(
    'TWITTER_KEYS_ENV',
    '/Users/alejandroarrya/Desktop/Ironhack/DAPTMX/keys/twitter_keys.env',
))
# Kept as the cell's last expression so the notebook shows load_dotenv's return value.
load_dotenv(dotenv_path=env_path)

True

In [2]:
# Read the merged advisory table; the first CSV column is used as the index.
common = pd.read_csv(filepath_or_buffer='./data/dos_ta_merged.csv', index_col=0)
# Preview the first five rows.
common.head(n=5)

Unnamed: 0,Name,ISO_TA,Score_TA,Status_TA,Last Update_TA,Source_TA,Score_DOS,Status_DOS,Last Update_DOS,Source_DOS
0,Afghanistan,AF,5.0,Do Not Travel,2020-03-11,https://www.travel-advisory.info/afghanistan,4.0,Do Not Travel,2019-10-22,http://travel.state.gov/content/travel/en/trav...
1,Albania,AL,1.5,Exercise Normal Precautions,2020-03-11,https://www.travel-advisory.info/albania,1.0,Exercise Normal Precautions,2019-07-10,http://travel.state.gov/content/travel/en/trav...
2,Algeria,DZ,2.8,Exercised Increased Caution,2020-03-11,https://www.travel-advisory.info/algeria,2.0,Exercise Increased Caution,2019-04-09,http://travel.state.gov/content/travel/en/trav...
3,Andorra,AD,1.3,Exercise Normal Precautions,2020-03-11,https://www.travel-advisory.info/andorra,1.0,Exercise Normal Precautions,2019-08-27,http://travel.state.gov/content/travel/en/trav...
4,Angola,AO,2.0,Exercise Normal Precautions,2020-03-11,https://www.travel-advisory.info/angola,1.0,Exercise Normal Precautions,2019-04-09,http://travel.state.gov/content/travel/en/trav...


## 2. Authenticate with the Twitter API

In [3]:
# Read the four Twitter credentials from the environment in a single pass.
# A missing variable yields None, exactly as a direct os.getenv() call would.
(CONSUMER_KEY,
 CONSUMER_SECRET,
 ACCESS_TOKEN,
 ACCESS_TOKEN_SECRET) = (
    os.getenv(credential_name)
    for credential_name in (
        'CONSUMER_KEY',
        'CONSUMER_SECRET',
        'ACCESS_TOKEN',
        'ACCESS_TOKEN_SECRET',
    )
)

# Build the auth handler from the consumer pair, then attach the access token pair.
auth = tweepy.OAuthHandler(CONSUMER_KEY, CONSUMER_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# API client configured to wait, and print a notice, whenever a rate limit is hit.
tw = tweepy.API(
    auth,
    wait_on_rate_limit=True,
    wait_on_rate_limit_notify=True,
)

In [4]:
# Confirm the credentials before running any searches.
try:
    # Older tweepy releases return False (instead of raising) when the
    # credentials are rejected, so the return value has to be checked too.
    if tw.verify_credentials():
        print("Authentication OK")
    else:
        print("Error during authentication")
except Exception as error:
    # `Exception` rather than a bare `except:` so KeyboardInterrupt/SystemExit
    # still propagate; the cause is reported instead of being swallowed.
    print(f"Error during authentication: {error!r}")

Authentication OK


## 3. Extract data

In [56]:
# Search clause restricting results to tweets posted by these news accounts.
# The handles are joined into a single `from:a OR from:b OR ...` expression.
sources = ' OR '.join(
    f'from:{handle}'
    for handle in (
        'USATODAY', 'WSJ', 'nytimes', 'washingtonpost', 'latimes',
        'chicagotribune', 'guardian', 'thetimes', 'Telegraph',
        'FinancialTimes', 'SCMPNews', 'timesofindia', 'japantimes', 'smh',
        'dwnews', 'BBCWorld', 'AP', 'globeandmail', 'Reuters',
    )
)


In [76]:
# Collect the rows in a plain list and build the DataFrame once at the end.
# DataFrame.append returns a NEW frame, so calling it without re-assigning
# discarded every result and left tw_corpus empty; it is also quadratic when
# used inside a loop.
corpus_rows = []

for name in common.Name.values:
    # One query per country: the country name plus the news-source filter.
    search_query = f'{name} {sources}'
    corpus = tw.search(q=search_query, count=100,
                       result_type="recent",
                       lang="en",
                       include_entities=False,
                       tweet_mode='extended')
    # Two rows per country (tweets first, then the name): this is the layout
    # the cleaning step re-aligns with `Name.shift(-1)`.
    corpus_rows.append({'Tweets': [tweet.full_text for tweet in corpus]})
    corpus_rows.append({'Name': name})

tw_corpus = pd.DataFrame(corpus_rows, columns=['Tweets', 'Name'])


Rate limit reached. Sleeping for: 554


## 4. Clean data

In [103]:
# Each country occupies two consecutive rows (tweet list, then name); moving
# Name up by one row puts the name next to its tweet list.
tw_corpus['Name'] = tw_corpus['Name'].shift(-1)
# The rows that only carried a name are now entirely empty and can go.
tw_corpus = tw_corpus.dropna(axis=0, how='all')
# 'level_0' / 'index' are not created by any cell in this notebook (presumably
# leftovers of reset_index() calls in a removed cell); dropping them only when
# present keeps a fresh Restart & Run All from failing with a KeyError.
tw_corpus = tw_corpus.drop(['level_0', 'index'], axis=1, errors='ignore')
# Expand each tweet list into columns indexed by country, then stack back into
# one row per tweet with the columns ordered as ['Tweets', 'Name'].
tw_corpus = (
    pd.DataFrame(tw_corpus.Tweets.tolist(), index=tw_corpus.Name)
    .stack()
    .reset_index(level=1, drop=True)
    .reset_index(name='Tweets')
    [['Tweets', 'Name']]
)

In [156]:
def clean_text(text):
    """Strip Twitter-specific markup from a tweet.

    Removes, in this order: @mentions, the '#' character of hashtags (the
    hashtag word itself is kept), the 'RT' retweet marker, and http/https
    links.

    Args:
        text: Raw tweet text.

    Returns:
        The tweet text with the markup removed. Surrounding whitespace is
        left untouched.
    """
    # Handles may contain underscores; without '_' in the class a mention such
    # as '@BBC_World' would leave '_World' behind.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    text = re.sub(r'#', '', text)
    # The word boundary stops the pattern from eating the end of ordinary
    # words followed by a space (e.g. 'ART fair' -> 'Afair').
    text = re.sub(r'\bRT\s+', '', text)
    text = re.sub(r'https?:\/\/\S+', '', text)

    return text

In [157]:
# Run the regex-based cleanup over every tweet and store the result back in the column.
tw_corpus['Tweets'] = tw_corpus['Tweets'].map(clean_text)

In [160]:
# Show the first ten rows to spot-check the cleaned tweets.
tw_corpus.head(n=10)

Unnamed: 0,Tweets,Name
0,The coronavirus doesn’t fit the templates of 9...,Afghanistan
1,The man with the gun in his hand had jail time...,Afghanistan
2,Pakistan's interior ministry said it will clos...,Afghanistan
3,Afghanistan’s Kashmir Fallout\n\nHow will like...,Afghanistan
4,"Afghanistan is preparing to release 1,500 Tali...",Afghanistan
5,Afghan President Ashraf Ghani signed a decree ...,Afghanistan
6,The Taliban have overtaken Islamic State as th...,Afghanistan
7,President Ashraf Ghani of Afghanistan ordered ...,Afghanistan
8,Afghan government to release Taliban prisoners...,Afghanistan
9,Hundreds of British troops will be withdrawn f...,Afghanistan


In [161]:
# Text emoticons that read as positive.
emoticons_happy = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}

# Text emoticons that read as negative.
emoticons_sad = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}

# Matches runs of emoji characters.
# The last entry used to be the single range U+24C2..U+1F251, which spans
# roughly 60k code points and therefore also matched ordinary CJK/Hangul text.
# It is narrowed here to the circled "M" (U+24C2) and the enclosed ideographic
# supplement (U+1F200..U+1F251).
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
         u"\U00002702-\U000027B0"  # dingbats
         u"\U000024C2"             # circled latin capital M
         u"\U0001F200-\U0001F251"  # enclosed ideographic supplement
         "]+", flags=re.UNICODE)

# Every known emoticon, regardless of polarity.
emoticons = emoticons_happy.union(emoticons_sad)

In [212]:
def clean_tweets(tweet):
    """Normalise a tweet and drop stop words, emoticons and punctuation.

    The text is cleaned first (colons, a mis-encoded ellipsis sequence,
    non-ASCII characters and emoji are removed or blanked) and only then
    tokenised, so the cleanup actually affects the returned tokens.

    Args:
        tweet: Tweet text, typically already passed through `clean_text`.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = set(stopwords.words('english'))

    # Character-level cleanup. This must run BEFORE tokenising: the previous
    # version tokenised the raw text first, so these substitutions had no
    # effect on the result.
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'‚Ä¶', '', tweet)
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
    tweet = emoji_pattern.sub(r'', tweet)

    word_tokens = word_tokenize(tweet)

    filtered_tweet = [
        w for w in word_tokens
        # The NLTK stop-word list is lowercase; compare case-insensitively so
        # that e.g. a leading "The" is removed as well.
        if w.lower() not in stop_words
        and w not in emoticons
        # Drop tokens made up solely of punctuation, which also covers
        # multi-character ones such as '...' or the tokenizer's quote marks.
        and not all(ch in string.punctuation for ch in w)
    ]

    return ' '.join(filtered_tweet)

In [213]:
# Tokenise and filter every tweet, writing the result back into the same column.
tw_corpus['Tweets'] = tw_corpus['Tweets'].map(clean_tweets)

## 5. Export data

In [223]:
# Persist the cleaned corpus as CSV.
# NOTE(review): the destination is an absolute, user-specific path while the
# input is read from a relative ./data folder; consider a relative path.
tw_corpus.to_csv(
    path_or_buf='/Users/alejandroarrya/Desktop/Ironhack/DAPTMX/final_project/TASAM_final_project/data/tw_corpus.csv'
)

