# Twitter - Get Polarity
In this notebook, we use TextBlob to compute the polarity of each tweet and store it in a new sentiment column (`SA`) of our dataset. This column is an input to the regression section.

## Preparing data

In [1]:
# Basic Imports
import tweepy
import pandas as pd     
import numpy as np 
import time
import os
import re

# Plotting and Visualization
from IPython.display import display
from IPython.display import clear_output
import matplotlib.pyplot as plt
plt.rcParams["font.family"]="STSong" # for japanese
import seaborn as sns
%matplotlib inline

# TextBlob Imports
from textblob import TextBlob
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

# NLTK Imports
import nltk
#nltk.download('punkt')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

#Graphs
import plotly as py
import plotly.graph_objs as go
import plotly.express as px


#MAP
from geopy.geocoders import Nominatim
import folium
from folium import plugins
from geopy.geocoders import Nominatim

In [2]:
# Read the raw tweet export into a DataFrame and preview its first rows
tweets_csv_path = 'sample_data/tweets_output.csv'
df = pd.read_csv(tweets_csv_path)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,created_at,id,id_str,full_text,source,truncated,in_reply_to_status_id,in_reply_to_status_id_str,in_reply_to_user_id,in_reply_to_user_id_str,...,matching_rules,current_user_retweet,scopes,withheld_copyright,withheld_in_countries,withheld_scope,geo,contributors,display_text_range,quoted_status_permalink
0,Sun Mar 08 14:32:50 +0000 2020,1236661157877800961,1236661157877800961,RT @senrobportman: One thing the threat of the...,"<a href=""http://twitter.com/download/iphone"" r...",False,,,,,...,,,,,,,,,"[0, 140]",
1,Mon Mar 09 19:38:29 +0000 2020,1237100463444672520,1237100463444672520,RT @biby__: Mon beau-frère présente tous les s...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,,"[0, 140]",
2,Tue Mar 10 00:32:22 +0000 2020,1237174421040893954,1237174421040893954,RT @olivierveran: #SoireeBFMTV | Nous avons un...,"<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,,"[0, 139]",
3,Wed Mar 11 11:42:46 +0000 2020,1237705522159239169,1237705522159239169,"RT @ABCCardinal: #Coronavirus #CoronavirusPy ""...","<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,,"[0, 140]",
4,Wed Mar 11 13:04:57 +0000 2020,1237726202896531456,1237726202896531456,"RT @CoronaVid19: Ya os lo digo yo, ¡PARA JODER!","<a href=""http://twitter.com/download/android"" ...",False,,,,,...,,,,,,,,,"[0, 47]","{'url': 'https://t.co/VGGCbGnCkP', 'expanded':..."


In [3]:
# Report the dataset size and the available columns.
# A bare `len(df)` that is not the last expression of a cell is evaluated but
# never displayed, so the row count is printed explicitly.
print(len(df))
print(df.columns)

Index(['created_at', 'id', 'id_str', 'full_text', 'source', 'truncated',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'coordinates', 'place',
       'quoted_status_id', 'quoted_status_id_str', 'is_quote_status',
       'quoted_status', 'retweeted_status', 'quote_count', 'reply_count',
       'retweet_count', 'favorite_count', 'entities', 'extended_entities',
       'favorited', 'retweeted', 'possibly_sensitive', 'filter_level', 'lang',
       'matching_rules', 'current_user_retweet', 'scopes',
       'withheld_copyright', 'withheld_in_countries', 'withheld_scope', 'geo',
       'contributors', 'display_text_range', 'quoted_status_permalink'],
      dtype='object')


In [4]:
# Organizing dataframe: keep only the columns used later, under friendlier names.
#
# The values are copied straight from the pandas columns instead of going
# through np.array([...]) on a Python list: NumPy coerces a list that mixes
# strings and NaN into a fixed-width string array, which turns missing values
# into the literal text 'nan'. .to_numpy() keeps real NaN values and the
# original dtypes, and still gives tweets_df its own fresh 0..n-1 index.
tweets_df = pd.DataFrame({
    'Tweets': df['full_text'].to_numpy(),
    # .str.len() yields NaN for a missing tweet text instead of raising TypeError
    'len': df['full_text'].str.len().to_numpy(),
    'ID': df['id'].to_numpy(),
    'Date': df['created_at'].to_numpy(),
    'Source': df['source'].to_numpy(),
    'Likes': df['favorite_count'].to_numpy(),
    'RTs': df['retweet_count'].to_numpy(),
    'User Location': df['place'].to_numpy(),
    'Geo': df['geo'].to_numpy(),
    'Coordinates': df['coordinates'].to_numpy(),
})

tweets_df.head()

Unnamed: 0,Tweets,len,ID,Date,Source,Likes,RTs,User Location,Geo,Coordinates
0,RT @senrobportman: One thing the threat of the...,140,1236661157877800961,Sun Mar 08 14:32:50 +0000 2020,"<a href=""http://twitter.com/download/iphone"" r...",0,883,,,
1,RT @biby__: Mon beau-frère présente tous les s...,140,1237100463444672520,Mon Mar 09 19:38:29 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",0,807,,,
2,RT @olivierveran: #SoireeBFMTV | Nous avons un...,139,1237174421040893954,Tue Mar 10 00:32:22 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",0,4048,,,
3,"RT @ABCCardinal: #Coronavirus #CoronavirusPy ""...",140,1237705522159239169,Wed Mar 11 11:42:46 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",0,5,,,
4,"RT @CoronaVid19: Ya os lo digo yo, ¡PARA JODER!",47,1237726202896531456,Wed Mar 11 13:04:57 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",0,200,,,


In [5]:
# Top Searched Tweets: find the most liked and the most retweeted tweet
fav_max = tweets_df['Likes'].max()
rt_max = tweets_df['RTs'].max()

# First row label at which each maximum occurs
fav = tweets_df.index[tweets_df['Likes'] == fav_max][0]
rt = tweets_df.index[tweets_df['RTs'] == rt_max][0]

# Max FAVs:
print("O tweet com mais curtidas é: \n{}".format(tweets_df.loc[fav, 'Tweets']))
print("Número de curtidas: {}".format(fav_max))
print("\n")
# Max RTs:
print("O tweet com mais retweet é: \n{}".format(tweets_df.loc[rt, 'Tweets']))
print("Número de retweets: {}".format(rt_max))

O tweet com mais curtidas é: 
Koşa koşa umreye gideceğinize birleşip bu çocuklara yurt yapsaydınızya gavatlar. https://t.co/cvglHZ8VYk
Número de curtidas: 36630


O tweet com mais retweet é: 
RT @NicholsUprising: Sicily has figured out this whole self-isolation thing.

#COVID19 #CoronavirusPandemic 

https://t.co/93whPVtQcR
Número de retweets: 201457


## Pre-processing Tweets

In [14]:
def clean_tweet(tweet):
    """Return `tweet` reduced to ASCII letters, digits and single spaces.

    Three kinds of text are replaced by a space before whitespace is
    collapsed:

    * ``@`` followed by ASCII letters/digits (user mentions),
    * any character that is not an ASCII letter, a digit, a space or a tab
      (this also drops accented and other non-ASCII letters),
    * ``scheme://...`` runs up to the next whitespace (URLs).

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``""`` if nothing is left.
    """
    # Raw string literal: the pattern is unchanged, but "\w", "\/" and "\S"
    # are invalid escape sequences in a normal string and trigger a warning
    # on current Python versions.
    pattern = r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"
    return ' '.join(re.sub(pattern, " ", tweet).split())

## Translating tweets to English

In [6]:
from googletrans import Translator
import httpx
timeout = httpx.Timeout(5) # timeout of 5 passed to the translator (5 seconds)

# Create the module-level translator (also used by translate_tweet) and
# smoke-test it on a Portuguese sentence; the translated text is the cell output.
translator = Translator(timeout=timeout)
translation = translator.translate('Estamos estudando Ciência dos Dados')
translation.text

'We are studying Data Science'

In [7]:
def translate_tweet(tweet):
    """Translate `tweet` with the notebook-level `translator` and return the text.

    No destination language is passed, so the translator's default target
    is used.
    """
    translated = translator.translate(tweet)
    return translated.text

# Sentiment Prediction

## Training model

In [8]:
# Sentiment analysis in Portuguese and English:
# train a Naive Bayes classifier on the bracketed entries found in the files
# under ./ReLi-Lex.

base_path = './ReLi-Lex'
train = []
wordsPT = []
wordsPT_sentiments = []

# Sorted so the training data is assembled in the same order on every run
# (os.listdir returns entries in arbitrary order).
files = [os.path.join(base_path, f) for f in sorted(os.listdir(base_path))]

# Entries are the "[...]" tokens; the group captures the text between the
# brackets. Raw string: "\[" is an invalid escape in a normal string literal.
entry_pattern = re.compile(r'\[(.*?)\]')

for file in files:
    # Paths containing '_Positivos' are labelled +1; every other file is labelled -1
    t = 1 if '_Positivos' in file else -1
    with open(file, 'r', encoding='latin-1') as content_file:
        content = content_file.read()
    # The loop variable is no longer called `all`, which shadowed the builtin
    # for the rest of the kernel session.
    for w in entry_pattern.findall(content):
        wordsPT.append(w)
        wordsPT_sentiments.append(t)
        train.append((w, t))

cl = NaiveBayesClassifier(train)
cl

<NaiveBayesClassifier trained on 609 instances>

## TextBlob

In [9]:
def sentiment(tweet):
    """Classify a tweet as positive (1), negative (-1) or neutral (0).

    The tweet is cleaned with `clean_tweet`, wrapped in a TextBlob that uses
    the trained classifier `cl`, and every sentence is classified. The labels
    are summed and the sign of the total is returned.
    """
    blob = TextBlob(clean_tweet(tweet), classifier=cl)
    total = sum(sentence.classify() for sentence in blob.sentences)

    if total > 0:
        return 1
    if total < 0:
        return -1
    return 0

In [10]:
def analize_sentimentEN(tweet):
    """Return the TextBlob polarity of a tweet, translating it first if needed.

    The tweet is cleaned with `clean_tweet`; if fewer than four characters
    survive, the raw text is analysed instead. When the detected language is
    not English, the raw tweet is translated with `translate_tweet` and the
    polarity of the translation is returned.

    Language detection uses the notebook-level googletrans `translator`:
    `TextBlob.detect_language()` was deprecated and later removed from
    TextBlob, so calling it fails on current installations.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    float
        `TextBlob(...).sentiment.polarity` of the analysed text; 0.0 for an
        empty or whitespace-only tweet (no detection request is made).
    """
    clean = clean_tweet(tweet)
    # Fall back to the raw text when cleaning leaves (almost) nothing behind
    text = clean if len(clean) > 3 else tweet

    # Nothing to detect or score; avoids a pointless request to the service
    if not text.strip():
        return 0.0

    if translator.detect(text).lang != 'en':
        # Translate the raw tweet, not the cleaned text: cleaning strips
        # non-ASCII letters, which would garble non-English input.
        text = translate_tweet(tweet)

    return TextBlob(text).sentiment.polarity

In [11]:
# Initialise SA with one slot per tweet, each holding the placeholder value 5
# (the value the later cells treat as "not analysed yet")
SA = [5] * len(tweets_df['Tweets'])

In [16]:
def getLastAnalysis(SA):
    """Count the entries of `SA` that no longer hold the placeholder value 5.

    The count is taken over `SA` itself rather than over the length of the
    global `tweets_df`, so the function works for any list and cannot raise
    IndexError or miscount when the two lengths differ.

    Parameters
    ----------
    SA : sequence
        Sentiment scores, with 5 marking entries that were not analysed yet.

    Returns
    -------
    int
        Number of entries different from 5.
    """
    return sum(1 for score in SA if score != 5)
        
# Number of entries of SA already scored; the scoring loop uses the same value as its start index
print(getLastAnalysis(SA))

12344


In [15]:
# Score the tweets that are still pending, starting after the entries already
# done. SA is filled in place, so this cell can be interrupted and re-run to
# continue where it stopped.
# The hard-coded cap of 20000 is kept, but the loop never runs past the end of
# SA: the fixed bound alone raised IndexError when SA had fewer than 20000 items.
stop = min(20000, len(SA))
for i in range(getLastAnalysis(SA), stop):
    if SA[i] == 5:
        SA[i] = analize_sentimentEN(tweets_df['Tweets'][i])

KeyboardInterrupt: 

In [17]:
# Attach the scores (and any remaining placeholder values) as the 'SA' column
tweets_df = tweets_df.assign(SA=np.asarray(SA))
tweets_df.head()

Unnamed: 0,Tweets,len,ID,Date,Source,Likes,RTs,User Location,Geo,Coordinates,SA
0,RT @senrobportman: One thing the threat of the...,140,1236661157877800961,Sun Mar 08 14:32:50 +0000 2020,"<a href=""http://twitter.com/download/iphone"" r...",0,883,,,,0.05
1,RT @biby__: Mon beau-frère présente tous les s...,140,1237100463444672520,Mon Mar 09 19:38:29 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",0,807,,,,0.0
2,RT @olivierveran: #SoireeBFMTV | Nous avons un...,139,1237174421040893954,Tue Mar 10 00:32:22 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",0,4048,,,,0.3
3,"RT @ABCCardinal: #Coronavirus #CoronavirusPy ""...",140,1237705522159239169,Wed Mar 11 11:42:46 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",0,5,,,,0.25
4,"RT @CoronaVid19: Ya os lo digo yo, ¡PARA JODER!",47,1237726202896531456,Wed Mar 11 13:04:57 +0000 2020,"<a href=""http://twitter.com/download/android"" ...",0,200,,,,-0.5


In [18]:
# Keep only the rows whose 'SA' value is not the placeholder 5, then show how many remain
scored_mask = tweets_df['SA'].ne(5)
tweets_df = tweets_df[scored_mask]
len(tweets_df)

12344

In [20]:
# Display the column labels of tweets_df
tweets_df.columns

Index(['Tweets', 'len', 'ID', 'Date', 'Source', 'Likes', 'RTs',
       'User Location', 'Geo', 'Coordinates', 'SA'],
      dtype='object')

# Exporting DataFrame to CSV

In [19]:
# Write tweets_df to disk without the DataFrame index
output_csv = r'sample_data/polarity_dataset.csv'
tweets_df.to_csv(output_csv, index=False)