### Cleaning of tweets

In [20]:
import pandas as pd
import numpy as np

In [32]:
df=pd.read_csv('Covid19/March.csv')

In [33]:
import re #regular expressions
import string 
from nltk.corpus import stopwords
from nltk import word_tokenize

In [34]:
#HappyEmoticons
emoticons_happy = set([
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3'
    ])

# Sad Emoticons
emoticons_sad = set([
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';('
    ])

#Emoji patterns
emoji_pattern = re.compile("["
         u"\U0001F600-\U0001F64F"  # emoticons
         u"\U0001F300-\U0001F5FF"  # symbols & pictographs
         u"\U0001F680-\U0001F6FF"  # transport & map symbols
         u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
         u"\U00002702-\U000027B0"
         u"\U000024C2-\U0001F251"
         "]+", flags=re.UNICODE)


#combine sad and happy emoticons
emoticons = emoticons_happy.union(emoticons_sad)

def clean_tweets(tweet):
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(tweet)
    #removing mentions
    tweet = re.sub(r':', '', tweet)
    tweet = re.sub(r'‚Ä¶', '', tweet)
#replace consecutive non-ASCII characters with a space
    tweet = re.sub(r'[^\x00-\x7F]+',' ', tweet)
#remove emojis from tweet
    tweet = emoji_pattern.sub(r'', tweet)
#filter using NLTK library append it to a string
    filtered_tweet = [w for w in word_tokens if not w in stop_words]
    filtered_tweet = []
#looping through conditions
    for w in word_tokens:
#check tokens against stop words , emoticons and punctuations
        if w not in stop_words and w not in emoticons and w not in string.punctuation:
            filtered_tweet.append(w)
    return ' '.join(filtered_tweet)

In [35]:
#replace tabs and spaces
df['Text']= df['Text'].replace({'\n':" ","\t":" "})
df['Text']

0        In the war against the global epidemic #corona...
1        Dear liberals, Please do not pray for Corona v...
2        Sir there is not treatment available for coron...
3        Replying to #WHO "Coronavirus confirmed as pan...
4        #jharkhand @HemantSorenJMM @WeAreRanchi @WeAre...
                               ...                        
42313    The members of Tableegi jamaat in Sopore hosip...
42314    Iranian state TV said Saturday the new coronav...
42315    #Breaking | 5 more cases of Coronavirus have b...
42316    CORONAVIRUS I'm not scared of the virus, I'm j...
42317    Asalamu Alikum Good decision makers can make i...
Name: Text, Length: 42318, dtype: object

In [8]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aamir\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aamir\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [36]:
import dask.dataframe as dsk
import multiprocessing as mp
from dask.multiprocessing import get
ddf = dsk.from_pandas(df,npartitions=4*mp.cpu_count())

In [37]:
df['Text1'] = df['Text'].apply(clean_tweets) #preprocessing using user defined functions
df.columns

Index(['Unnamed: 0', 'Place', 'Query', 'Datetime', 'Text', 'retweets',
       'favourites', 'hashtags', 'Text1'],
      dtype='object')

In [38]:
df['Text'] = df['Text1']
df = df.drop('Text1', axis = 1)

### Calculating subjectivity and Polarity

In [39]:
from textblob import TextBlob
def getSubjectivity(text):
    return TextBlob(text).sentiment.subjectivity
def getPolarity(text):
    return TextBlob(text).sentiment.polarity

In [40]:
df['Subjectivity']=df['Text'].apply(getSubjectivity)
df['Polarity']=df['Text'].apply(getPolarity)

In [41]:
df.columns

Index(['Unnamed: 0', 'Place', 'Query', 'Datetime', 'Text', 'retweets',
       'favourites', 'hashtags', 'Subjectivity', 'Polarity'],
      dtype='object')

In [42]:
df.head()

Unnamed: 0.1,Unnamed: 0,Place,Query,Datetime,Text,retweets,favourites,hashtags,Subjectivity,Polarity
0,0,Jharkhand,coronavirus,2020-03-29 16:23:31+00:00,In war global epidemic coronavirus I donated ₹...,1,9,#coronavirus #PMCARES,0.0,0.0
1,1,Jharkhand,coronavirus,2020-03-18 05:40:36+00:00,Dear liberals Please pray Corona virus cases g...,0,0,,0.4,-0.15
2,2,Jharkhand,coronavirus,2020-03-19 08:09:44+00:00,Sir treatment available corona virus Jharkhand,0,0,,0.4,0.4
3,3,Jharkhand,covid19,2020-03-11 18:05:59+00:00,Replying WHO `` Coronavirus confirmed pandemic...,0,2,#WHO #COVID19,0.597222,0.119444
4,4,Jharkhand,coronavirus,2020-03-26 08:26:04+00:00,jharkhand HemantSorenJMM WeAreRanchi WeAreDhan...,0,0,#jharkhand,0.225,-0.058333


In [43]:
df.to_csv('NewMARCH.csv')

### Labelling tweets with IBM Watson Tone Analyzer

In [None]:
from ibm_watson import ToneAnalyzerV3
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator

version = '2020-07-15'
ibmapi = 'apikey'
urlf = 'given urlf from ibm'
authenticator = IAMAuthenticator(ibmapi)
tone_analyzer = ToneAnalyzerV3(
    version=version,
    authenticator=authenticator
)

tone_analyzer.set_service_url(urlf)

import json 
import pandas as pd

df = pd.read_csv("all.csv")
df = pd.DataFrame(df)
df['Text'] = df['Text'].astype(str)

# change the below line
df = df.iloc[start:end] # ---- changes this ----

i = 0
text = "Dear liberals Please pray Corona virus cases grow exponentially petty enmity current incumbent government The situation well controlled At least keep politics aside sake country Regards An Indian"
def sent(text):
    global i
    i += 1
    tone_analysis = tone_analyzer.tone(
        {'text': text},
        content_type='application/json'
    ).get_result()
#     print(json.dumps(tone_analysis, indent=2))
    x = json.dumps(tone_analysis, indent=2)
    y = json.loads(x)
    try:
        y['document_tone']['tones'][0]['tone_name']

    except:
        print(f"{i} Neutral")
        return "Neutral"
    
    
    if y['document_tone']['tones'][0]['tone_name'] == 'Tentative':
        try:
            res = y['document_tone']['tones'][1]['tone_name']
        except:
            res = "Neutral"
    else:
        res = y['document_tone']['tones'][0]['tone_name']
        
    print(f"{i} {res}")
    return res

# sent(text)
df['ibm_sentiment'] = df['Text'].apply(sent)



df.to_csv("filename.csv") # ---- and change this