In [None]:
%%capture
!pip install wordcloud
!pip install pyLDAvis
!pip install nltk

In [None]:
%%capture
import pandas as pd
import numpy as np
import nltk
import regex as re
from nltk import download, FreqDist
from nltk.tokenize import TweetTokenizer
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import gensim
import gensim.corpora as corpora
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Fetch the NLTK data packages this notebook relies on.
# NOTE(review): no use of the 'words' corpus is visible in this part of the notebook - confirm it is needed.
nltk.download('words')
# WordNet data, presumably for the WordNetLemmatizer imported above.
nltk.download('wordnet')
# Lexicon read by SentimentIntensityAnalyzer (VADER).
nltk.download('vader_lexicon')
# Stop-word lists read through stopwords.words(...).
nltk.download('stopwords')

In [None]:
# English stop-word list from the NLTK corpus; later cells extend and consult it.
stop_words = stopwords.words('english')

In [None]:
# Load the tweet export; records are split on '\n' only.
tweets_df = pd.read_csv(
    'global_warming_tweets.csv',
    lineterminator='\n',
)

In [None]:
# Keep only the columns whose header does not start with 'Unnamed'.
is_unnamed_column = tweets_df.columns.str.contains('^Unnamed')
tweets_df = tweets_df.loc[:, ~is_unnamed_column]

In [None]:
# Drop every row that has a missing value in any column. No reset_index follows,
# so the surviving rows keep their original index labels.
tweets_df = tweets_df.dropna()

In [None]:
# Display the frame after the column and missing-value cleanup.
tweets_df

In [None]:
# Remove emojis

def remove_emoji(tweet):
    """Return `tweet` with every character matched by the emoji/symbol class removed.

    The class is assembled from the code-point ranges below. Several ranges are
    much wider than "emoji": U+24C2-U+1F251 spans most of the BMP above U+24C2
    (CJK ideographs included) and U+10000-U+10FFFF spans every supplementary
    plane, so text in those scripts is stripped as well.

    tweet: the text to filter (str).
    Returns the filtered string.
    """
    code_point_ranges = (
        u"\U0001F600-\U0001F64F",  # emoticons
        u"\U0001F300-\U0001F5FF",  # symbols & pictographs
        u"\U0001F680-\U0001F6FF",  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        u"\U00002500-\U00002BEF",  # box drawing .. misc symbols and arrows
        u"\U00002702-\U000027B0",  # dingbats
        u"\U00002702-\U000027B0",  # same range listed a second time (redundant)
        u"\U000024C2-\U0001F251",  # very wide span, see docstring
        u"\U0001f926-\U0001f937",  # gesture emoji
        u"\U00010000-\U0010ffff",  # all supplementary planes
        u"\u2640-\u2642",          # gender signs
        u"\u2600-\u2B55",          # misc symbols .. heavy large circle
        u"\u200d",                 # zero-width joiner
        u"\u23cf",                 # eject symbol
        u"\u23e9",                 # fast-forward symbol
        u"\u231a",                 # watch
        u"\ufe0f",                 # variation selector-16
        u"\u3030",                 # wavy dash
    )
    symbol_matcher = re.compile("[" + "".join(code_point_ranges) + "]+", flags=re.UNICODE)
    return symbol_matcher.sub(r'', tweet)

In [None]:
# remove punctuations

def clean_tweets(tweet):
    """Strip URLs, mentions, hashtags, punctuation, digits and emoji from a tweet.

    tweet: raw tweet text (str).
    Returns the cleaned text with leading spaces removed. Case is left unchanged.
    """
    # Remove URLs first, while they are still intact. Each URL runs only to the
    # next whitespace, so the text after it is kept and path-less URLs are
    # matched too.
    tweet = re.sub(r'https?://\S+', '', tweet)

    # remove mentions
    tweet = re.sub(r'@\w*', '', tweet)

    # remove hashtags
    tweet = re.sub(r'#[A-Za-z0-9_]+', '', tweet)

    # remove punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)

    # remove numbers
    tweet = re.sub(r'\d+', '', tweet)

    tweet = remove_emoji(tweet)

    # Trim leading spaces last so that spaces exposed by the removals above are
    # trimmed as well.
    return tweet.lstrip(' ')

# Clean every tweet's text, overwriting the 'content' column.
tweets_df['content'] = tweets_df['content'].apply(clean_tweets)

In [None]:
# (rows, columns) of the frame at this point.
tweets_df.shape

In [None]:
# Generate Sentiments of the Tweets

sid = SentimentIntensityAnalyzer()

# Score every tweet once and reuse the resulting dict for all four columns
# (previously the analyzer ran four times per tweet).
vader_scores = tweets_df['content'].apply(sid.polarity_scores)

tweets_df['Positive Score'] = vader_scores.apply(lambda scores: scores['pos'])
tweets_df['Neutral Score'] = vader_scores.apply(lambda scores: scores['neu'])
tweets_df['Negative Score'] = vader_scores.apply(lambda scores: scores['neg'])
tweets_df['Polarity'] = vader_scores.apply(lambda scores: scores['compound'])

# Label by the sign of the compound score: > 0 Positive, == 0 Neutral, < 0 Negative.
tweets_df['Sentiment'] = ''

tweets_df.loc[tweets_df['Polarity'] > 0, 'Sentiment'] = 'Positive'
tweets_df.loc[tweets_df['Polarity'] == 0, 'Sentiment'] = 'Neutral'
tweets_df.loc[tweets_df['Polarity'] < 0, 'Sentiment'] = 'Negative'

In [None]:
# (rows, columns) of the tweets labelled 'Positive'.
tweets_df[tweets_df['Sentiment'] == 'Positive'].shape

In [None]:
# (rows, columns) of the tweets labelled 'Neutral'.
tweets_df[tweets_df['Sentiment'] == 'Neutral'].shape

In [None]:
# (rows, columns) of the tweets labelled 'Negative'.
tweets_df[tweets_df['Sentiment'] == 'Negative'].shape

In [None]:
# Parse the 'date' column and keep only the calendar date of each timestamp.
parsed_timestamps = pd.to_datetime(tweets_df['date'])
tweets_df['date'] = parsed_timestamps.dt.date

In [None]:
%matplotlib inline
# Number of tweets per calendar year, plotted as a trend line.
years = [2015, 2016, 2017, 2018, 2019, 2020]

# Extract the year of every tweet once instead of rebuilding the
# DatetimeIndex and filtering the frame for each year.
tweet_years = pd.DatetimeIndex(tweets_df['date']).year
tweets_list = [int((tweet_years == year).sum()) for year in years]

plt.xlabel('Years')
plt.ylabel('Tweets')
plt.title('Increase in #climatechange and #globalwarming Tweets')

plt.plot(years, tweets_list)


In [None]:
# Yearly tweet counts split by sentiment label, one line per label.
# The year of every tweet is extracted once and shared by all counts.
tweet_years = pd.DatetimeIndex(tweets_df['date']).year


def _count_per_year(sentiment):
    """Return, for each entry of `years`, the number of tweets labelled `sentiment`."""
    has_sentiment = (tweets_df['Sentiment'] == sentiment).values
    return [int(((tweet_years == year) & has_sentiment).sum()) for year in years]


positive_tweets = _count_per_year('Positive')
negative_tweets = _count_per_year('Negative')
neutral_tweets = _count_per_year('Neutral')

plt.xlabel('Years')
plt.ylabel('Tweets')
# Title previously read "Increase in Increase in ..." (duplicated words).
plt.title('Increase in #climatechange and #globalwarming Tweets')
plt.plot(years, positive_tweets, label='Positive Tweets')
plt.plot(years, negative_tweets, label='Negative Tweets')
plt.plot(years, neutral_tweets, label='Neutral Tweets')

plt.legend()

In [None]:
# Pie chart: share of each sentiment label across all tweets
pie_labels = ['Positive', 'Neutral', 'Negative']
sentiment_column = tweets_df['Sentiment']
pie_values = [int((sentiment_column == label).sum()) for label in pie_labels]

plt.pie(
    pie_values,
    labels=pie_labels,
    radius=1.5,
    autopct='%0.2f%%',
    shadow=True,
    colors=['#ff9999', '#66b3ff', '#99ff99'],
)

In [None]:
# Tokenisation of tweets and removal of stop words

# Add the Twitter-specific noise tokens only if they are missing, so re-running
# this cell does not keep appending duplicates to the shared list.
for extra_word in ('rt', 'amp'):
    if extra_word not in stop_words:
        stop_words.append(extra_word)

# A single tokenizer instance is reused for every tweet.
_tweet_tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)


def tokenize_tweet(tweet, stop_word_set=None):
    """Tokenise a tweet and drop stop words.

    tweet: the tweet text (str).
    stop_word_set: optional collection of lower-case stop words to filter out;
        when omitted, a set is built from the module-level `stop_words` list
        at call time.
    Returns the list of remaining tokens.
    """
    if stop_word_set is None:
        stop_word_set = set(stop_words)
    token_list = _tweet_tokenizer.tokenize(tweet)
    return [w for w in token_list if w.lower() not in stop_word_set]


# Build the lookup set once for the whole column instead of scanning the
# stop-word list for every token.
_stop_word_set = set(stop_words)
tweets_df['Tokenised'] = tweets_df['content'].apply(lambda text: tokenize_tweet(text, _stop_word_set))

In [None]:
def tweet_token_cloud(tweet_list):
    """Flatten token lists into one lower-cased, space-separated string.

    tweet_list: iterable of token sequences (one sequence of str per tweet).
    Returns a single string in which every tweet's tokens are lower-cased,
    joined by single spaces and followed by one trailing space.

    The input sequences are not modified; lower-casing is applied to copies of
    the tokens while the text is built.
    """
    return "".join(
        " ".join(token.lower() for token in tokens) + " "
        for tokens in tweet_list
    )


In [None]:
# Word cloud built from the tokens of every tweet.
# NOTE(review): rebinding `stopwords` replaces the nltk `stopwords` object
# imported in the import cell with a plain set, so the earlier
# `stopwords.words('english')` cell cannot be re-run after this one without
# re-running the imports.
stopwords = set(STOPWORDS) | {'rt'}

all_tweets_text = tweet_token_cloud(tweets_df['Tokenised'])
cloud_builder = WordCloud(
    width=1600,
    height=800,
    background_color='black',
    stopwords=stopwords,
    min_font_size=15,
)
wordcloud = cloud_builder.generate(all_tweets_text)

# Draw the rendered cloud on a borderless 16x8 figure
fig, ax = plt.subplots(figsize=(16, 8), facecolor=None)
ax.imshow(wordcloud)
ax.axis("off")
fig.tight_layout(pad=0)

plt.show()

In [None]:
# Map every distinct token to an integer id.
tokenised_docs = tweets_df['Tokenised']
id2word = corpora.Dictionary(tokenised_docs)

# Bag-of-words representation of each tweet
corpus = list(map(id2word.doc2bow, tokenised_docs))

In [None]:
# LDA model training

from pprint import pprint

# Number of topics to fit.
NUM_TOPICS = 6
# Fixed seed so repeated runs start from the same random state.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs - confirm if exact reproducibility is required.
LDA_SEED = 42

lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=NUM_TOPICS,
                                       random_state=LDA_SEED)


In [None]:
# These two modules are already imported in the import cell near the top; the
# re-imports here are redundant.
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
# NOTE(review): pickle is not used in this cell.
import pickle 

# Switch pyLDAvis to notebook display mode.
pyLDAvis.enable_notebook()
# Kept as the cell's last expression so the prepared visualisation is shown as the cell output.
gensimvis.prepare(lda_model, corpus, id2word)

In [None]:
# Topic modelling inputs restricted to the tweets labelled 'Negative'
negative_tokens = tweets_df.loc[tweets_df['Sentiment'] == 'Negative', 'Tokenised']
id2word_negative = corpora.Dictionary(negative_tokens)

# Bag-of-words representation of each negative tweet
corpus_negative = list(map(id2word_negative.doc2bow, negative_tokens))

In [None]:
from collections import OrderedDict
def get_doc_topic_dist(model, corpus, kwords=False):
    '''
    Project every document of `corpus` into the model's topic space.

    model: the LDA model; `model[doc]` must yield (topic_id, weight) pairs.
        Topics missing from that result are given weight 0.
    corpus: iterable of documents in the form the model accepts
    kwords: if True, also collect the index of the heaviest topic per document

    Returns (top_dist, keys):
        top_dist: array of shape (n_docs, n_topics) holding the topic weights
        keys: list with the dominant topic index of each document when
              `kwords` is True, otherwise an empty list
    '''
    # Size the rows from the model itself; fall back to the previously
    # hard-coded 6 for model objects that do not expose `num_topics`.
    num_topics = getattr(model, 'num_topics', 6)

    rows = []
    keys = []
    for doc in corpus:
        weights = np.zeros(num_topics)
        for topic_id, weight in model[doc]:
            weights[topic_id] = weight
        rows.append(weights)
        if kwords:
            keys.append(int(weights.argmax()))

    # Keep a consistent 2-D shape even when the corpus is empty.
    top_dist = np.asarray(rows) if rows else np.zeros((0, num_topics))
    return top_dist, keys

In [None]:
top_dist, lda_keys = get_doc_topic_dist(lda_model, corpus, True)
# Assign the list positionally. `corpus` was built row by row from tweets_df,
# so the i-th key belongs to the i-th row. Wrapping the keys in a new DataFrame
# would give them a fresh 0..n-1 index and align them by label against
# tweets_df's post-dropna index, mismatching topics and leaving NaN.
tweets_df['Topic'] = lda_keys