In [None]:
# Execute the mod_twitter_streaming script inside this kernel.
# NOTE(review): presumably this is what produces twitter.json, which is read below — confirm.
%run mod_twitter_streaming

In [None]:
import pandas as pd
import numpy as np

# Allow up to 500 columns in rendered frames so wide tweet records are shown in full.
pd.set_option('display.max_columns', 500)
#pd.set_option('display.max_colwidth', None)     # uncomment to show the full content of every cell

# Load the collected tweets; mod_twitter_streaming must save its output under the name twitter.json.
data = pd.read_json('twitter.json')

In [None]:
# Preview the first rows of the raw tweet table.
data.head()

In [None]:
# Summarise the columns, dtypes and non-null counts of the raw tweet table.
data.info()

In [None]:
# Keep only a handful of fields from the raw tweet records.
twitter = data.loc[
    :,
    [
        'text',
        'source',
        'possibly_sensitive',
        'retweet_count',
        'favorite_count',
        'display_text_range',
        'lang',
    ],
]
twitter

In [None]:
# Per-language count of non-null values in each of the other columns.
# groupby(..., as_index=False).count() already returns a DataFrame with 'lang'
# as an ordinary column, so it does not need to be wrapped in pd.DataFrame again.
language = twitter.groupby('lang', as_index=False).count()
language

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of tweets per language code.
fig, ax = plt.subplots(figsize=(15, 7))
twitter.groupby('lang')['text'].count().plot.bar(ax=ax, color='skyblue')
ax.grid()
ax.set_title('Languages used in Twitter')
ax.set_xlabel('Languages')
ax.set_ylabel('Counts of tweets')


In [None]:
# Work with the tweet text column on its own from here on.
tweets = data['text']
tweets

In [None]:
import re

def remove_features(text):
    """Normalise a raw tweet into lowercase ASCII letters, digits and spaces.

    Steps, in order: strip @mentions, strip the 'RT :' marker that remains once
    the retweeted handle has been removed, strip http(s) links, lowercase, then
    delete every character that is not A-Z, a-z, 0-9 or a space.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. None or NaN for a record
        without text) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text. Whitespace is not collapsed, so leading, trailing or
        doubled spaces can remain where tokens were removed.
    """
    if not isinstance(text, str):
        return ''

    # Raw string literals keep \w and \S as regex escapes rather than
    # invalid Python string escapes.
    text = re.sub(r'(@)\w+', '', text)             # Removing @mentions
    text = re.sub(r'RT :', '', text)               # Removing RT
    text = re.sub(r'https?://\S+', '', text)       # Removing hyperlink
    text = text.lower()                            # Lowercase everything
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)     # Keep alphanumeric and space only

    return text

# Clean every tweet; `tweets` is rebound from the raw text Series to the cleaned one.
tweets = tweets.apply(remove_features)
tweets

In [None]:
# Copy the cleaned tweets out of the Series into a plain Python list (one string per tweet)

all_sentences = list(tweets)

all_sentences

In [None]:
# Flatten the tweets into a single list of whitespace-separated tokens.
lines = [token for sentence in all_sentences for token in sentence.split()]

print(lines)

In [None]:
# Discard empty-string tokens. str.split() with no separator does not produce
# empty strings, so this is a safety net that normally removes nothing.
lines2 = [token for token in lines if token != '']

lines2

In [None]:
# Lemmatize each word to its base (dictionary) form

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# A lemmatizer is used instead of a stemmer because the stemmer removed the final 'e' from many words.
wnl = WordNetLemmatizer()

# lemmatize() is called on every token without a part-of-speech argument.
stem = [wnl.lemmatize(token) for token in lines2]

stem

In [None]:
# Removing stop words

from nltk.corpus import stopwords

# NLTK's English stop-word list, held as a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Keep only the lemmatized tokens that are not stop words.
stem2 = [token for token in stem if token not in stop_words]
stem2

In [None]:
# Frequency of each remaining token. value_counts() already returns the counts
# ordered from most to least frequent, so no further sorting is required.
words = pd.Series(stem2).value_counts()
words

In [None]:
import matplotlib.pyplot as plt; plt.rcdefaults()
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Drop tokens that carry no meaning after lemmatization.
# errors='ignore' stops the cell from raising KeyError when one of these
# tokens does not occur in the current sample of tweets.

words = words.drop(['de', 'amp'], errors='ignore')
words

In [None]:
# Show the 20 most frequently used words

words = words.head(20)
plt.figure(figsize=(10,5))
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally
# and raises TypeError for positional x/y.
sns.barplot(x=words.values, y=words.index, alpha=0.8)
plt.title('Top Words Overall')
plt.ylabel('Word from Tweet', fontsize=12)
plt.xlabel('Count of Words', fontsize=12)
plt.show()

In [None]:
import spacy
nlp = spacy.load("en_core_web_lg")

# Run the spaCy pipeline over the whole corpus as one space-joined string.
# The text is built from lines2 (the tokens before lemmatization and stop-word
# removal). Dedicated names are used so the stem2 token list is not overwritten.
corpus_text = " ".join(lines2)
corpus_doc = nlp(corpus_text)

# One (text, label) pair per named entity found in the corpus.
label = [(ent.text, ent.label_) for ent in corpus_doc.ents]

data_word_entity = pd.DataFrame(label, columns = ['Word','Entity'])

# Keep only entities labelled PERSON. Boolean indexing removes the other rows
# outright instead of leaving all-NaN placeholder rows behind.
data_word_entity = data_word_entity[data_word_entity['Entity'] == 'PERSON']

# Number of occurrences of each PERSON entity, most frequent first.
data_name = data_word_entity['Word'].value_counts()

In [None]:
# Show the 10 most frequently mentioned PERSON entities.
data_name = data_name.head(10)
plt.figure(figsize=(10,5))
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally
# and raises TypeError for positional x/y.
sns.barplot(x=data_name.values, y=data_name.index, alpha=0.8)
plt.title('Top People Mentioned')
plt.ylabel('Word from Tweet', fontsize=12)
plt.xlabel('Count of Words', fontsize=12)
plt.show()

In [None]:
# Display the entity table without any rows that contain missing values
# (the result is only shown, not stored).
data_word_entity.dropna()

The people-mentioned analysis is not meaningful here because many of the tweets are in languages other than English.

In [None]:
from textblob import TextBlob

# Function returning how subjective (opinionated) a text is

def getSubjectivity(text):
    """Return the subjectivity score TextBlob assigns to `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Function returning how positive or negative a text is

def getPolarity(text):
    """Return the polarity score TextBlob assigns to `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity


# Score every raw tweet and store the results in two new columns on `data`:
# 'Subjectivity' and 'Polarity'.

data['Subjectivity'] = data['text'].map(getSubjectivity)
data['Polarity'] = data['text'].map(getPolarity)

# Display the frame, now including the 'Subjectivity' and 'Polarity' columns.
data

In [None]:
# Map a polarity score to a sentiment label: 'Negative', 'Neutral' or 'Positive'

def getAnalysis(score):
    """Label a polarity score by its sign.

    Returns 'Negative' for scores below zero, 'Neutral' for a score of
    exactly zero, and 'Positive' for everything else.
    """
    if score < 0:
        return 'Negative'
    if score == 0:
        return 'Neutral'
    return 'Positive'

# Label each tweet from its polarity score and store the label in a new 'Analysis' column.
data['Analysis'] = data['Polarity'].apply(getAnalysis)
# Show the dataframe
data

In [None]:
# Number of tweets carrying each sentiment label.
data.Analysis.value_counts()

In [None]:
# Plot subjectivity against polarity for every tweet.
# One vectorised scatter call draws all points at once. Calling plt.scatter
# once per row creates a separate artist per tweet and only works when the
# index labels are exactly 0..n-1.
plt.scatter(data["Polarity"], data["Subjectivity"], color = 'Orange')

plt.title('Sentiment Analysis')
plt.grid()
plt.xlabel('Polarity')
plt.ylabel('Subjectivity')
plt.show()

In [None]:
# Plotting and visualizing the counts
ax = data['Analysis'].value_counts().plot(kind = 'bar')
# Title and labels are applied after plotting: the pandas bar plot may set the
# x-axis label from the index name, which would replace a label set beforehand.
ax.set_title('Sentiment Analysis')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Counts')
plt.show()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Word cloud visualization
# Join the cleaned tweets into one string. str(tweets) would only yield the
# Series' truncated printout (index labels, "..." and the dtype footer), so the
# cloud would be built from a few tweets plus display noise.
all_words = ' '.join(tweets)
wordCloud =  WordCloud(width = 500, height = 300, random_state = 21, max_font_size = 110).generate(all_words)

plt.imshow(wordCloud, interpolation = "bilinear")
plt.axis('off')
plt.show()