## <span style='color:green'> Task 3 : Data Exploration & Preprocessing, Topic Modeling & Sentiment Analysis</span>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

In [None]:
# read the csv file generated from clean_tweets_dataframe.py
tweets_df = pd.read_csv("data/clean_processed_tweet_data.csv")

# 3.1 Data Exploration

In [None]:
# display the first 5 rows from our dataset
tweets_df.head()

In [None]:
# display dataframe information
tweets_df.info()

In [None]:
# check null values
tweets_df.isna()

In [None]:
# shape of the dataframe
tweets_df.shape

In [None]:
# show columns of the dataframe
tweets_df.columns

In [None]:
# drop empty values
tweets_df.dropna()

In [None]:
# tweets_df["source"] = tweets_df["source"].str.replace(r"(\s*\<.*?\>\s*)", " ").str.strip()
#tweets_df['Text'] = tweets_df['source'].str.replace(r"\<.*\>?","")
tweets_df['Text'] = tweets_df['Text'].str.replace(r"\(<^()>*\)", "", regex=True)

In [None]:
tweets_df

In [None]:
# Visualize polarity column using piechart and barchart
# Pie chart, where the slices will be ordered and plotted counter-clockwise:
labels = ['neutral', 'positive', 'negative']

neutral_count =  len(tweets_df[tweets_df['polarity'] > 0])
positive_count = len(tweets_df[tweets_df['polarity'] == 0])
negative_count = len(tweets_df[tweets_df['polarity'] <0])
sizes = [neutral_count, positive_count, negative_count]
# Create a figure for 2 subplots (1 row, 2 columns)
fig, ax = plt.subplots(1, 2, figsize = (10,4))

# Create a bar plot of score column
ax[0].bar(x=labels, height=[neutral_count, positive_count, negative_count], color='orange')
ax[0].set_title('Barchart of score column')
ax[0].set_xticklabels(labels, rotation=90)

# Create a pie chart of score column based on neutral, positive or negative

ax[1].pie(sizes,labels=labels, autopct='%1.1f%%',
        shadow=True, startangle=90)
ax[1].set_title('Piechart of score column')
ax[1].legend(labels)

# Add a title to the Figure
fig.suptitle('Score column plots')

# Show the figure
fig.show()

# 3.2 Data Preprocessing

In [None]:
# add new column named clean_text to store cleaned original text
tweets_df.insert(4,column = 'clean_text',value = tweets_df['original_text'])

In [None]:
tweets_df.head()

In [None]:
# add new column named sentimnt to where the text is positive,negative or neutral
# tweets_df.insert(7,column = 'sentiment',value = tweets_df['polarity'])
tweets_df

In [None]:
# 
tweets_df = tweets_df[['original_text','clean_text','polarity']]

In [None]:
def text_category (polarity):
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    else:
        return 'neutral'

In [None]:
score= pd.Series([text_category(row_value) for row_value in tweets_df['polarity']])
tweets_df = pd.concat([tweets_df, score.rename('sentiment')], axis=1)
tweets_df.head()

In [None]:
#pip install gensim

In [None]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import gensim
from gensim.models import CoherenceModel
from gensim import corpora
from pprint import pprint
import string
import re
import emoji
import nltk
#nltk.download('words')
words = set(nltk.corpus.words.words())

In [None]:

def cleaner(tweet):
    tweet = re.sub("@[A-Za-z0-9]+","",tweet) #Remove @ sign
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet) #Remove http links
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI) #Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ") #Remove hashtag sign but keep the text
    tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
         if w.lower() in words or not w.isalpha())
    return tweet

tweets_df['clean_text'] = tweets_df['original_text'].map(lambda x: cleaner(x))
tweets_df.head()

In [None]:
class PrepareData:
  def __init__(self,df):
    self.df=df
    
  def preprocess_data(self):
    #tweets_df = self.df.loc[self.df['lang'] =="en"]

    
    #text Preprocessing
    tweets_df['clean_text']=tweets_df['clean_text'].astype(str)
    tweets_df['clean_text'] = tweets_df['clean_text'].apply(lambda x: x.lower())
    tweets_df['clean_text']= tweets_df['clean_text'].apply(lambda x: x.translate(str.maketrans(' ', ' ', string.punctuation)))
    
    #Converting tweets to list of words For feature engineering
    sentence_list = [tweet for tweet in tweets_df['clean_text']]
    word_list = [sent.split() for sent in sentence_list]
    # print(word_list)

    #Create dictionary which contains Id and word 
    word_to_id = corpora.Dictionary(word_list) #generate unique tokens
    #  we can see the word to unique integer mapping
    # print(word_to_id.token2id)
    # using bag of words(bow), we create a corpus that contains the word id and its frequency in each document.
    corpus_1= [word_to_id.doc2bow(tweet) for tweet in word_list]
    # TFIDF

    return tweets_df['clean_text'],word_list, word_to_id, corpus_1

In [None]:
PrepareData_obj=PrepareData(tweets_df)
tweets_df['clean_text'],word_list ,id2word,corpus=PrepareData_obj.preprocess_data()

In [None]:
#print(corpus)
tweets_df

In [None]:
id_words = [[(id2word[id], count) for id, count in line] for line in corpus]

In [None]:
# print(id_words)

# 3.3 Topic Modelling using Latent Dirichlet Allocation(LDA)
#### The purpose of LDA is mapping each teweets in our corpus to a set of topics which covers a good deal of the words in the tweet


In [311]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [312]:
pprint(lda_model.print_topics())

[(0,
  '0.072*"the" + 0.052*"is" + 0.042*"china" + 0.033*"this" + 0.032*"in" + '
  '0.028*"of" + 0.028*"s" + 0.026*"…" + 0.024*"that" + 0.022*"on"'),
 (1,
  '0.053*"to" + 0.037*"…" + 0.036*"the" + 0.030*"it" + 0.024*"of" + 0.023*"s" '
  '+ 0.022*"not" + 0.018*"island" + 0.018*"in" + 0.017*"we"'),
 (2,
  '0.069*"on" + 0.043*"china" + 0.042*"’" + 0.039*"s" + 0.036*"of" + 0.036*"a" '
  '+ 0.035*"…" + 0.024*"missile" + 0.023*"military" + 0.021*"an"'),
 (3,
  '0.042*"part" + 0.038*"out" + 0.033*"speaker" + 0.032*"t" + 0.030*"’" + '
  '0.028*"for" + 0.027*"their" + 0.023*"…" + 0.022*"5" + 0.020*"and"'),
 (4,
  '0.075*"the" + 0.062*"to" + 0.042*"…" + 0.041*"s" + 0.038*"china" + '
  '0.035*"and" + 0.029*"us" + 0.024*"visit" + 0.022*"a" + 0.021*"of"')]


In [313]:
pprint(lda_model.show_topics(formatted=False))

[(0,
  [('the', 0.0720394),
   ('is', 0.051672734),
   ('china', 0.04224037),
   ('this', 0.033075012),
   ('in', 0.032355797),
   ('of', 0.027817756),
   ('s', 0.027659101),
   ('…', 0.026197946),
   ('that', 0.023988543),
   ('on', 0.021984821)]),
 (1,
  [('to', 0.052805908),
   ('…', 0.03654342),
   ('the', 0.03555156),
   ('it', 0.030056885),
   ('of', 0.024076967),
   ('s', 0.02344202),
   ('not', 0.022206457),
   ('island', 0.018373588),
   ('in', 0.017870707),
   ('we', 0.016598973)]),
 (2,
  [('on', 0.06912865),
   ('china', 0.043411385),
   ('’', 0.04203578),
   ('s', 0.039337996),
   ('of', 0.035956223),
   ('a', 0.03587063),
   ('…', 0.035150617),
   ('missile', 0.024428003),
   ('military', 0.023469845),
   ('an', 0.020680407)]),
 (3,
  [('part', 0.041534945),
   ('out', 0.038097065),
   ('speaker', 0.03316909),
   ('t', 0.031809844),
   ('’', 0.029987387),
   ('for', 0.027728545),
   ('their', 0.026711373),
   ('…', 0.02333698),
   ('5', 0.02189074),
   ('and', 0.019878184

### Model Analysis 

In [None]:
# # Compute Perplexity

# #It's a measure of how good the model is. The lower the better. Perplexity is a negative value
# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  
# doc_lda = lda_model[corpus]


# # Compute Coherence Score
# coherence_model_lda = CoherenceModel(model=lda_model, texts=word_list, dictionary=id2word, coherence='c_v')
# coherence_lda = coherence_model_lda.get_coherence()
# print('\n Ldamodel Coherence Score/Accuracy on Tweets: ', coherence_lda)

In [None]:
# %pip install pyLDAvis 

In [None]:
import pyLDAvis.gensim_models as gensimvis
import pickle 
import pyLDAvis
# Visualize the topics
pyLDAvis.enable_notebook()

LDAvis_prepared = gensimvis.prepare(lda_model, corpus, id2word)
LDAvis_prepared

In [None]:
# !pip install wordcloud

In [None]:
import sys
print(sys.executable)
from wordcloud import WordCloud

In [None]:
long_string = ','.join(list(tweets_df['clean_text'].values))

wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

wordcloud.generate(long_string)

# Visualize the word cloud
wordcloud.to_image()

# 3.4 Sentiment Analysis

In [None]:
import numpy as np 
import pandas as pd 
import re
import nltk 
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# # plot params
# plot_size = plt.rcParams["figure.figsize"] 
# plot_size[0] = 8
# plot_size[1] = 6
# plt.rcParams["figure.figsize"] = plot_size 

In [None]:
# # number of tweets for each airline
# sns.set(rc={'figure.figsize':(14,10)})
# tweets_df.polarity.value_counts().plot(kind='pie', autopct='%1.0f%%')

In [None]:
# # Distribution of sentiments across all the tweets.
# sns.set(rc={'figure.figsize':(14,10)})
# tweets_df.sentiment.value_counts().plot(kind='pie', autopct='%1.0f%%', colors=["red", "yellow", "green"])

In [None]:
# sns.set(rc={'figure.figsize':(12,8)})
# sentiment = tweets_df.groupby(['Polarity', 'Sentiment']).sentiment.count().unstack()
# sentiment.plot(kind='bar')

In [None]:
# # Selecting the feature and the label
# features = tweets_df.iloc[:, 10].values
# labels = tweets_df.iloc[:, 1].values

In [None]:
# # Data Cleaning using regular expression
# processed_features = []

# for sentence in range(0, len(features)):
#     # Remove all the special characters
#     processed_feature = re.sub(r'\W', ' ', str(features[sentence]))

#     # remove all single characters
#     processed_feature= re.sub(r'\s+[a-zA-Z]\s+', ' ', processed_feature)

#     # Remove single characters from the start
#     processed_feature = re.sub(r'\^[a-zA-Z]\s+', ' ', processed_feature) 

#     # Substituting multiple spaces with single space
#     processed_feature = re.sub(r'\s+', ' ', processed_feature, flags=re.I)

#     # Removing prefixed 'b'
#     processed_feature = re.sub(r'^b\s+', '', processed_feature)

#     # Converting to Lowercase
#     processed_feature = processed_feature.lower()

#     processed_features.append(processed_feature)

In [None]:
# from sklearn.linear_model import SGDClassifier
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import TfidfVectorizer

# vectorizer = TfidfVectorizer (max_features=2500, min_df=7, max_df=0.8, stop_words=stopwords.words('english'))
# processed_features_vectorized = vectorizer.fit_transform(processed_features).toarray()

In [None]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(processed_features_vectorized, labels, test_size=0.2, random_state=0)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# text_classifier = RandomForestClassifier(n_estimators=200, random_state=0)
# text_classifier.fit(X_train, y_train)

In [None]:
# predictions = text_classifier.predict(X_test)

In [None]:
# predictions[:10]

In [None]:
# from sklearn.metrics import accuracy_score

# print(accuracy_score(y_test, predictions))

In [None]:
X = tweets_df['clean_text']
y = tweets_df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
print(X_train.size)
print(X_test.size)
print(y_train.size)
print(y_test.size)

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import plot_confusion_matrix

In [None]:
# scaling the input
clf = make_pipeline(StandardScaler(), SGDClassifier(max_iter=1000, tol=1e-2))

In [None]:
count_vect = CountVectorizer(1,2)
#X_train = X_train.replace(np.nan, '', regex=True)
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts = X_train_counts.toarray()

In [None]:
clf.fit(X_train_counts, y_train)

In [None]:
X_test = X_test.replace(np.nan, '', regex=True)
# use transform not fit_transform
X_test_counts = count_vect.transform(X_test)
X_test_counts = X_test_counts.toarray()
# prediction = clf.prevaluedict(X_test_counts)
prediction = clf.predict(X_test_counts)

In [None]:
# making prediction
prediction = clf.predict(X_test_counts)

In [None]:
np.mean(prediction == y_test)

In [None]:
# plot_confusion_matrix(clf, X_test, y_test, display_labels = ['negative', 'positive','neutral'])
# plt.suptitle('Confusion Matrix')
# plt.show()