# Sentiment Analysis with Keywords identification

### Libraries used for Sentiment Analysis

In [14]:
import pandas as pd
import numpy as np

import textblob
import gensim
from gensim.models.ldamulticore import LdaMulticore
from gensim import corpora, models
from gensim.summarization.summarizer import summarize
from gensim.summarization import keywords

import pyLDAvis.gensim #LDA visualization library
import matplotlib.pyplot as plt
import string

from textblob import TextBlob

from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from pprint import pprint

### Functions definitions

In [15]:
## Function1 START
## Read any EXCEL file, provided the path and sheet name
def read_data_excel(path, sheet):
    """Load one worksheet of an Excel workbook and print a short summary.

    Args:
        path: Location of the workbook, handed to ``pd.read_excel``.
        sheet: Worksheet selector, handed to ``pd.read_excel`` as
            ``sheet_name``.

    Returns:
        Whatever ``pd.read_excel`` returned for the requested sheet.
    """
    frame = pd.read_excel(path, sheet_name=sheet)

    # Console summary so the user can sanity-check what was loaded.
    print("****Details of the excel sheet below*****")
    row_total = len(frame)
    print("Total Number of Rows in the excel dataset: ", row_total)
    print("Column names in the excel Dataset: ", frame.columns)

    return frame
## Function1 END


## Function1.1 START
## Read any CSV file, provided the path
def read_data_csv(path):
    """Read a CSV file into a DataFrame, skipping malformed rows.

    Rows with too many fields are skipped (and reported by pandas) instead of
    aborting the whole load.

    Args:
        path: Location of the CSV file, handed to ``pd.read_csv``.

    Returns:
        The loaded ``DataFrame``.
    """
    import inspect  # local import: only needed for the pandas version check

    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
    # its replacement is ``on_bad_lines``. "warn" mirrors the old behaviour
    # of ``error_bad_lines=False`` (skip the row, but report it).
    if "on_bad_lines" in inspect.signature(pd.read_csv).parameters:
        df = pd.read_csv(path, on_bad_lines="warn")
    else:
        df = pd.read_csv(path, error_bad_lines=False)

    print("****Details of the CSV below*****")
    print("Total Number of Rows in the CSV dataset: ", len(df))
    print("Column names in the CSV Dataset: ", df.columns)
    return df
## Function1.1 END

## Function2 START
## Preprocessing of Data - conversion to lower case, 
## removing trailing and leading spaces, removing NULLs
def preprocessing_data1(dataset_name):
    """Drop rows without a comment, then lower-case and trim every column.

    Args:
        dataset_name: DataFrame that contains a ``Comment_Made`` column.

    Returns:
        A new DataFrame in which rows with a null ``Comment_Made`` are
        removed and every column is converted to lower-cased, stripped
        strings. The original row labels of the kept rows are preserved.
    """
    # The null filter must run BEFORE the string conversion: once a column is
    # cast with astype(str), NaN/None turn into the literal text "nan"/"none"
    # and are no longer detected as null.
    with_comment = dataset_name[dataset_name['Comment_Made'].notnull()]

    return with_comment.apply(
        lambda column: column.astype(str).str.lower().str.strip()
    )
## Function2 END

## Function3 START
## Preprocessing of Data - removing stopwords and special characters
def preprocessing_data2(dataset_name):
    """Remove stopwords and punctuation tokens from every comment.

    Adds two columns to ``dataset_name`` (in place) and returns it:

    * ``clean_comment`` - list of the tokens that were kept.
    * ``clean_comment_statement`` - the kept tokens joined by single spaces.

    Args:
        dataset_name: DataFrame with a string ``Comment_Made`` column.

    Returns:
        The same DataFrame object with the two columns added.
    """
    # NLTK English stopwords plus analysis-specific extras; a set gives
    # constant-time membership tests inside the token loop.
    stopword_set = set(stopwords.words("english"))
    stopword_set.update(['something', 'anything'])

    kept_tokens = []
    for comment in dataset_name['Comment_Made']:
        kept_tokens.append([
            word for word in TextBlob(comment).words
            if word not in stopword_set and word not in string.punctuation
        ])

    # Whole-column assignment instead of ``df[col][i] = value``: chained
    # per-cell assignment may write to a temporary copy (and does nothing
    # under pandas Copy-on-Write). It also guarantees that comments without
    # any token get an empty list rather than keeping a placeholder "".
    row_labels = dataset_name.index
    dataset_name['clean_comment'] = pd.Series(
        kept_tokens, index=row_labels, dtype=object
    )
    dataset_name['clean_comment_statement'] = pd.Series(
        [" ".join(tokens) for tokens in kept_tokens],
        index=row_labels,
        dtype=object,
    )
    return dataset_name
## Function3 END


## Function3.1 START
## Stemming of Data
def preprocessing_data3(dataset_name):
    """Stem the cleaned comments with the Porter stemmer.

    Adds two columns to ``dataset_name`` (in place) and returns it:

    * ``Stemming_words`` - list of stemmed tokens of each cleaned comment.
    * ``Stemming_words_statements`` - the stems joined by single spaces.

    Args:
        dataset_name: DataFrame with a string ``clean_comment_statement``
            column.

    Returns:
        The same DataFrame object with the two columns added.
    """
    ps = PorterStemmer()

    stem_lists = [
        [ps.stem(token) for token in word_tokenize(statement)]
        for statement in dataset_name['clean_comment_statement']
    ]

    # Whole-column assignment instead of ``df[col][i] = value``: chained
    # per-cell assignment may write to a temporary copy (and does nothing
    # under pandas Copy-on-Write). Statements without tokens now get an
    # empty list rather than keeping a placeholder "".
    row_labels = dataset_name.index
    dataset_name['Stemming_words'] = pd.Series(
        stem_lists, index=row_labels, dtype=object
    )
    dataset_name['Stemming_words_statements'] = pd.Series(
        [" ".join(stems) for stems in stem_lists],
        index=row_labels,
        dtype=object,
    )
    return dataset_name
## Function3.1 END


## Function4 START
## Sentiment Analysis using polarity
def sentiment_analysis(dataset_name):
    """Score every cleaned comment with TextBlob and categorise its polarity.

    Adds four columns to ``dataset_name`` (in place), prints the mean
    polarity and returns the DataFrame:

    * ``Polarity`` - ``TextBlob.polarity`` of the cleaned comment.
    * ``Sentiment`` - ``TextBlob.sentiment`` of the cleaned comment.
    * ``Subjectivity`` - ``TextBlob.subjectivity`` of the cleaned comment.
    * ``Polarity_cat`` - "Negative" (< 0), "Positive" (> 0) or "Neutral" (== 0).

    Args:
        dataset_name: DataFrame with a string ``clean_comment_statement``
            column.

    Returns:
        The same DataFrame object with the four columns added.
    """
    polarities = []
    sentiments = []
    subjectivities = []
    categories = []

    for statement in dataset_name['clean_comment_statement']:
        blob = TextBlob(statement)
        polarity = blob.polarity

        if polarity < 0:
            category = "Negative"
        elif polarity > 0:
            category = "Positive"
        elif polarity == 0:
            category = "Neutral"
        else:
            # Not comparable (e.g. NaN): leave the category blank.
            category = ""

        polarities.append(polarity)
        sentiments.append(blob.sentiment)
        subjectivities.append(blob.subjectivity)
        categories.append(category)

    # Whole-column assignment instead of ``df[col][i] = value``: chained
    # per-cell assignment may write to a temporary copy (and does nothing
    # under pandas Copy-on-Write). Polarity/Subjectivity also end up as
    # numeric columns instead of object columns seeded with "".
    row_labels = dataset_name.index
    dataset_name['Polarity'] = pd.Series(
        polarities, index=row_labels, dtype=float
    )
    dataset_name['Sentiment'] = pd.Series(
        sentiments, index=row_labels, dtype=object
    )
    dataset_name['Subjectivity'] = pd.Series(
        subjectivities, index=row_labels, dtype=float
    )
    dataset_name['Polarity_cat'] = pd.Series(
        categories, index=row_labels, dtype=object
    )

    print("Average Sentiment of the whole text: ", dataset_name['Polarity'].mean())
    return dataset_name
## Function4 END


## Function5 START
## Creating Word Cloud
def word_cloud_fn(dataset_name):
    """Render a word cloud of the cleaned comments with matplotlib.

    Args:
        dataset_name: DataFrame with a ``clean_comment_statement`` column.

    Returns:
        ``dataset_name`` unchanged, so the call can be chained.
    """
    # Join the actual comment texts. ``str(series)`` would instead yield the
    # Series' printed representation: index numbers, "..." truncation of long
    # columns and the "Name: ..., dtype: ..." footer would all end up in the
    # cloud while most comments would be missing.
    txt = " ".join(dataset_name['clean_comment_statement'].astype(str))

    wordcloud = WordCloud(width=800, height=800,
                          background_color='white',
                          min_font_size=10).generate(txt)

    # Plot the WordCloud image
    plt.figure(figsize=(8, 8), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()
    return dataset_name
## Function5 END


## Function5.1 START
## Feature Extraction
def keyword_extract_fn(dataset_name,
                       write_path_keyword="C:/Users/-----/User Comments/Keyword_found.csv"):
    """Extract keywords from all stemmed comments and save them as CSV.

    All values of ``Stemming_words_statements`` are joined into one text,
    gensim's ``keywords`` is run on it (``words=10``), the result is printed
    and written to ``write_path_keyword`` as a single ``Keywords`` column.

    Args:
        dataset_name: DataFrame with a string ``Stemming_words_statements``
            column.
        write_path_keyword: Destination of the keyword CSV. Defaults to the
            location that used to be hard-coded in this function.

    Returns:
        The list of keyword strings.
    """
    # Converting the whole comments column into a single text
    sentence_text = " ".join(dataset_name['Stemming_words_statements'])

    # ``keywords`` returns one keyword per line; drop empty entries so that
    # an empty result yields [] rather than [''].
    keyword_found = [
        keyword
        for keyword in keywords(sentence_text, words=10).split('\n')
        if keyword
    ]
    print("Keywords found in the whole text: ", keyword_found)

    keyword_found_df = pd.DataFrame(keyword_found, columns=['Keywords'])
    keyword_found_df.to_csv(write_path_keyword, index=False)

    return keyword_found

## Function6 START
## Group by blog name
def group_by_fn(dataset_name):
    """Concatenate the comments of each blog into one text.

    Args:
        dataset_name: DataFrame with ``Blog_Name`` and string ``Comment_Made``
            columns.

    Returns:
        A Series with one entry per distinct concatenated text: the
        space-joined ``Comment_Made`` values of a blog, labelled with the row
        label of that text's first occurrence.
    """
    # transform() broadcasts each blog's joined text back onto all of its rows...
    joined_per_row = dataset_name.groupby("Blog_Name")["Comment_Made"].transform(
        lambda comments: " ".join(comments)
    )
    # ...so the repeats have to be dropped. drop_duplicates must be CALLED;
    # referencing it without parentheses returned the bound method, not data.
    return joined_per_row.drop_duplicates()
## Function6 END

## Function7 START
## Writing the file into a CSV
def write_csv_fn(dataset_name1, write_path):
    """Persist ``dataset_name1`` as a CSV file.

    Args:
        dataset_name1: Object exposing ``to_csv`` (e.g. a DataFrame).
        write_path: Destination handed to ``to_csv``.
    """
    # Default to_csv options are used, so the index is written as well.
    dataset_name1.to_csv(path_or_buf=write_path)
## Function7 END

# Input/output locations used by the driver code below.
# `path` is the comments CSV that gets loaded; `sheet` is only needed when the
# Excel reader is used instead of the CSV reader.
path="C:/Users/------/User_comments_on_leader_posts Aug10.csv"
sheet="Sheet1"
# Destination CSV for the processed comments.
write_path="C:/Users/-----/User Comments/Sentiment_analysis2.csv"

### The 'main()' function and function calls

In [17]:
if __name__=='__main__':
    # Driver: runs the whole pipeline on the CSV at `path` and writes the
    # processed comments to `write_path`.

    #Function1 CALL (disabled - Excel input; the CSV reader below is used instead)
    #comments_data=read_data_excel(path, sheet)
    
    #Function1.1 CALL - load the raw comments
    comments_data=read_data_csv(path)
    
    #Function2 CALL - lower-case / strip all columns, filter on Comment_Made
    comments_data_processed=preprocessing_data1(comments_data)
    
    #Function3 CALL - adds 'clean_comment' and 'clean_comment_statement'
    #print(comments_data_processed.apply(lambda x: x.astype(str)))
    comments_data_processed=preprocessing_data2(comments_data_processed)

    #Function3.1 CALL - adds 'Stemming_words' and 'Stemming_words_statements'
    comments_data_processed=preprocessing_data3(comments_data_processed)
    
    #Function4 CALL - adds the polarity / sentiment / subjectivity columns
    comments_data_processed=sentiment_analysis(comments_data_processed)
    
    #Function5 CALL (disabled - word cloud plot)
    #word_cloud_fn(comments_data_processed)
    
    #Function5.1 CALL - prints the keywords and writes them to their own CSV
    keyword_found=keyword_extract_fn(comments_data_processed)
    
    #Function6 CALL
    # NOTE(review): grouped_data_processed is not used again in this cell -
    # only comments_data_processed is written out below.
    grouped_data_processed=group_by_fn(comments_data_processed)
    
    #Function7 CALL - write the per-comment results to `write_path`
    write_csv_fn(comments_data_processed, write_path)

****Details of the CSV below*****
Total Number of Rows in the CSV dataset:  120
Column names in the CSV Dataset:  Index(['Blog_Name', ' Commented_On_Date', 'Comment_Made', 'Commented_By_TM'], dtype='object')
Average Sentiment of the whole text:  0.4188396473756321
Keywords found in the whole text:  ['love', 'awesom', 'news', 'new', 'team', 'thank', 'great', 'detroit', 'solar', 'compani', 'help']
