In [1]:
import pandas as pd
import re
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import lazypredict
from sklearn import metrics
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import precision_score, f1_score, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import text 
from nltk.corpus import stopwords
from wordcloud import WordCloud
from textblob import TextBlob 

# Fetch the NLTK resources used by this notebook (same three, same order).
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

ModuleNotFoundError: No module named 'sklearn.utils.testing'

In [None]:
import sklearn

# Print the name of every scikit-learn estimator class that exposes predict_proba.
estimators = sklearn.utils.all_estimators(type_filter=None)
for name in (est_name for est_name, est_class in estimators
             if hasattr(est_class, 'predict_proba')):
    print(name)


In [None]:
# Load the review export, using its 'Unnamed: 0' column as the row index.
df = pd.read_csv(
    'Vault_diversity_reviews - Vault_diversity_reviews.csv',
    index_col='Unnamed: 0',
)
df.head()

In [None]:
# Remove the columns that the rest of the notebook does not use.
df = df.drop(columns=['company_rating', 'title', 'author', 'year'])

# DataClean

In [None]:
def clean_text(text):
    '''
    Normalise a raw review string before vectorisation.

    Steps, in order:
      1. lowercase the text;
      2. replace every http(s) URL with a space;
      3. replace every punctuation character with a space;
      4. replace every token that contains a digit with a space.

    URLs are stripped *before* punctuation: once "://" and "." have been
    replaced by spaces a URL can no longer be recognised, which is why the
    previous ordering never removed any.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text. Runs of spaces are left in place; the vectorizers
        used later split on whitespace anyway.
    '''
    text = text.lower()
    text = re.sub(r'https?://\S+', ' ', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    # raw string: '\w' / '\d' are regex classes, not Python string escapes
    text = re.sub(r'\w*\d\w*', ' ', text)

    return text

# Alias kept because later cells call `re_clean`.
re_clean = clean_text

In [None]:
# Store the cleaned review text in a new column next to the raw content.
df['clean_review'] = df['content'].apply(re_clean)
df

In [None]:
# Extend scikit-learn's built-in English stop words with a few extra terms.
add_stop_words = ['and', 'to', 'that','the','in','with','firm']
stop_words = text.ENGLISH_STOP_WORDS | frozenset(add_stop_words)

In [None]:
# Count-vectorize every review (unigrams and bigrams) into a document-term matrix.
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a list or
# None, so the stop-word collection is passed as a list.
cv = CountVectorizer(stop_words=list(stop_words), ngram_range = (1,2))
data_cv = cv.fit_transform(df.clean_review)
# get_feature_names() was removed in scikit-learn 1.2; use it only as a fallback
# on releases that predate get_feature_names_out().
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = df.index
data_dtm

In [None]:
# Function to create dictionaries with column names
def top_words(column_name):
    '''
    Concatenate, per company, every value of ``column_name`` in the global ``df``.

    Parameters
    ----------
    column_name : str
        Name of a string column of ``df``.

    Returns
    -------
    dict
        Maps each value of ``df['company'].unique()`` (in that order) to one
        string made of that company's entries, each followed by a single space.
    '''
    dictionary = {}
    for brand in df['company'].unique():
        # A boolean mask selects rows by position, so this also works when the
        # index contains repeated labels (a label lookup would return a Series).
        reviews = df.loc[df['company'] == brand, column_name]
        # join once instead of growing the string with += inside a loop
        dictionary[brand] = ''.join(review + ' ' for review in reviews)
    return dictionary

# We are going to change this to key: brand, value: string format
def combine_text(list_of_text):
    '''Join the given pieces of text, with no separator, into a single string.'''
    return str().join(list_of_text)

# We can either keep it in dictionary format or put it into a pandas dataframe
def dict_to_dataframe(data_combined, column_name):
    '''
    Turn a ``{key: [value]}`` mapping into a one-column frame indexed by key.

    The dictionary keys become the (sorted) index and the single column is
    named ``column_name``.
    '''
    frame = pd.DataFrame(data_combined).T
    frame.columns = [column_name]
    return frame.sort_index()

# Find the top 30 words said by each company
def top_30_words(tk_dataframe):
    '''
    For every column, list its 30 largest entries as ``(row label, value)`` pairs.

    Returns a dict keyed by column name; each list is ordered from the highest
    value downwards.
    '''
    def _ranked(label):
        ranked = tk_dataframe[label].sort_values(ascending=False).head(30)
        return list(zip(ranked.index, ranked.values))

    return {label: _ranked(label) for label in tk_dataframe.columns}

In [None]:
# Combine each company's cleaned reviews into a single document per company.
dict_review = top_words('clean_review')
data_combined_review = {}
for key, value in dict_review.items():
    data_combined_review[key] = [combine_text(value)]
data_company = dict_to_dataframe(data_combined_review, 'reviews')

In [None]:
# Display the combined-reviews frame built in the previous cell.
data_company

# EDA

In [None]:
# Instantiate a bigram-only CountVectorizer.
# scikit-learn >= 1.2 validates `stop_words` and accepts only 'english', a list or
# None, so the stop-word collection is passed as a list.
cv = CountVectorizer(stop_words=list(stop_words), ngram_range = (2,2))

# Fit and transform the per-company documents (built from the cleaned reviews)
data_review_cv = cv.fit_transform(data_company.reviews)
# get_feature_names() was removed in scikit-learn 1.2; use it only as a fallback
# on releases that predate get_feature_names_out().
feature_names = (cv.get_feature_names_out()
                 if hasattr(cv, 'get_feature_names_out')
                 else cv.get_feature_names())
tk_reviews = pd.DataFrame(data_review_cv.toarray(), columns = feature_names)
tk_reviews.index = data_company.index

# Transpose so that rows are bigrams and columns are companies
tk_reviews = tk_reviews.transpose()

# Applying top_30_words function
top_words_reviews = top_30_words(tk_reviews)

In [None]:
# Print the top 15 words
def top_15_words(top_words_dict):
    '''
    Print, for each key, its name followed by its first 15 terms.

    Parameters
    ----------
    top_words_dict : dict
        Maps a name to a list of ``(term, count)`` pairs, e.g. the output of
        ``top_30_words``. Lists shorter than 15 are printed in full.
    '''
    for brand, ranked_terms in top_words_dict.items():
        print(brand)
        # [:15] keeps 15 entries; the former [0:14] slice stopped one short
        print(', '.join(word for word, _count in ranked_terms[:15]) + '\n')

In [None]:
# Print a heading, then the top terms of each company.
print('Top Words Per company\n')
top_15_words(top_words_reviews)

In [None]:
# Draw one word cloud per company and save the whole grid to 'mytable.png'.
wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

plt.rcParams['figure.figsize'] = [50, 50]

brands = list(top_words_reviews.keys())
# Keep the original 4x4 layout, but add rows when there are more than 16 companies.
n_cols = 4
n_rows = max(4, -(-len(brands) // n_cols))
for index, brand in enumerate(brands):
    # Generate from the current company's text; this used to read brands[0],
    # so every panel showed the first company's cloud.
    wc.generate(data_company.reviews[brand])
    plt.subplot(n_rows, n_cols, index + 1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(brand, fontsize=22)
# Save once, after every panel has been drawn, instead of on each iteration.
plt.savefig('mytable.png')
plt.show()



# TextBlobSentiment

In [None]:
# Create a function to get subjectivity
def getSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.subjectivity

# Create a function to get polarity
def getPolarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity

In [None]:
# Score every cleaned review with TextBlob and keep both components as columns.
df['subjectivity_review'] = df['clean_review'].map(getSubjectivity)
df['polarity_review'] = df['clean_review'].map(getPolarity)

In [None]:
# Display the frame, now including the subjectivity and polarity columns.
df

In [None]:
# Derive a text label and a numeric class (0 / 1 / 2) from the TextBlob polarity.
def _polarity_label(score):
    if score < 0:
        return 'Negative Emotion'
    if score == 0:
        return 'Neutral Emotion'
    return 'Positive emotion'


def _polarity_class(score):
    if score < 0:
        return 0
    if score == 0:
        return 1
    return 2


df['emotion_textblob'] = df['polarity_review'].apply(_polarity_label)
df['target'] = df['polarity_review'].apply(_polarity_class)

In [None]:
# Display the frame, now including the emotion_textblob and target columns.
df

In [None]:
#Finding the final sentiment of the company.
def final_sentimet(column_name):
    '''
    Average ``column_name`` per company in the global ``df`` and label the result.

    Parameters
    ----------
    column_name : str
        Name of a numeric column of ``df`` (e.g. a polarity score).

    Returns
    -------
    pandas.DataFrame
        One row per company, in order of first appearance, with columns
        ``company``, ``average_polarity`` and ``company_sentiment``. The label
        is 'Negative Emotion' below zero, 'Neutral Emotion' at exactly zero and
        'Positive emotion' otherwise.

    Notes
    -----
    Rows whose company is missing are left out by the group-by; the previous
    manual loop divided by zero for them. Missing scores are skipped by the
    mean rather than turning the whole average into NaN.
    '''
    def _label(average):
        if average < 0:
            return 'Negative Emotion'
        if average == 0:
            return 'Neutral Emotion'
        return 'Positive emotion'

    # sort=False keeps companies in order of first appearance, as unique() did
    sentiment = df.groupby('company', sort=False)[column_name].mean().reset_index()
    sentiment.columns = ['company', 'average_polarity']
    sentiment['company_sentiment'] = sentiment['average_polarity'].apply(_label)
    return sentiment
    

In [None]:
# Average sentiment polarity per company.
final_sentimet('polarity_review')

# TFIDF 

In [None]:
# Evaluation function

def evaluation(y_true, y_pred):
    """Print the accuracy and the micro-averaged F1 score of ``y_pred`` against ``y_true``."""
    print('Evaluation Metrics:')
    accuracy = metrics.accuracy_score(y_true, y_pred)
    print('Accuracy: ' + str(accuracy))
    micro_f1 = metrics.f1_score(y_true, y_pred, average="micro")
    print('F1 Score: ' + str(micro_f1))

In [None]:
# Features are the cleaned review texts; the target is the polarity-derived class.
X, y = df["clean_review"], df["target"]

In [None]:
# Hold out 20% of the reviews, then fit TF-IDF on the training split only
# and reuse that fitted vocabulary to transform the test split.
tfidf = TfidfVectorizer()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1
)

tfidf_data_train = tfidf.fit_transform(X_train)
tfidf_data_test = tfidf.transform(X_test)


In [None]:
# Logistic regression with balanced class weights and a high iteration cap
# (max_iter=100000) so the solver has room to converge.
log_reg = LogisticRegression(
    class_weight="balanced",
    max_iter=100000,
    random_state=40,
)

In [None]:
# Fit on the training TF-IDF matrix, then predict the held-out reviews.
# fit() returns the estimator itself, so the two calls can be chained.
log_pred = log_reg.fit(tfidf_data_train, y_train).predict(tfidf_data_test)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test split.
from sklearn.metrics import confusion_matrix
import seaborn as sns; sns.set()
plt.rcParams['figure.figsize'] = [5, 5]

# Pin the class order to the `target` encoding (0 = negative, 1 = neutral,
# 2 = positive). Without `labels`, a class missing from both y_test and the
# predictions shrinks the matrix and the three tick labels no longer line up.
class_ids = [0, 1, 2]
class_names = ['Negative', 'Neutral', 'Positive']

mat = confusion_matrix(y_test, log_pred, labels=class_ids)
# Transposed so that rows are predicted classes and columns are true classes,
# matching the axis labels below.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=class_names, yticklabels=class_names)
plt.xlabel('true label')
plt.ylabel('predicted label');

In [None]:
# Final evaluation of the logistic-regression predictions on the test split.
evaluation(y_test, log_pred)