In [None]:
# more common imports
import pandas as pd
import numpy as np
from collections import Counter
import re
from spellchecker import SpellChecker
import textstat
from textblob import TextBlob

# language processing imports
import nltk
nltk.download('words')
from gensim.corpora import Dictionary
# preprocessing imports
from sklearn.preprocessing import LabelEncoder
nltk.download('averaged_perceptron_tagger')

# model imports
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.word2vec import Word2Vec
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
# hyperparameter training imports
from sklearn.model_selection import GridSearchCV

# visualization imports
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import base64
import io
%matplotlib inline
sns.set()  # defines the style of 'the plots to be seaborn style
words = set(nltk.corpus.words.words())
import concurrent.futures

In [None]:
# Discard the 'Symbol' column; all other columns are kept as they are.
reviews = reviews.drop(columns=['Symbol'])

In [None]:
# Reload the analysed review table that this notebook writes with `to_csv`.
reviews = pd.read_csv('Final/reviews_analysed_combined.csv', lineterminator='\n')
# The saved index comes back as the column 'Unnamed: 0'; drop it.
# `columns=` already names the axis, so the stray `axis=[1]` (a list is not a
# valid axis value) has been removed.
reviews = reviews.drop(columns='Unnamed: 0')

In [None]:
# Number of distinct (non-null) values in the Company column.
reviews.Company.value_counts().count()

In [None]:
# liwc = pd.read_csv('LIWC.csv',header=0,index_col=0)
# Load the raw combined reviews and clean them in one pass:
#   1. keep the first occurrence of each duplicated Text,
#   2. drop rows with no Text,
#   3. drop the Symbol column,
#   4. drop rows whose Text is the literal string 'Review'.
reviews = (
    pd.read_csv('Reviews/final/reviews_combined.csv')
    .drop_duplicates(subset='Text', keep='first')
    .loc[lambda frame: frame['Text'].notna()]
    .drop(columns=['Symbol'])
    .loc[lambda frame: frame['Text'] != 'Review']
)
# Remaining missing values per column.
reviews.isnull().sum()

In [None]:
# Load the English-only reviews file (written by the `lang == 'en'` cell of this
# notebook), using the first CSV column as the index.
reviews = pd.read_csv('Final/Reviews_en.csv',index_col=0,lineterminator='\n')

In [None]:

# Missing-value count for every column.
reviews.isnull().sum()

In [None]:
# Per-review length features taken from the processed text:
#   No_of_Words - number of whitespace-separated tokens
#   No_of_Chars - number of characters
# Computed with the vectorised `.str` accessor so every value stays on the row
# it came from. The previous `pd.Series(list)` carried a fresh 0..n-1 index and
# was aligned by label on assignment, which misplaces values (or yields NaN)
# once `reviews` has been filtered. This also no longer rebinds the
# module-level `words` vocabulary set to a per-review token list.
reviews['No_of_Words'] = reviews['text_processed'].str.split().str.len()
reviews['No_of_Chars'] = reviews['text_processed'].str.len()

In [None]:
# Count of missing Emotiveness values.
# NOTE: 'Emotiveness' is only assigned further down in this notebook, so this
# cell needs that cell to have run (or a reloaded file that already has it).
reviews['Emotiveness'].isnull().sum()

In [None]:
# from langdetect import detect
# from tqdm import tqdm_notebook
# tqdm_notebook().pandas()
# reviews['lang'] = reviews.Text.progress_map(detect)

In [None]:
# Keep only the rows whose detected language is English, then persist them.
is_english = reviews['lang'] == 'en'
reviews = reviews[is_english]
reviews.to_csv('Final/Reviews_en.csv')

In [None]:
# Print each column name followed, on the next line, by its missing-value count.
for column in reviews.columns:
    missing = reviews[column].isnull().sum()
    print(column, missing, sep='\n')
 

In [None]:

from langdetect import detect


def detectlang(review):
    """Return the language code that `langdetect.detect` assigns to `review`.

    Returns ' ' (a single space) when detection raises, so that one
    undetectable row does not abort a bulk run over the whole column.
    """
    try:
        return detect(review)
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # still propagate and a long run can be interrupted.
        return ' '
    
    

In [None]:
# Detect the language of every review text in worker processes, collecting the
# results in row order and printing a progress marker every 10000 rows.
language = []
with concurrent.futures.ProcessPoolExecutor() as executor:
    detected = executor.map(detectlang, reviews.Text)
    for position, code in enumerate(detected):
        if position % 10000 == 0:
            print(position)
        language.append(code)

In [None]:
# `language` holds one entry per row of `reviews`, in row order, so assign it
# positionally. Wrapping it in `pd.Series` gave it a fresh 0..n-1 index that is
# aligned by label on assignment, scattering NaN / wrong values whenever
# `reviews` has been filtered and no longer carries that index.
reviews['lang'] = language


In [None]:
# Load the two word-list tables used for the dictionary-based features below:
# each is indexed by category name (see `categories_harvard` / `categories_liwc`).
harvard = pd.read_csv('Harvard_Modified.csv')
liwc = pd.read_csv('LIWC.csv')
# reviews['lang'] = reviews.Text.progress_map(detect)

In [None]:
categories_harvard = ['Active','Passive','Need','Goal','Try','Means','Persist','Complet','Finish','PowGain','PowLoss','PowCon','PowCoop'
             ,'PowAuth','PowTot','NegAff','PosAff','SureLw','Self','Our','You']

# Build, for every Harvard category, the list of unique search terms.
categories_dict_harvard = {}
for category in categories_harvard:
    # Drop empty cells BEFORE converting to text: str(NaN) is 'nan', which
    # `dropna()` can no longer remove, so every category used to pick up a
    # bogus ' nan ' term.
    raw_terms = harvard[category].dropna().astype(str)
    # Keep the part before '#' and pad with one space on each side; the padded
    # term is later searched in space-padded review text (presumably to match
    # whole words only - confirm against the dictionary file).
    padded_terms = ' ' + raw_terms.str.split('#').str[0] + ' '
    unique_terms = pd.Series(padded_terms.unique())
    # Stored back by index label: the first len(unique_terms) rows are filled,
    # the remaining rows of the column become NaN.
    harvard[category] = unique_terms
    categories_dict_harvard[category] = pd.Series(list(unique_terms))

# Combined pronoun lists (duplicates across the source categories removed).
our_self_you = list(categories_dict_harvard['Our']) + list(categories_dict_harvard['Self']) + list(categories_dict_harvard['You'])
our_self = list(categories_dict_harvard['Our']) + list(categories_dict_harvard['Self'])
categories_dict_harvard['Our_Self_You'] = pd.Series(our_self_you).unique()
categories_dict_harvard['Our_Self'] = pd.Series(our_self).unique()

# Column names for the per-word averages, e.g. 'Active_avg'.
categories_harvard_avg = [category + '_avg' for category in categories_harvard]


In [None]:
categories_liwc = ['Money','Work','Swear','Anger','Anx','Sad','Tentat','Certain','FocusPast','FocusPresent',
              'FocusFuture','Posemo','Negemo','Percept','See','Hear','Feel']

# Replace every LIWC column by its distinct non-null entries (assigned back by
# index label, so the tail of the column becomes NaN).
for category in categories_liwc:
    distinct_terms = pd.Series(pd.Series(list(liwc[category])).unique())
    liwc[category] = distinct_terms.dropna()

# Category name -> Series of that category's terms.
categories_dict_liwc = {
    category: pd.Series(list(liwc[category].dropna()))
    for category in categories_liwc
}

# 'pisr' is the de-duplicated union of the Percept, Hear, See and Feel terms.
categories_dict_liwc['pisr'] = pd.Series(
    [
        *categories_dict_liwc['Percept'],
        *categories_dict_liwc['Hear'],
        *categories_dict_liwc['See'],
        *categories_dict_liwc['Feel'],
    ]
).unique()

# Column names for the per-word averages, e.g. 'Money_avg'.
categories_liwc_avg = [category + '_avg' for category in categories_liwc]
    

In [None]:
# Add one (empty) column per LIWC / Harvard category and per '<category>_avg'
# name by concatenating zero-row frames that only carry those column labels.
# The columns are filled in by the category analysis cells further down.
reviews = pd.concat([reviews,pd.DataFrame(columns=categories_liwc)])
reviews = pd.concat([reviews,pd.DataFrame(columns=categories_harvard)])
reviews = pd.concat([reviews,pd.DataFrame(columns=categories_liwc_avg)])
reviews = pd.concat([reviews,pd.DataFrame(columns=categories_harvard_avg)])

In [None]:
# #employment status
# employ = []
# for review in reviews.Designation:
#     status = str(review).split('(')
#     if(len(status)>1):
#         employ.append(status[1].split(')')[0])
#     else:
#         employ.append('')
# reviews['Employment_Status'] = pd.Series(employ) 
def status(desig):
    """Encode employment status from a designation value.

    The value is converted with `str()`; 'Former' is checked first and gives 0,
    'Current' gives 1, and anything else gives NaN.
    """
    text = str(desig)
    for marker, code in (('Former', 0), ('Current', 1)):
        if marker in text:
            return code
    return float('NaN')
    
# Encode every designation as 0 (former) / 1 (current) / NaN.
# A plain loop replaces the process pool: `status` is a couple of substring
# checks, so shipping each row to a worker process costs far more than the
# work itself.
status1 = [status(designation) for designation in reviews['Designation']]
# Assign the list positionally; `pd.Series(status1)` would be aligned on a new
# 0..n-1 index and misplace values when `reviews` has a non-default index.
reviews['Employment_Status'] = status1

In [None]:
# Build `text_processed` from `Text` in three CHAINED steps. Previously every
# step started again from `reviews['Text']`, so only the last one (padding)
# took effect and the punctuation removal and lower-casing were thrown away.
# NOTE(review): downstream features computed on `text_processed` will now see
# lower-cased, punctuation-free text - confirm that is wanted for all of them.
# Remove punctuation (raw string; no escapes are needed inside a character class)
reviews['text_processed'] = reviews['Text'].map(lambda x: re.sub(r'[(),.!?]+', '', x))
# Convert the text to lowercase
reviews['text_processed'] = reviews['text_processed'].map(lambda x: x.lower())
# Pad with one space on each side so space-delimited dictionary terms can also
# match the first and last word of a review
reviews['text_processed'] = reviews['text_processed'].map(lambda x: ' ' + x + ' ')

In [None]:
def get_good_tokens(sentence):
    """Clean a list of tokens.

    Every character other than an ASCII letter, a digit, '!' or '?' is removed
    from each token; tokens that end up empty are dropped. Order is preserved.
    """
    stripped = (re.sub('[^0-9A-Za-z!?]+', '', token) for token in sentence)
    return [token for token in stripped if token]

In [None]:
nltk.download('punkt')
# The ProcessPoolExecutor that used to be created here was never used by this
# cell and never shut down, so it has been removed.


def lda_get_good_tokens(df):
    """Lower-case `df['Text']` in place and add a `tokenized_text` column.

    `tokenized_text` holds, per row, the output of `nltk.word_tokenize`
    filtered through `get_good_tokens`. The frame is modified in place and
    nothing is returned.
    """
    df['Text'] = df.Text.str.lower()
    df['tokenized_text'] = list(map(nltk.word_tokenize, df.Text))
    df['tokenized_text'] = list(map(get_good_tokens, df.tokenized_text))


lda_get_good_tokens(reviews)

In [None]:
# POS-tag every token list in worker processes. The pool is opened in a `with`
# block so its workers are shut down when tagging finishes instead of
# lingering for the rest of the kernel session.
with concurrent.futures.ProcessPoolExecutor() as executor:
    reviews['tags'] = list(executor.map(nltk.pos_tag, reviews.tokenized_text))

In [None]:
# Type of the first row's token container. `.iloc[0]` is positional; `[0]` is
# a label lookup that raises KeyError once row label 0 has been filtered out.
type(reviews.tokenized_text.iloc[0])

In [None]:
# Checkpoint the frame (including the token and tag columns) to disk.
reviews.to_csv('Final/Reviews_processed.csv')

In [None]:
spell = SpellChecker()
# Spell_Errors = size of `spell.unknown(tokens)` for each review's token list.
# The list is assigned positionally so values stay on their own rows
# (`pd.Series(list)` would be re-aligned on a new 0..n-1 index), and the loop
# variable no longer overwrites the module-level `words` vocabulary set.
reviews['Spell_Errors'] = [
    len(spell.unknown(tokens)) for tokens in reviews.tokenized_text
]

In [None]:
def emotion(review):
    """Emotiveness ratio of a POS-tagged review.

    `review` is an iterable of (token, tag) pairs. Tags are classified by their
    first character: 'N' noun, 'V' verb, 'J' adjective, 'R' adverb.

    Returns (adjectives + adverbs) / (nouns + verbs), or -1 when the review
    has no nouns and no verbs (the ratio is undefined).
    """
    # One pass over the tags; `[:1]` tolerates an empty tag string.
    initials = Counter(item[1][:1] for item in review)
    content_words = initials['N'] + initials['V']
    if content_words == 0:
        # Explicit check instead of a bare `except:` around the division.
        return -1
    return (initials['J'] + initials['R']) / content_words
#     emotiveness.append(emotive)
# reviews["Emotiveness"] = pd.Series(emotiveness)

In [None]:
# Compute the emotiveness ratio of every tagged review in worker processes.
emotiveness = []
with concurrent.futures.ProcessPoolExecutor() as executor:
    for i, analyse in enumerate(executor.map(emotion, reviews.tags)):
        # Progress marker every 10000 rows (printing every index floods the output).
        if i % 10000 == 0:
            print(i)
        emotiveness.append(analyse)

# Positional assignment keeps each value on its own row; `pd.Series(list)`
# would be re-aligned on a new 0..n-1 index.
reviews["Emotiveness"] = emotiveness

In [None]:
# Scratch check: number of noun tags (tag starting with 'N') in the first row.
# `.iloc[0]` is positional; `[0]` is a label lookup that fails once row label 0
# has been filtered out.
temp = reviews.tags.iloc[0]
nouns = len([item[0] for item in temp if item[1][0] == 'N'])

In [None]:
# Count of missing PISR values.
# NOTE: 'PISR' is only assigned further down in this notebook, so this cell
# needs that cell to have run (or a reloaded file that already has the column).
reviews['PISR'].isnull().sum()

In [None]:
# pisr

def pisrs(review):
    """Matches of the LIWC 'pisr' terms per word of `review`.

    Each term in `categories_dict_liwc['pisr']` is used as a regular-expression
    pattern and its matches in `review` are counted; the total is divided by
    the number of whitespace-separated words.

    Returns 0.0 for a review with no words instead of raising
    ZeroDivisionError.
    """
    # NOTE(review): terms are passed to `re.findall` unescaped, so any regex
    # metacharacter in the dictionary is interpreted as regex syntax - confirm.
    word_length = len(review.split())
    if word_length == 0:
        return 0.0
    count = sum(len(re.findall(term, review)) for term in categories_dict_liwc['pisr'])
    return count / word_length
#     pisr.append(count/len(review))
# reviews['PISR'] = pd.Series(pisr)

In [None]:
# Compute the 'pisr' rate of every processed review in worker processes.
with concurrent.futures.ProcessPoolExecutor() as executor:
    pisr = list(executor.map(pisrs, reviews.text_processed))
# Positional assignment keeps each value on its own row; `pd.Series(list)`
# would be re-aligned on a new 0..n-1 index.
reviews['PISR'] = pisr

In [None]:
# Dale-Chall readability score of every processed review, in worker processes.
with concurrent.futures.ProcessPoolExecutor() as executor:
    scores = list(executor.map(textstat.dale_chall_readability_score, reviews.text_processed))
# Positional assignment keeps each value on its own row; `pd.Series(list)`
# would be re-aligned on a new 0..n-1 index.
reviews['Readability Score'] = scores

In [None]:
def _sentiment(text):
    """Return TextBlob's (polarity, subjectivity) for `text`."""
    sentiment = TextBlob(text).sentiment
    return sentiment.polarity, sentiment.subjectivity


# Evaluate the sentiment INSIDE the workers. Mapping the bare `TextBlob`
# constructor only built the objects in parallel; `.sentiment` is computed on
# first access, which then happened serially in the parent process.
with concurrent.futures.ProcessPoolExecutor() as executor:
    sentiments = list(executor.map(_sentiment, reviews.text_processed))

polarity = [pol for pol, _ in sentiments]
subjectivity = [subj for _, subj in sentiments]

# Positional assignment keeps each value on its own row; `pd.Series(list)`
# would be re-aligned on a new 0..n-1 index.
reviews['Subjectivity'] = subjectivity
reviews['Polarity'] = polarity

In [None]:

# def harvard_analysis(review):
#     results =[]
#     results_avg = []
#     for category in categories_harvard:
# #         print(category)
# #         print()
#         count = 0
#         for term in categories_dict_harvard[category]:
# #             print(term)
#             words = re.findall(term,review)
#             count += len(words)
#         avg = count/len(review)
# #         reviews.loc[i,'length'] = len(review)
#         results.append(count)
#         results_avg.append(avg)
# #         opinion.loc[i,category] = count
# #         category_avg = str(category) + '_avg'
# #         opinion.loc[i,category_avg] = avg
#     return [results,results_avg];
    

In [None]:
# results = []
# results_avg = []
# with concurrent.futures.ProcessPoolExecutor() as executor:
#     for i,analysis in enumerate(executor.map(harvard_analysis,reviews.text_processed)):
#         print(i)
#         reviews.loc[i,categories_harvard]= analysis[0]
#         reviews.loc[i,categories_harvard_avg] = analysis[1]

In [None]:
def harvard_analysis(review,category):
    """Count matches of one Harvard category's terms in `review`.

    Each term in `categories_dict_harvard[category]` is used as a
    regular-expression pattern against `review`.

    Returns `[count, avg]`: the total number of matches and that total divided
    by the number of whitespace-separated words. `avg` is 0.0 for a review
    with no words instead of raising ZeroDivisionError.
    """
    word_length = len(review.split())
    count = sum(len(re.findall(term, review)) for term in categories_dict_harvard[category])
    avg = count / word_length if word_length else 0.0
    return [count, avg]

In [None]:
import functools
import itertools

# For every Harvard category, fill the '<category>' (match count) and
# '<category>_avg' (matches per word) columns. One pool is shared by all
# categories and is shut down by the `with` block when the loop ends; it used
# to be created without ever being closed.
with concurrent.futures.ProcessPoolExecutor() as executor:
    for categori in categories_harvard:
        print(categori)
        series1 = []
        series2 = []
        results = executor.map(harvard_analysis, reviews.text_processed, itertools.repeat(categori))
        for j, analyse in enumerate(results):
            if j % 10000 == 0:
                print(j)  # progress marker
            series1.append(analyse[0])
            series2.append(analyse[1])
        reviews[str(categori)] = series1
        reviews[str(categori) + '_avg'] = series2
        

In [None]:
def liwc_analysis(review,category):
    """Count matches of one LIWC category's terms in `review`.

    Each term in `categories_dict_liwc[category]` is used as a
    regular-expression pattern against `review`.

    Returns `[count, avg]`: the total number of matches and that total divided
    by the number of whitespace-separated words. `avg` is 0.0 for a review
    with no words instead of raising ZeroDivisionError.
    """
    # NOTE(review): terms are passed to `re.findall` unescaped, so any regex
    # metacharacter in the dictionary is interpreted as regex syntax - confirm.
    word_length = len(review.split())
    count = sum(len(re.findall(term, review)) for term in categories_dict_liwc[category])
    avg = count / word_length if word_length else 0.0
    return [count, avg]

In [None]:
import functools
import itertools

# For every LIWC category, fill the '<category>' (match count) and
# '<category>_avg' (matches per word) columns. One pool is shared by all
# categories and is shut down by the `with` block when the loop ends; it used
# to be created without ever being closed.
with concurrent.futures.ProcessPoolExecutor() as executor:
    for categori in categories_liwc:
        print(categori)
        series1 = []
        series2 = []
        results = executor.map(liwc_analysis, reviews.text_processed, itertools.repeat(categori))
        for j, analyse in enumerate(results):
            if j % 100000 == 0:
                print(j)  # progress marker
            series1.append(analyse[0])
            series2.append(analyse[1])
        reviews[str(categori)] = series1
        reviews[str(categori) + '_avg'] = series2
        

In [None]:
# Persist the frame with all dictionary features; the cell near the top of the
# notebook reloads this file.
reviews.to_csv('Final/reviews_analysed_combined.csv')

In [None]:
# Second copy of the same frame, written to the working directory.
reviews.to_csv('reviews_analysed1.csv')

In [None]:
def individual_ref(review):
    """Pronoun-reference rates for one review.

    Terms from the Harvard 'Our', 'Self', 'Our_Self' and 'Our_Self_You' lists
    are used as regular-expression patterns and their matches in `review` are
    counted.

    Returns a 4-element list:
        [0] 'Our' matches per word
        [1] 'Our_Self_You' matches per word
        [2] 'Self' matches per word
        [3] 'Self' matches / 'Our_Self' matches, or -1 when there are no
            'Our_Self' matches

    A review with no words yields [0.0, 0.0, 0.0, -1] instead of raising
    ZeroDivisionError.
    """
    def count_matches(key):
        # Total regex matches of every term in one category.
        return sum(len(re.findall(term, review)) for term in categories_dict_harvard[key])

    word_length = len(review.split())
    if word_length == 0:
        return [0.0, 0.0, 0.0, -1]

    count_our = count_matches('Our')
    count_self = count_matches('Self')
    count_our_self = count_matches('Our_Self')
    count_our_self_you = count_matches('Our_Self_You')

    ratio = count_self / count_our_self if count_our_self else -1
    return [count_our / word_length, count_our_self_you / word_length, count_self / word_length, ratio]
    

    

In [None]:
individual_reference = []
group_reference = []
narcissicism = []
self_ref = []

import concurrent.futures
# Collect the full result list of `individual_ref` for every processed review.
with concurrent.futures.ProcessPoolExecutor() as executor:
    for i, analyse in enumerate(executor.map(individual_ref, reviews.text_processed)):
        if i % 100000 == 0:
            print(i)  # progress marker
        individual_reference.append(analyse)

# Each cell of this column holds one review's whole result list. The Series is
# built on `reviews.index` so every value lands on its own row; a Series with
# the default 0..n-1 index would be re-aligned by label on assignment.
reviews['Individual_Reference'] = pd.Series(individual_reference, index=reviews.index)


In [None]:
def individual_ref(review):
    """Pronoun-reference rates for one review (redefines the earlier cell's function).

    Terms from the Harvard 'Our', 'Self', 'Our_Self' and 'Our_Self_You' lists
    are used as regular-expression patterns and their matches in `review` are
    counted.

    Returns a 4-element list, which is what the cell consuming this function
    indexes (`analyse[0]` .. `analyse[3]`):
        [0] 'Our' matches per word
        [1] 'Our_Self_You' matches per word
        [2] 'Self' matches per word
        [3] 'Self' matches / 'Our_Self' matches, or -1 when there are no
            'Our_Self' matches

    A review with no words yields [0.0, 0.0, 0.0, -1].

    This version used to reference an undefined `count_group` (NameError on
    every call) and returned only three values; the order above is the one the
    earlier definition of this function returns.
    """
    # NOTE(review): the consumer stores [0] as Individual_Reference and [1] as
    # Group_Reference - confirm that this mapping of categories to column
    # names is the intended one.
    def count_matches(key):
        # Total regex matches of every term in one category.
        return sum(len(re.findall(term, review)) for term in categories_dict_harvard[key])

    word_length = len(review.split())
    if word_length == 0:
        return [0.0, 0.0, 0.0, -1]

    count_our = count_matches('Our')
    count_self = count_matches('Self')
    count_our_self = count_matches('Our_Self')
    count_our_self_you = count_matches('Our_Self_You')

    ratio = count_self / count_our_self if count_our_self else -1
    return [count_our / word_length, count_our_self_you / word_length, count_self / word_length, ratio]
    

In [None]:
individual_reference = []
group_reference = []
narcissicism = []
self_ref = []

import concurrent.futures
# Split the 4-element result of `individual_ref` into one list per feature.
with concurrent.futures.ProcessPoolExecutor() as executor:
    for i, analyse in enumerate(executor.map(individual_ref, reviews.text_processed)):
        if i % 100000 == 0:
            print(i)  # progress marker
        individual_reference.append(analyse[0])
        group_reference.append(analyse[1])
        self_ref.append(analyse[2])
        narcissicism.append(analyse[3])

# Lists are assigned positionally so each value stays on its own row;
# `pd.Series(list)` would be re-aligned on a new 0..n-1 index.
reviews['Individual_Reference'] = individual_reference
reviews['Group_Reference'] = group_reference
reviews['Self_Reference'] = self_ref
reviews['Narcissism_Reference'] = narcissicism


In [None]:
# Re-assign the group-reference values positionally (one list entry per row);
# `pd.Series(list)` would be re-aligned on a new 0..n-1 index.
reviews['Group_Reference'] = group_reference

In [None]:
# Persist the frame including the pronoun-reference columns.
reviews.to_csv('Final/reviews_analysed_narcissism.csv')

In [None]:
# The language column is no longer needed; remove it.
reviews = reviews.drop(columns=['lang'])

In [None]:
# Preview the first rows of the final frame.
reviews.head()

In [None]:
# List every column currently in the frame.
reviews.columns

In [None]:
# Show only the Harvard category count columns.
reviews[categories_harvard]