In [22]:
# Getting the data
import pandas as pd
import numpy as np
data=pd.read_csv('Web/MafiaGameReview/FullData.csv')
del(data[data.columns.values[0]])
data=data[data['Language']=='E']

In [23]:
# Get the list of most important English words in the reviews
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,max_features=30,stop_words='english')
tfidf = tfidf_vectorizer.fit_transform(data['Text'])
mat=tfidf.todense()

colValues=tfidf_vectorizer.get_feature_names()
vectorizedData=pd.DataFrame(columns=colValues)

for i in range(0,len(mat)):
    vectorizedData=vectorizedData.append(pd.DataFrame(np.array(mat[i]),columns=colValues))
data=data.reset_index()
vectorizedData=vectorizedData.reset_index()
data['bad']=vectorizedData['bad']
data['good']=vectorizedData['good']

In [None]:
# Bad Data
dataBad=data[data['bad']>0.3]

# We will now do a TFIDF on this data set which will represent the bad parts ( word cloud sort of )
tfidf_badVectorizer = TfidfVectorizer(max_df=0.95, min_df=2,max_features=50,stop_words='english')
tfidf_bad = tfidf_badVectorizer.fit_transform(dataBad['Text'])
matBad=tfidf.todense()
colValuesBad=tfidf_badVectorizer.get_feature_names()
colValuesBad.remove('great')
colValuesBad.remove('good')
colValuesBad.remove('better')
colValuesBad.remove('game')
colValuesBad.remove('make')
colValuesBad.remove('say')
colValuesBad.remove('ve')
colValuesBad.remove('including')

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
wordcloud = WordCloud(width = 1000, height = 500).generate(' '.join(colValuesBad))
plt.figure(figsize=(15,8))
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [24]:
# Phrase Analysis
#data['Text'].head(10)

# Find all bi grams containing a specific word
wordVal='good'
wordSentiment=pd.read_csv('/home/anantgupta/Documents/Web/MafiaGameReview/SentiWord/SentiWordNet.txt',sep='\t')
wordSentiment['SynsetTerms']=wordSentiment['SynsetTerms'].map(lambda x:x[0:x.find('#')])

In [105]:
sentenceList=data['Text'].ix[1].split('.')

# We will write a function to create bigram list from each sentence
def getBiGrams(bigramList,prevWord,sent):
    # We will have to do some POS Tagging also
    sentencePosTags=nltk.pos_tag(nltk.word_tokenize(sent))
    for curWord in sentencePosTags:
        if(prevWord != ''):
            bigramList.append((prevWord,curWord))
            prevWord=curWord
        if(prevWord == ''):
            prevWord=curWord

# We will loop through all ther bigrams and allot a score based on WordNet Sentiment
import math
def getBiGramScore((x,y)):
    if(x[1] in ['NN','NNS','NNP','NNPS']):
        posScore1=wordSentiment[(wordSentiment['# POS']=='n') & (wordSentiment['SynsetTerms']==x[0].lower())]['PosScore'].mean()
        negScore1=wordSentiment[(wordSentiment['# POS']=='n') & (wordSentiment['SynsetTerms']==x[0].lower())]['NegScore'].mean()
    elif(x[1] in ['RB','RBR','RBS']):
        posScore1=wordSentiment[(wordSentiment['# POS']=='a') & (wordSentiment['SynsetTerms']==x[0].lower())]['PosScore'].mean()
        negScore1=wordSentiment[(wordSentiment['# POS']=='a') & (wordSentiment['SynsetTerms']==x[0].lower())]['NegScore'].mean()
    elif(x[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']):
        posScore1=wordSentiment[(wordSentiment['# POS']=='v') & (wordSentiment['SynsetTerms']==x[0].lower())]['PosScore'].mean()
        negScore1=wordSentiment[(wordSentiment['# POS']=='v') & (wordSentiment['SynsetTerms']==x[0].lower())]['NegScore'].mean()
    else:
        posScore1=wordSentiment[(wordSentiment['SynsetTerms']==x[0].lower())]['PosScore'].mean()
        negScore1=wordSentiment[(wordSentiment['SynsetTerms']==x[0].lower())]['PosScore'].mean()
        
    if(y[1] in ['NN','NNS','NNP','NNPS']):
        posScore2=wordSentiment[(wordSentiment['# POS']=='n') & (wordSentiment['SynsetTerms']==y[0].lower())]['PosScore'].mean()
        negScore2=wordSentiment[(wordSentiment['# POS']=='n') & (wordSentiment['SynsetTerms']==y[0].lower())]['NegScore'].mean()
    elif(y[1] in ['RB','RBR','RBS']):
        posScore2=wordSentiment[(wordSentiment['# POS']=='a') & (wordSentiment['SynsetTerms']==y[0].lower())]['PosScore'].mean()
        negScore2=wordSentiment[(wordSentiment['# POS']=='a') & (wordSentiment['SynsetTerms']==y[0].lower())]['NegScore'].mean()
    elif(y[1] in ['VB','VBD','VBG','VBN','VBP','VBZ']):
        posScore2=wordSentiment[(wordSentiment['# POS']=='v') & (wordSentiment['SynsetTerms']==y[0].lower())]['PosScore'].mean()
        negScore2=wordSentiment[(wordSentiment['# POS']=='v') & (wordSentiment['SynsetTerms']==y[0].lower())]['NegScore'].mean()
    else:
        posScore2=wordSentiment[(wordSentiment['SynsetTerms']==y[0].lower())]['PosScore'].mean()
        negScore2=wordSentiment[(wordSentiment['SynsetTerms']==y[0].lower())]['PosScore'].mean()
    if(math.isnan(posScore1)):
        posScore1=wordSentiment[wordSentiment['SynsetTerms']==x[0].lower()]['PosScore'].mean()
        negScore1=wordSentiment[wordSentiment['SynsetTerms']==x[0].lower()]['NegScore'].mean()
    if(math.isnan(posScore2)):
        posScore2=wordSentiment[wordSentiment['SynsetTerms']==y[0].lower()]['PosScore'].mean()
        negScore2=wordSentiment[wordSentiment['SynsetTerms']==y[0].lower()]['NegScore'].mean()                         
    if(math.isnan(posScore1)):
        posScore1=0
        negScore1=0
    if(math.isnan(posScore2)):
        posScore2=0
        negScore2=0
    posScore=posScore1 + posScore2
    negScore=negScore1 + negScore2
    #print("{}={} {}={}  {} {}".format(x[0],x[1],y[0],y[1],posScore1,posScore2))
    return(posScore - negScore)

def getScore(sentence):
    prevWord=''
    bigramList=[]
    #[getBiGrams(bigramList,prevWord,curSentence) for curSentence in sentenceList]
    getBiGrams(bigramList,prevWord,sentence)
    bigramScore=[getBiGramScore((x,y)) for (x,y) in bigramList]
    return(sum(bigramScore))

data['SentencesScorreList']=data['Text'].apply()

[getScore(sentence) for sentence in sentenceList]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,


In [99]:
#bigramList
#wordSentiment[wordSentiment['SynsetTerms']=='for']
#bigramList[0][0][1]
#'NN' in ['NN','NNS','NNP','NNPS']
bigramScore
#wordSentiment[(wordSentiment['# POS']=='a') & (wordSentiment['SynsetTerms']==bigramList[0][0][0].lower())]['PosScore'].mean()
#import nltk
#abc=nltk.pos_tag(nltk.word_tokenize(data['Text'][1]))
#data['Text'][1]
#'Even'.lower()
#wordSentiment[(wordSentiment['PosScore'] >0) & (wordSentiment['NegScore'] > 0)]
#bigramList[0][0]
#wordSentiment[wordSentiment['SynsetTerms']]

[0.3125,
 0.0,
 0.0,
 0,
 0,
 0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.875,
 0.875,
 0.0,
 0.0,
 0.0,
 0,
 0,
 0,
 0,
 0,
 0.0,
 0.0,
 0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0,
 0,
 0,
 0,
 0.0625,
 0.0625,
 0.039473684210526314,
 0.039473684210526314,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.125,
 0.0,
 0.125,
 0.125,
 0,
 0.0,
 0.0,
 0,
 0.0,
 0.0,
 0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.020833333333333332,
 0.020833333333333332,
 0.0,
 0.0,
 0,
 -0.025,
 -0.025,
 0,
 0.0,
 0.0,
 0,
 0,
 0.5833333333333334,
 0.5833333333333334,
 0,
 0,
 0.0,
 0.0,
 0,
 0,
 0,
 0,
 0.0,
 0.0,
 0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.1346153846153846,
 0.1346153846153846,
 0,
 0.0,
 0.0,
 0.0,
 0.0,
 -0.025000000000000022,
 -0.025,
 0,
 0.0]