In [1]:
import pandas as pd
import os
import sys
# Make the local pyMorfologik checkout importable before the pymorfologik imports run.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this directory exists; consider making it configurable.
module_path = os.path.abspath(os.path.join('/Users/balbi/Downloads/pyMorfologik-master'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pymorfologik import Morfologik
from pymorfologik.parsing import ListParser
from pymorfologik.parsing import BaseParser
from pymorfologik.parsing import DictParser

import numpy as np
from __future__ import print_function
import string
from multiprocessing import cpu_count, Pool
import dask.dataframe as dd
from dask.multiprocessing import get
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
def getEmotionStatusOfText(result):
    """Map a numeric sentiment score to a label.

    Returns 'negative' for scores below zero, 'positive' for scores above
    zero and 'neutral' otherwise (zero, or a value such as NaN for which
    both comparisons are false).
    """
    if result < 0:
        return 'negative'
    if result > 0:
        return 'positive'
    return 'neutral'

In [3]:
def lemmatisation(text):
    """Stem ``text`` with Morfologik (ListParser output) and flatten the result.

    The stemmer output is walked three levels deep and every innermost item
    whose length is greater than one is collected, in encounter order.

    NOTE(review): presumably each entry is a (word, lemmas) pair, so the
    ``len > 1`` filter drops the single characters produced by iterating the
    original word -- confirm against the pymorfologik ListParser format.

    Returns:
        list: the collected items.
    """
    list_parser = ListParser()
    morfologik = Morfologik()
    stemmed = morfologik.stem([text], list_parser)
    return [
        item
        for entry in stemmed
        for part in entry
        for item in part
        if len(item) > 1
    ]

In [4]:
def lemmatisation_dictParser(text):
    """Stem ``text`` with Morfologik (DictParser output) and keep one form per key.

    For every key of the mapping returned by the stemmer, the first element
    of its value is collected.

    NOTE(review): presumably keys are the input words and values their
    candidate lemmas -- confirm against the pymorfologik DictParser format.

    Returns:
        list: first element of each value, in the mapping's iteration order.
    """
    dict_parser = DictParser()
    morfologik = Morfologik()
    stemmed = morfologik.stem([text], dict_parser)
    return [forms[0] for _word, forms in stemmed.items()]

In [16]:
# Sentiment lexicon, read once at module level; detectingEmotionsInText reads
# its 'word' and 'sentiment' columns.
emotions = pd.read_csv('emotionsTable-with-neutralEmotion.csv')
def detectingEmotionsInText(words_list):
    """Score a sequence of words against the module-level ``emotions`` table.

    Every element of ``words_list`` is looked up in ``emotions['word']``; for
    each element that occurs there, the mean of the matching ``'sentiment'``
    values is added to a running total. The total is divided by the number
    of elements in ``words_list`` (found or not).

    Note that the argument is iterated as-is: passing a plain string scores
    its individual characters, so callers should pass a list of words.

    Args:
        words_list: sized iterable of words.

    Returns:
        list: ``[score, found]`` where ``score`` is the averaged sentiment
        (``0`` for empty input) and ``found`` is how many elements occurred
        in the table.
    """
    numberOfWords = len(words_list)
    numberOfWordsFound = 0
    result = 0
    if numberOfWords > 0:
        # Start from a float so the score has a consistent type even when no
        # word is found.
        emotionsCounter = 0.0
        for el in words_list:
            # Scan the table once per word and reuse the selection for both
            # the membership check and the mean.
            sentiments = emotions.loc[emotions['word'] == el, 'sentiment']
            if not sentiments.empty:
                emotionsCounter += float(sentiments.mean())
                numberOfWordsFound += 1
        result = emotionsCounter / numberOfWords
    return [result, numberOfWordsFound]

In [8]:
%%time
import sys
import re
# NOTE(review): Python 2-only idiom (`reload` builtin plus
# `sys.setdefaultencoding`); this cell will not run on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

result = pd.read_csv('mediaKrytyk-commentsRates.csv')
# reset_index() turns the row index into a leading 'index' column. Rows are
# later unpacked positionally into calculateOneRow(index, comment, emotion, rate),
# so the CSV is presumably expected to hold exactly comment, emotion and rate
# columns in that order -- TODO confirm.
result = result.reset_index()

def calculateOneRow(index,comment, emotion, rate):
    """Run emotion detection on one comment and package the outcome.

    Punctuation is stripped from ``comment`` and the remainder is split on
    whitespace, so that detectingEmotionsInText receives a list of words.
    Passing it the string itself would score single characters, because that
    function iterates over its argument.

    Args:
        index: row index; accepted for positional unpacking, not used.
        comment: raw comment text.
        emotion: labelled emotion, copied to the output.
        rate: user rating, copied to the output.

    Returns:
        dict with keys 'text', 'words' (whitespace-separated token count of
        the raw comment), 'wordsFound', 'resultOfDetecting',
        'detectedEmotion', 'emotion' and 'rate'.
    """
    cleaned_text = re.sub("[!#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']", '', comment)
    # Tokenise before scoring: the detector expects words, not a string.
    words_list = cleaned_text.split()
    [result, number] = detectingEmotionsInText(words_list)
    result_emotion = getEmotionStatusOfText(result)
    return {"text": comment, 'words': len(comment.split()), 'wordsFound': number, "resultOfDetecting": result, "detectedEmotion": result_emotion, 'emotion': emotion, 'rate': rate}

def parallelize(data, func):
    """Apply ``func`` to chunks of ``data`` in a process pool and concatenate.

    ``data`` is split into ``partitions`` pieces with ``np.array_split`` and
    the pieces are mapped over a ``Pool`` of ``cores`` workers; both
    ``partitions`` and ``cores`` are read from module scope.

    Args:
        data: object accepted by ``np.array_split``.
        func: callable applied to each chunk; its results are passed to
            ``pd.concat``.

    Returns:
        The concatenation of the per-chunk results.
    """
    print(partitions)
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    try:
        mapResult = pool.map(func, data_split)
    finally:
        # Release the worker processes even when func raises in a worker.
        pool.close()
        pool.join()
    return pd.concat(mapResult)

# One dask partition per CPU core.
cores = cpu_count()
partitions = cores

ddata = dd.from_pandas(result, npartitions=partitions)

# Each row is unpacked positionally into calculateOneRow, which returns one
# dict per comment; `result` is rebound from the input frame to those dicts.
result = ddata.map_partitions(lambda df: df.apply((lambda row: calculateOneRow(*row)), axis=1)).compute(get=get)
# Build the frame once from all records: appending row by row copies the
# whole frame on every iteration (quadratic in the number of comments).
result_df = pd.DataFrame(list(result))

result_df.to_csv('mediaKrytyk-comments-analysis-dictParser-withoutLem.csv', encoding='utf-8', index=False) 

In [None]:
# Flag the rows where the detected emotion equals the labelled one.
result_df['correct_guess'] = result_df['detectedEmotion'] == result_df['emotion']

In [None]:
# Number of rows per correct_guess value.
result_df.groupby('correct_guess').size()

In [None]:
def changeRangeOfComments(maximumNegativeRate, mimimumPositiveRate, comments):
    """Keep only the comments whose rate lies at either extreme.

    A row is kept when its ``rate`` is at most ``maximumNegativeRate`` or at
    least ``mimimumPositiveRate``.

    Args:
        maximumNegativeRate: upper bound (inclusive) of the low range.
        mimimumPositiveRate: lower bound (inclusive) of the high range.
        comments: DataFrame with a ``rate`` column.

    Returns:
        A new DataFrame with the kept rows; ``reset_index()`` renumbers them
        and stores the previous index in an ``index`` column.
    """
    is_low = comments.rate <= maximumNegativeRate
    is_high = comments.rate >= mimimumPositiveRate
    return comments[is_low | is_high].reset_index()

In [None]:
# Only the comments rated at most 1 or at least 10.
results = changeRangeOfComments(1, 10, result_df)

In [None]:
# Number of rows per correct_guess value within the extreme-rated subset.
results.groupby('correct_guess').size()

In [None]:
def printResults(result_df):
    """Print how many rows were guessed correctly and how many were not.

    Reads the boolean ``correct_guess`` column of ``result_df`` and prints a
    caption followed by the row count, first for ``True`` and then for
    ``False``.
    """
    captions = (
        ('Number of correct guess: ', True),
        ('Number of wrong guess: ', False),
    )
    for caption, outcome in captions:
        matching_rows = result_df.loc[result_df['correct_guess'] == outcome]
        print(caption)
        print(len(matching_rows))

In [None]:
def get_length(text):
    """Return ``len(text)``."""
    size = len(text)
    return size

def get_length_bin(length):
    """Return the bin number for ``length``: ``length / 10``, rounded.

    NOTE(review): ``/`` truncates integer inputs on Python 2 and is true
    division on Python 3, so the bin edges depend on the interpreter and on
    whether ``length`` is an int or a float -- confirm which is intended.
    """
    tenths = length / 10
    return round(tenths)

def get_score(result):
    """Return 1 when ``result`` is truthy, otherwise 0."""
    if result:
        return 1
    return 0

def prepereDataForPlot(result_df):
    """Add the helper columns used by the plotting cells.

    The frame is modified in place and also returned. Three columns are
    written:

    * ``correct_guess`` -- whether ``detectedEmotion`` equals ``emotion``;
    * ``score`` -- ``get_score`` applied to ``correct_guess``;
    * ``text_length_bin`` -- ``get_length_bin`` applied to ``words``.

    Args:
        result_df: DataFrame with ``detectedEmotion``, ``emotion`` and
            ``words`` columns.

    Returns:
        The same DataFrame object, with the three columns added.
    """
    result_df['correct_guess'] = result_df['detectedEmotion'] == result_df['emotion']
    # Series.map instead of the builtin map(): the builtin only yields an
    # assignable list on Python 2, whereas Series.map works on both versions
    # and keeps the result aligned with the frame's index.
    result_df['score'] = result_df['correct_guess'].map(get_score)
    # Group texts into bins of roughly ten words.
    result_df['text_length_bin'] = result_df['words'].map(get_length_bin)
    return result_df

In [None]:
def createPlotTextLength(result_df):
    """Bar chart of word-count ratios per text-length bin.

    Rows are grouped by ``text_length_bin`` and the per-bin means of the word
    count columns are divided by the per-bin mean of ``words``. One
    semi-transparent bar series is drawn per column, all at the same x
    positions so they overlay each other.

    Args:
        result_df: DataFrame with ``text_length_bin``, ``words`` and
            ``wordsFound`` columns, and optionally
            ``wordsAfterLemmatisation``.
    """
    # (column, bar colour, legend label), in drawing order.
    series_specs = [
        ('words', 'b', 'words'),
        ('wordsAfterLemmatisation', 'g', 'words after lemmatisation'),
        ('wordsFound', 'c', 'words found'),
    ]
    # calculateOneRow does not emit 'wordsAfterLemmatisation', so only the
    # columns actually present are plotted rather than failing on a missing one.
    present = [spec for spec in series_specs if spec[0] in result_df.columns]
    columns = [spec[0] for spec in present]

    # Restrict the mean to the plotted columns so text columns are not aggregated.
    grouped = result_df.groupby('text_length_bin', as_index=False)[columns].mean()
    sum_elements = grouped['words']

    fig, ax = plt.subplots()
    # One bar slot per bin present in the data; a fixed-size list only works
    # for exactly that many bins.
    index = list(range(len(grouped)))
    bar_width = 0.35
    opacity = 0.5

    for column, colour, label in present:
        # Labelled bars give the legend something to show.
        ax.bar(index, grouped[column] / sum_elements, bar_width,
               alpha=opacity,
               color=colour,
               label=label)

    ax.set_xlabel('Lenght of text')
    ax.set_ylabel('Percent')
    ax.set_xticks(index)
    # Positional labels '10', '20', ... as in the original eight-bin layout.
    # NOTE(review): assumes bins are contiguous and start at the first slot -- confirm.
    ax.set_xticklabels([str((position + 1) * 10) for position in index])
    ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Adds the correct_guess, score and text_length_bin columns; prepereDataForPlot
# writes them into result_df in place and returns the same frame.
result_df = prepereDataForPlot(result_df)

In [None]:
# Draw the per-length-bin chart.
createPlotTextLength(result_df)

In [None]:
# Per-bin column means, with text_length_bin kept as a regular column.
grouped = result_df.groupby('text_length_bin', as_index=False).mean() 

In [None]:
# Display the grouped means.
grouped

In [None]:
def createPlotRates(result_df):
    """Bar chart of word-count ratios per (rounded) rate.

    ``rate`` is rounded to zero decimals on a copy of the frame, rows are
    grouped by it, and the per-rate means of the word count columns are
    divided by the per-rate mean of ``words``. One semi-transparent bar
    series is drawn per column, all at the same x positions so they overlay
    each other.

    Args:
        result_df: DataFrame with ``rate``, ``words`` and ``wordsFound``
            columns, and optionally ``wordsAfterLemmatisation``.
    """
    # (column, bar colour, legend label), in drawing order.
    series_specs = [
        ('words', 'b', 'words'),
        ('wordsAfterLemmatisation', 'g', 'words after lemmatisation'),
        ('wordsFound', 'c', 'words found'),
    ]
    # calculateOneRow does not emit 'wordsAfterLemmatisation', so only the
    # columns actually present are plotted rather than failing on a missing one.
    present = [spec for spec in series_specs if spec[0] in result_df.columns]
    columns = [spec[0] for spec in present]

    result_df = result_df.round({'rate': 0})
    # Restrict the mean to the plotted columns so text columns are not aggregated.
    grouped = result_df.groupby('rate', as_index=False)[columns].mean()
    total_words = grouped['words']

    fig, ax = plt.subplots()
    # One bar slot per rate present in the data. The previous fixed list had
    # 11 positions but only 10 tick labels, so bars and labels could not line up.
    index = list(range(len(grouped)))
    bar_width = 0.35
    opacity = 0.5

    for column, colour, label in present:
        # Labelled bars give the legend something to show.
        ax.bar(index, grouped[column] / total_words, bar_width,
               alpha=opacity,
               color=colour,
               label=label)

    ax.set_xlabel('Rates')
    ax.set_ylabel('Percent')
    ax.set_xticks(index)
    # Label every bar with the rate it represents.
    ax.set_xticklabels([str(int(rate)) for rate in grouped['rate']])
    ax.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Draw the per-rate chart.
createPlotRates(result_df)

In [None]:
# Round rate to zero decimals on a copy, then take per-rate column means
# (rate stays a regular column because of as_index=False).
result_rates = result_df.round({'rate': 0})
grouped = result_rates.groupby('rate', as_index=False).mean()

In [None]:
# Tag glossary: subst - noun, adj - adjective, verb - verb, nom - nominative, inf - infinitive

In [None]:
def devide(number):
    """Return half of ``number`` as a float.

    Dividing by ``2.0`` instead of ``2`` keeps the fractional part for integer
    inputs under Python 2, where ``5 / 2`` would otherwise truncate to ``2``
    and understate the threshold that moreThenHalf compares against.
    """
    return number / 2.0

def moreThenHalf(result_df):
    """Mark rows where at least half of the lemmatised words were found.

    The frame is modified in place and also returned. Two columns are
    written:

    * ``half_wordsAfterLemmatisation`` -- ``devide`` applied to
      ``wordsAfterLemmatisation``;
    * ``moreThenHalf`` -- 1 when ``wordsFound`` is at least that half and
      both the half and ``wordsFound`` are positive, else 0.

    Args:
        result_df: DataFrame with ``wordsAfterLemmatisation`` and
            ``wordsFound`` columns.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    # Series.map instead of the builtin map(): the builtin only yields an
    # assignable list on Python 2.
    half = result_df['wordsAfterLemmatisation'].map(devide)
    result_df['half_wordsAfterLemmatisation'] = half
    found = result_df['wordsFound']
    enough_found = (found >= half) & (half > 0) & (found > 0)
    result_df['moreThenHalf'] = np.where(enough_found, 1, 0)
    return result_df

In [None]:
# moreThenHalf writes its columns into result_df in place and returns the same
# frame, so `tym` is another name for result_df rather than a copy.
tym = moreThenHalf(result_df)
tym.groupby('moreThenHalf').size()

In [None]:
# Rows where at least half of the lemmatised words were found. Take an explicit
# copy so the column assignment below writes to an independent frame instead
# of a slice of `tym` (which triggers SettingWithCopyWarning).
review_less = tym.loc[tym['moreThenHalf'] == 1].copy()
review_less['correct_guess'] = review_less['detectedEmotion'] == review_less['emotion']
review_less.groupby('correct_guess').size()

In [9]:
# NOTE(review): a raw string is passed here, and detectingEmotionsInText loops
# over its argument, so the sentence is scored character by character.
detectingEmotionsInText('Adam McKay zrealizował film swojego życia. Najambitniejszy, imponujący tempem, jednak bardzo przystępny i zrozumiały nawet dla takiego laika jak ja.')

[-0.10130718954248366, 24]

In [10]:
# Sample review used for the manual checks in the following cells.
text = 'Adam McKay zrealizował film swojego życia. Najambitniejszy, imponujący tempem, jednak bardzo przystępny i zrozumiały nawet dla takiego laika jak ja.'

In [14]:
# Number of whitespace-separated tokens in the sample.
len(text.split())

20

In [12]:
# Strip punctuation with the same pattern calculateOneRow uses; `text` is
# rebound, so re-running earlier cells afterwards sees the stripped version.
text = re.sub("[!#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']", '', text)

In [17]:
# NOTE(review): `text` is still a string, and detectingEmotionsInText loops
# over its argument, so characters rather than words are scored here.
detectingEmotionsInText(text)

[-0.1040268456375839, 24]