In [1]:
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('/Users/balbi/Downloads/pyMorfologik-master'))
if module_path not in sys.path:
    sys.path.append(module_path)
from pymorfologik import Morfologik
from pymorfologik.parsing import ListParser
import numpy as np
from __future__ import print_function
import string
from multiprocessing import cpu_count, Pool
import dask.dataframe as dd
from dask.multiprocessing import get
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
def lemmatisation(text):
    """Stem ``text`` with Morfologik and flatten the parsed output into a list.

    A fresh ``ListParser`` and ``Morfologik`` instance are created on every
    call. The value returned by ``stem`` is walked three levels deep and every
    innermost item whose length is greater than 1 is kept, in iteration order.

    Args:
        text: the text to stem; it is passed to ``stem`` wrapped in a
            one-element list.

    Returns:
        list: the collected innermost items (presumably lemma strings --
        confirm against the ``ListParser`` output format).
    """
    list_parser = ListParser()
    parsed = Morfologik().stem([text], list_parser)
    return [
        token
        for sentence in parsed
        for group in sentence
        for token in group
        if len(token) > 1
    ]

In [3]:
# Load the emotion lexicon; detectingEmotionsInText reads its 'word' and
# 'sentiment' columns.
emotions = pd.read_csv('emotionsTable-with-neutralEmotion.csv')

In [4]:
def detectingEmotionsInText(words_list, emotions_table=None):
    """Score a list of words against the sentiment lexicon.

    Every word that occurs in the lexicon's ``word`` column contributes the
    mean of its matching ``sentiment`` values. The sum of those contributions
    is divided by the total number of words (matched or not).

    Args:
        words_list: sequence of words (for example the output of
            ``lemmatisation``). A plain string is split on whitespace first;
            previously a string was iterated character by character, so only
            single characters were ever compared against the lexicon.
        emotions_table: optional DataFrame with ``word`` and ``sentiment``
            columns. Defaults to the module-level ``emotions`` frame.

    Returns:
        list: ``[score, words_found]``; ``[0, 0]`` when there are no words.
    """
    if emotions_table is None:
        emotions_table = emotions
    # (str, unicode) on Python 2, (str, str) on Python 3.
    if isinstance(words_list, (str, type(u''))):
        words_list = words_list.split()

    numberOfWords = len(words_list)
    numberOfWordsFound = 0
    result = 0
    if numberOfWords > 0:
        emotionsCounter = 0.0
        lexicon_words = emotions_table['word']
        for el in words_list:
            # Build the boolean mask once per word and reuse it for both the
            # membership test and the sentiment lookup.
            matches = lexicon_words == el
            if matches.any():
                emotionsCounter += float(emotions_table.loc[matches, 'sentiment'].mean())
                numberOfWordsFound += 1
        result = emotionsCounter / numberOfWords
    return [result, numberOfWordsFound]

In [5]:
# Intended tolerance band (translated from the original Polish note):
#   from -0.01 to 0.01 -> neutral, above 0.01 -> positive, below -0.01 -> negative.
# NOTE(review): the code has always compared against exactly 0, so the default
# tolerance stays 0.0; pass tolerance=0.01 to get the band described above.

def getEmotionStatusOfText(result, tolerance=0.0):
    """Map a numeric sentiment score to a label.

    Args:
        result: numeric score (the first element returned by
            ``detectingEmotionsInText``).
        tolerance: half-width of the neutral band around zero. With the
            default of 0.0 only a score of exactly 0 is neutral.

    Returns:
        str: ``'negative'`` if ``result < -tolerance``, ``'positive'`` if
        ``result > tolerance``, otherwise ``'neutral'``.
    """
    if result < -tolerance:
        return 'negative'
    if result > tolerance:
        return 'positive'
    return 'neutral'

In [None]:
# Smoke test on one sentence. The scorer expects a list of words, so the
# sentence is lemmatised first; the classifier expects the numeric score, so
# the [score, words_found] pair is unpacked before classifying.
sample_words = lemmatisation('Uważam że zmuszanie dzieci do jedzenia warzyw jest karygodne oraz bestialskie')
result, sample_words_found = detectingEmotionsInText(sample_words)
print(getEmotionStatusOfText(result))

In [None]:
# Load the rated comments, then record how many there are and the mean of the
# 'Ocena' column.
comments = pd.read_csv('oceny.csv')
numberOfComments = len(comments)
mean = comments['Ocena'].mean()

In [None]:
%%time
comments_list = comments["Komentarz"].values
comment_rows = []
for comment in comments_list:
    # Lemmatise once and score the lemma list; passing the raw string made the
    # scorer iterate over single characters.
    lemmas = lemmatisation(comment)
    test = detectingEmotionsInText(lemmas)
    # `test` is the [score, words_found] pair; only the score is classified.
    comment_rows.append({'detectingEmotions': lemmas, 'counter': test, 'result': getEmotionStatusOfText(test[0])})
# Build the frame in one step: appending row by row copies the whole frame on
# every iteration, and DataFrame.append was removed in pandas 2.0.
newCommentsTable = pd.DataFrame(comment_rows)

In [None]:
# Put the analysis columns next to the original comments (DataFrame.join
# aligns the two frames on their index).
table = comments.join(newCommentsTable)
#table.to_csv('FilmWeb-detectingEmotions.csv', encoding='utf-8', index=False)

In [None]:
def prepereDataWithoutLemmatisation(maximumNegativeRate, mimimumPositiveRate,
                                    path='FilmWeb-commentsRates.csv', limit=10):
    """Load rated comments, keep the clearly rated ones and strip punctuation.

    Steps:
      1. read the CSV at ``path``;
      2. relabel every ``'neutral'`` value in the ``emotion`` column as
         ``'negative'``;
      3. keep rows whose ``rate`` is ``<= maximumNegativeRate`` or
         ``>= mimimumPositiveRate``;
      4. reset the index (the old index is kept as an ``index`` column);
      5. remove the characters ``!@#$'?.:,`` from the ``Text`` column.

    The original body applied step 5 to an unrelated ``result_df`` name and
    called ``Series.translate``, which does not exist; it now cleans the
    ``Text`` column of the frame that is returned.

    Args:
        maximumNegativeRate: highest rate still kept on the negative side.
        mimimumPositiveRate: lowest rate still kept on the positive side.
        path: CSV file to read.
        limit: number of leading rows to return; ``None`` returns every row.
            The default of 10 preserves the previous ``head(10)`` behaviour.

    Returns:
        pandas.DataFrame: the prepared rows.
    """
    comments = pd.read_csv(path)
    comments.loc[comments.emotion == 'neutral', 'emotion'] = "negative"
    keep = (comments.rate <= maximumNegativeRate) | (comments.rate >= mimimumPositiveRate)
    comments = comments[keep].reset_index()
    # regex=True is explicit because the default changed in pandas 2.0.
    comments['Text'] = comments['Text'].str.replace(r"[!@#$'?.:,]", '', regex=True)
    return comments if limit is None else comments.head(limit)

In [None]:
# Preview the prepared data, keeping rates <= 1 or >= 10.
prepereDataWithoutLemmatisation(1,10)

In [None]:
%%time
# Both thresholds are required arguments; (1, 10) matches the other calls in
# this notebook.
data = prepereDataWithoutLemmatisation(1, 10)
print ('dataset: ' + str(len(data)) + ' objects')
scored_rows = []
for index, comment in data.iterrows():
    # The scorer expects a list of words, so pass the whitespace-split tokens
    # rather than the joined string.
    words = comment['Text'].split()
    number, wordsFound = detectingEmotionsInText(words)
    # Classify the numeric score, not the [score, words_found] pair.
    emotion = getEmotionStatusOfText(number)
    scored_rows.append({"text": comment['Text'], "detectingEmotion": emotion, 'emotion': comment['emotion']})
# Build the frame once: row-by-row appends are quadratic and DataFrame.append
# was removed in pandas 2.0.
newCommentsTable = pd.DataFrame(scored_rows)

In [6]:
# Combine both comment dumps, keep the last occurrence of every duplicated
# comment, and renumber the rows.
data1 = pd.read_csv('FilmWeb-commentsRates.csv')
data2 = pd.read_csv('FilmWeb-commentsRates-moreMovies.csv')
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
result = pd.concat([data1, data2])

result = result.drop_duplicates(['comment'], keep='last')
# reset_index() keeps the previous index as an extra column.
result = result.reset_index()

In [9]:
# Drop the 'level_0' column (presumably added by the earlier reset_index()).
# The axis is passed by keyword because the positional form was removed in
# pandas 2.0, and a missing column is ignored so the cell can be re-run.
# The bare `result` expression that used to precede this line displayed
# nothing (it was not the last expression of the cell) and was removed.
result = result.drop('level_0', axis=1, errors='ignore')

In [10]:
%%time
import sys
import re
# Python 2 only: reload `sys` to get `setdefaultencoding` back, then make
# implicit str/unicode conversions use UTF-8. `reload` is not a builtin and
# `sys.setdefaultencoding` does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

def calculateOneRow(index,comment, emotion, rate):
    """Analyse one comment row and return the result as a dict.

    Punctuation is stripped from ``comment``, the remainder is lemmatised,
    the lemmas are scored with ``detectingEmotionsInText`` and the score is
    turned into a label with ``getEmotionStatusOfText``.

    Args:
        index: accepted because rows are unpacked positionally; not used.
        comment: raw comment text.
        emotion: label carried through unchanged.
        rate: rating carried through unchanged.

    Returns:
        dict: original text, word counts before and after lemmatisation, the
        lemma list, number of lexicon hits, the score, the detected label and
        the pass-through ``emotion`` and ``rate``.
    """
    stripped = re.sub("[!#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']", '', comment)
    lemmas = lemmatisation(stripped)
    score, hits = detectingEmotionsInText(lemmas)
    label = getEmotionStatusOfText(score)

    row = {}
    row["text"] = comment
    row['words'] = len(comment.split())
    row["lemmatisation"] = lemmas
    row['wordsAfterLemmatisation'] = len(lemmas)
    row['wordsFound'] = hits
    row["resultOfDetecting"] = score
    row["detectedEmotion"] = label
    row['emotion'] = emotion
    row['rate'] = rate
    return row

def parallelize(data, func):
    """Apply ``func`` to chunks of ``data`` in a process pool and concatenate.

    Reads the module-level ``partitions`` (number of chunks) and ``cores``
    (pool size) values, so both must be defined before calling.

    Args:
        data: object accepted by ``np.array_split``.
        func: callable applied to every chunk; its results must be accepted
            by ``pd.concat``.

    Returns:
        The concatenation of the per-chunk results.
    """
    print (partitions)
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    try:
        mapResult = pool.map(func, data_split)
    finally:
        # Release the worker processes even when a task raises; previously a
        # failure left the pool open.
        pool.close()
        pool.join()
    return pd.concat(mapResult)

cores = cpu_count()
partitions = cores

ddata = dd.from_pandas(result, npartitions=partitions)

# Keep the computed rows under their own name: the original assigned them back
# to `result`, which destroyed the input frame and made the cell fail when it
# was run a second time.
row_results = ddata.map_partitions(lambda df: df.apply((lambda row: calculateOneRow(*row)), axis=1)).compute(get=get)
# Each element is the dict returned by calculateOneRow. Build the frame in one
# step instead of appending row by row (quadratic; DataFrame.append was
# removed in pandas 2.0).
result_df = pd.DataFrame(list(row_results))

result_df.to_csv('FilmWeb-comments-analysis-neutralemotion-results-3.csv', encoding='utf-8', index=False)

KeyboardInterrupt: 

In [None]:
def get_length(text):
    """Return ``len(text)``."""
    return len(text)

def get_length_bin(length):
    """Return the index of the 10-wide bin that ``length`` falls into.

    Floor division is spelled out with ``//``: a plain ``/`` only floors
    integers on Python 2 and yields fractional "bins" on Python 3.
    """
    return length // 10

def get_score(result):
    """Return 1 when ``result`` is truthy, otherwise 0."""
    if result:
        return 1
    return 0

# `result_df` stores the predicted label under 'detectedEmotion' (the key used
# by calculateOneRow); the original cell read a non-existent
# 'detectingEmotion' column.
result_df.loc[result_df.detectedEmotion == 'neutral', 'detectedEmotion'] = "negative"
result_df['correct_guess'] = result_df['detectedEmotion'] == result_df['emotion']
# Vectorised columns instead of map(...): on Python 3 `map` returns an
# iterator, which is not a valid column value.
result_df['score'] = result_df['correct_guess'].astype(int)
result_df['text_length'] = result_df['text'].str.len()
# Floor division gives the 10-wide bin index on both Python 2 and 3.
result_df['text_length_bin'] = result_df['text_length'] // 10
grouped = result_df.groupby('text_length_bin', as_index=False)['score'].mean()

fig, ax = plt.subplots()
x = grouped.text_length_bin * 10
y = grouped.score
# Straight-line fit of the mean score against the bin start.
fit = np.polyfit(x, y, deg=1)
ax.plot(x, fit[0] * x + fit[1], color='red')
ax.scatter(x, y)
ax.set(xlabel='text length (binned by 10)', ylabel='mean score')

# plt.show() works with the inline backend; fig.show() warns there because
# the backend has no GUI.
plt.show()

In [None]:
# Print each heading followed by the number of rows with that outcome.
guess_headings = [
    ('Number of correct guess: ', True),
    ('Number of wrong guess: ', False),
]
for heading, outcome in guess_headings:
    print(heading)
    print(len(result_df.loc[result_df['correct_guess'] == outcome]))

In [None]:
# Number of rows where the detected label equals the reference label.
len(result_df.loc[result_df['correct_guess'] == True])

In [None]:
# Number of rows where the detected label differs from the reference label.
len(result_df.loc[result_df['correct_guess'] == False])

In [None]:
# Total number of analysed rows.
len(result_df)

In [None]:
# Run the lemmatiser on a short sample sentence.
words_list = lemmatisation('Idę na spacer teraz')

In [None]:
# Show the list returned by lemmatisation for the sample sentence.
words_list

In [None]:
def get_length(text):
    """Return ``len(text)``."""
    return len(text)

def get_length_bin(length):
    """Return the index of the 10-wide bin that ``length`` falls into.

    Floor division is spelled out with ``//``: a plain ``/`` only floors
    integers on Python 2 and yields fractional "bins" on Python 3.
    """
    return length // 10

def get_score(result):
    """Return 1 when ``result`` is truthy, otherwise 0."""
    if result:
        return 1
    return 0

def renderPlot(result_df):
    """Plot mean guess accuracy against text length, with a linear fit.

    The frame is modified in place (later cells read the added columns):
      * ``'neutral'`` values in ``detectedEmotion`` are relabelled
        ``'negative'``;
      * ``correct_guess`` -- whether ``detectedEmotion`` equals ``emotion``;
      * ``score`` -- ``correct_guess`` as 0/1;
      * ``text_length`` -- length of ``text``;
      * ``text_length_bin`` -- ``text_length`` floor-divided by 10.

    Args:
        result_df: DataFrame with ``detectedEmotion``, ``emotion`` and
            ``text`` columns.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    result_df.loc[result_df.detectedEmotion == 'neutral', 'detectedEmotion'] = "negative"
    result_df['correct_guess'] = result_df['detectedEmotion'] == result_df['emotion']
    # Vectorised columns instead of map(...): on Python 3 `map` returns an
    # iterator, which is not a valid column value.
    result_df['score'] = result_df['correct_guess'].astype(int)
    result_df['text_length'] = result_df['text'].str.len()
    # Floor division gives the 10-wide bin index on both Python 2 and 3.
    result_df['text_length_bin'] = result_df['text_length'] // 10
    grouped = result_df.groupby('text_length_bin', as_index=False)['score'].mean()

    fig, ax = plt.subplots()
    x = grouped.text_length_bin * 10
    y = grouped.score
    # Straight-line fit of the mean score against the bin start.
    fit = np.polyfit(x, y, deg=1)
    ax.plot(x, fit[0] * x + fit[1], color='red')
    ax.scatter(x, y)
    ax.set(xlabel='text length (binned by 10)', ylabel='mean score')

    # plt.show() works with the inline backend; fig.show() warns there
    # because the backend has no GUI.
    plt.show()

In [None]:
%%time
import sys
# Python 2 only: reload `sys` to get `setdefaultencoding` back, then make
# implicit str/unicode conversions use UTF-8. `reload` is not a builtin and
# `sys.setdefaultencoding` does not exist on Python 3.
reload(sys)
sys.setdefaultencoding('utf-8')

def calculateOneRow(index, emotion, rate, text):
    """Score one un-lemmatised comment row and return the result as a dict.

    Args:
        index: accepted because rows are unpacked positionally; not used.
        emotion: label carried through unchanged.
        rate: accepted because rows are unpacked positionally; not used.
        text: comment text; it is split on whitespace before scoring.

    Returns:
        dict: keys ``text``, ``detectedEmotion``, ``emotion`` and
        ``wordsFound``.
    """
    # The scorer iterates its argument, so a raw string would be scored
    # character by character; pass whitespace-separated tokens instead.
    [result, number] = detectingEmotionsInText(text.split())
    result_emotion = getEmotionStatusOfText(result)
    # The colon was inside the key string ('wordsFound:' number), which is a
    # syntax error.
    return {"text": text, "detectedEmotion": result_emotion, 'emotion': emotion, 'wordsFound': number}

def parallelize(data, func):
    """Apply ``func`` to chunks of ``data`` in a process pool and concatenate.

    Reads the module-level ``partitions`` (number of chunks) and ``cores``
    (pool size) values, so both must be defined before calling.

    Args:
        data: object accepted by ``np.array_split``.
        func: callable applied to every chunk; its results must be accepted
            by ``pd.concat``.

    Returns:
        The concatenation of the per-chunk results.
    """
    print (partitions)
    data_split = np.array_split(data, partitions)
    pool = Pool(cores)
    try:
        mapResult = pool.map(func, data_split)
    finally:
        # Release the worker processes even when a task raises; previously a
        # failure left the pool open.
        pool.close()
        pool.join()
    return pd.concat(mapResult)

cores = cpu_count()
partitions = cores

data = prepereDataWithoutLemmatisation(1,10)
ddata = dd.from_pandas(data, npartitions=partitions)


result = ddata.map_partitions(lambda df: df.apply((lambda row: calculateOneRow(*row)), axis=1)).compute(get=get)
# Each element is the dict returned by calculateOneRow. Build the frame in one
# step instead of appending row by row (quadratic; DataFrame.append was
# removed in pandas 2.0).
result_df = pd.DataFrame(list(result))

In [None]:
# Show the frame returned by prepereDataWithoutLemmatisation(1,10).
data

In [None]:
# Draw the accuracy plot. renderPlot also adds the 'correct_guess' column that
# the cells below read.
renderPlot(result_df)

In [None]:
# Save the analysed rows as UTF-8 CSV without the index column.
result_df.to_csv('FilmWeb-reviews-analysis.csv', encoding='utf-8', index=False)

In [None]:
# Count both outcomes first, then report them.
correct_total = len(result_df.loc[result_df['correct_guess'] == True])
wrong_total = len(result_df.loc[result_df['correct_guess'] == False])
print ('Number of correct guess: ' + str(correct_total))
print ('Number of wrong guess: ' + str(wrong_total))

In [None]:
# Number of rows where the detected label equals the reference label.
len(result_df.loc[result_df['correct_guess'] == True])

In [None]:
# Number of rows where the detected label differs from the reference label.
len(result_df.loc[result_df['correct_guess'] == False])

In [None]:
# Sample text used by the stemming and punctuation-stripping cells below.
text = "Ja tam wolą Zmierzch. ale film 50 twarzy nie był tragiczny. Zmierzch i 50 twarzy Grey'a. Tragiczny! Ale fajnie było, kolorowy, idę"

In [None]:
# Same steps as lemmatisation(), written inline so that the raw `stemming`
# value stays available for inspection.
parser = ListParser()
stemmer = Morfologik()
stemming = stemmer.stem([text], parser)
# Flatten three levels and keep items longer than one character.
words_list = [l for s in stemming for i in s for l in i if len(l) > 1]


In [None]:
# Show the raw value returned by stemmer.stem for the sample text.
stemming

In [None]:
# Show the lemmatisation() result for the same sample text.
lemmatisation(text)

In [None]:
# ASCII punctuation characters from the standard library.
charToRemove = string.punctuation

In [None]:
# Show the punctuation characters.
charToRemove

In [None]:
import re

# Delete every character matched by the punctuation class from the sample
# text (same pattern as in the first calculateOneRow definition).
line = re.sub("[!#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~']", '', text)

In [None]:
# Show the sample text after punctuation removal.
line

In [8]:
# Show the merged, de-duplicated comments frame.
# NOTE(review): this cell has execution count 8 but sits at the end of the
# notebook; its saved output still contains the 'level_0' column, so it was
# run before the cell that drops that column.
result

Unnamed: 0,level_0,index,comment,emotion,rate
0,0,0,Strzał w kolano dla wszystkich facetów ...,positive,6
1,1,1,Śmierdzące gówno w srebrnym celofaniku... ...,negative,2
2,2,2,Całkowite zaskoczenie O...,positive,6
3,3,3,"""Boże, zaraz się zrzygam"".. ...",negative,1
4,4,4,"Jak ""Zmierzch"" nie wiem...",positive,8
5,5,6,Może gdyby ktoś próbował uratować ten film prz...,negative,2
6,6,7,"Film dla ""ajfonowych"" lemingów z ujemnym IQ. ...",negative,1
7,7,8,"""Porno"" dla ubogich (mieszczek) ...",negative,4
8,8,9,Słabo Gra aktorska na p...,negative,3
9,9,10,To najbardziej antyfeministyczna rzecz w dziej...,negative,1
