In [128]:
import re
from operator import itemgetter

import nltk
import pandas as pd
import pymorphy2
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages this notebook relies on: the VADER lexicon
# (SentimentIntensityAnalyzer), WordNet (WordNetLemmatizer) and the stop-word
# corpus (stopwords.words). When a package is already present NLTK just
# reports it as up-to-date.
nltk.download('vader_lexicon')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\12\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\12\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\12\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [129]:
def preprocess(s, lemmatizer=None, stop_words=None):
    """Normalise raw reply texts into space-separated lemmas.

    Each element of ``s`` is stripped of ``@mentions`` and links, lower-cased,
    reduced to the letters ``a-z``, split on whitespace, lemmatized word by
    word and filtered against the stop-word collection.

    Parameters
    ----------
    s : pandas.Series
        Texts to preprocess. Non-string values (for example NaN produced by
        an empty spreadsheet cell) yield an empty string instead of raising.
    lemmatizer : object, optional
        Any object exposing ``lemmatize(word) -> str``. Defaults to a fresh
        ``WordNetLemmatizer``.
    stop_words : collection of str, optional
        Lemmas to discard. Defaults to NLTK's English stop-word list.

    Returns
    -------
    pandas.Series
        Object-dtype series of cleaned strings carrying the index of ``s``.
    """
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    if stop_words is None:
        stop_words = set(stopwords.words('english'))

    # NOTE(review): the cleaned text is later scored with VADER; lower-casing,
    # stripping punctuation and dropping stop words may remove cues that
    # analyzer uses -- confirm this level of cleaning is intended.
    def _clean(reply):
        if not isinstance(reply, str):
            return ''
        # remove account mentions
        reply = re.sub(r'@\w+', ' ', reply)
        # remove links; \S* swallows the whole URL, including '-', '?', '='
        # and similar characters that would otherwise survive as stray words
        reply = re.sub(r'https?:\S*', ' ', reply)
        # keep letters only
        reply = re.sub(r'[^a-z]', ' ', reply.lower())
        # lemmatize each word once, then drop stop words
        lemmas = (lemmatizer.lemmatize(word) for word in reply.split())
        return ' '.join(lemma for lemma in lemmas if lemma not in stop_words)

    # Build the result positionally so duplicate index labels are handled and
    # the series is created with an explicit string-friendly dtype.
    return pd.Series([_clean(reply) for reply in s], index=s.index, dtype=object)


def getClass(df, analyzer=None):
    """Label every preprocessed text with its dominant polarity.

    For each value in ``df['preprocessed']`` the analyzer's polarity scores
    are computed, the ``compound`` entry is discarded (it is not one of the
    candidate classes), and the remaining key with the largest value becomes
    the class. Ties resolve to the first such key in the order the analyzer
    returns them.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'preprocessed'`` column. It is modified in place.
    analyzer : object, optional
        Any object exposing ``polarity_scores(text) -> dict``. Defaults to a
        fresh ``SentimentIntensityAnalyzer`` so the function no longer
        depends on a global ``vader`` variable being defined elsewhere.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with two columns added or overwritten:
        ``'class'`` (winning key) and ``'score'`` (its value).
    """
    if analyzer is None:
        analyzer = SentimentIntensityAnalyzer()

    labels = []
    scores = []
    for text in df['preprocessed']:
        # Non-string cells (e.g. NaN) are scored as empty text.
        res = analyzer.polarity_scores(text if isinstance(text, str) else '')
        # 'compound' is an aggregate, not a class candidate
        res.pop('compound', None)
        label, score = max(res.items(), key=itemgetter(1))
        labels.append(label)
        scores.append(score)

    # Positional whole-column assignment: works for empty frames and for
    # duplicate index labels, and avoids per-row .loc writes.
    df['class'] = labels
    df['score'] = scores
    return df
        
# Load the spreadsheet and discard its first eight columns.
df = pd.read_excel('data.xlsx').pipe(lambda raw: raw.drop(columns=raw.columns[:8]))
# Clean the 'text' column, then attach the 'class' and 'score' columns.
df = getClass(df.assign(preprocessed=preprocess(df['text'])))
# Persist the labelled frame and preview rows whose score is non-zero.
df.to_excel('result.xlsx')
df.loc[df['score'].ne(0)].head(15)

Unnamed: 0,text,preprocessed,class,score
0,@Microsoft All d more reasons why @Microsoft i...,reason better place work,neu,0.508
1,@Microsoft @NCCEducation Great idea,great idea,pos,0.804
3,@Microsoft Wow! That's nice!,wow nice,pos,1.0
4,@Microsoft Microsoft - If not already any chan...,microsoft already chance could make office ava...,neu,0.833
6,@Microsoft üëè people over profit,people profit,pos,0.744
7,@Microsoft Guilt trip,guilt trip,neg,0.677
8,@Microsoft Well Done to you Microsoft for this...,well done microsoft initiative please life ret...,neu,0.44
9,@Microsoft In this really hard situation you s...,really hard situation make office program free...,neu,0.546
10,@Microsoft Something seriously wrong with @Mic...,something seriously wrong tier two support thr...,neu,0.483
11,@Microsoft Then why aren't vendors who are qua...,vendor quarantined sick getting paid seems sil...,neu,0.577


# Описание и другие возможности
Для получения классов я использовал VADER, т.к. эта библиотека заточена для обработки текстов из социальных сетей. Какие ещё есть варианты?
* TextBlob. Пожалуй, самая простая вещь, встроена в nltk. Но по качеству чуть хуже VADER'а
* FastText - отличная библиотека для генерации word2vec. Однако требует обучения, т.е. ручной постановки меток на текстах. В силу малого количества времени не подходит.
Если бы времени было больше, я бы попробовал обучить сеточку на уже маркированных открытых наборах текстов.
# Workflow
В принципе, он виден по функциям выше. Коротко перечислим основные моменты:
* Скачивание данных. К сожалению, API Твиттера доступно только после регистрации приложения. Я оставил заявку, но они рассматриваются в ручном режиме в течение 2-3 дней. Поэтому я использовал сторонние решения для скачивания ответов на твиты, однако эти решения имеют сильные ограничения. В принципе, т.к. я использовал готовую модель, а не обучал что-то, большое количество данных и не нужно.
* Предобработка. Удаление ников, ссылок, перевод в нижний регистр
* Лемматизация. С английским языком это несложно
* Удаление стоп-слов: они нейтральны по смыслу, но добавляют шума.
* Получение класса
