To-do:
1. Текст был очищен только от одного мусорного элемента в качестве примера. Исслудйте данные через ноутбук или чере веб-интерфейс BigQuery на предмет других мусорных элементов в тексте, которые не несут в себе никакого особого смысла, а только создают шум в данных. Доработайте функцию очистки тектосвых данных, чтобы в нее можно было передать список ненужного мусора и разом выполнялась очистка
2. Проведите стратифицировнную кросс-валидуцию нейросетевого классификатора https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html
3. Поэксперементируйте с гиперпараметрами нейросетевого классификатора, постарайтесь повысить качество его работы
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
4. Попробуйте использовать не Word2Vec для получения векторого представления текста, а TF-IDF преобразование http://zabaykin.ru/?p=558 http://nlpx.net/archives/57
5. Попробуйте использовать более тонко настриваемые алгоритмы нейросетей, например из этого видео https://www.youtube.com/watch?v=cPkH1k3U1c8 

In [28]:
import warnings
warnings.filterwarnings('ignore')

from google.oauth2 import service_account
import pandas_gbq 

import numpy as np
import pandas as pd
import math as mt
import datetime as dt

from langdetect import detect
import re

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report,confusion_matrix
from gensim.models.word2vec import Word2Vec
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_curve,auc
import matplotlib.pyplot as plt

In [29]:
import dotenv
dotenv.load_dotenv(dotenv.find_dotenv('.env'))

CREDENTIALS = service_account.Credentials.from_service_account_info({
    "type": os.getenv("TYPE"),
    "project_id": os.getenv("PROJECT_ID"),
    "private_key_id": os.getenv("PRIVATE_KEY_ID"),
    "private_key": os.getenv("PRIVATE_KEY"),
    "client_email": os.getenv("CLIENT_EMAIL"),
    "client_id": os.getenv("CLIENT_ID"),
    "auth_uri": os.getenv("AUTH_URI"),
    "token_uri": os.getenv("TOKEN_URI"),
    "auth_provider_x509_cert_url": os.getenv("AUTH_PROVIDER_X509_CERT_URL"),
    "client_x509_cert_url": os.getenv("CLIENT_X509_CERT_URL"),
})

In [30]:
#funtion for getting fresh data from DWH for workload model
"""[summary]
Funtion for getting fresh data from BigQuery for workload scoring model
[description]
Credentials - google service account object with credentials data for project
[example]
Input: Credentials = credentials_object
Output: description	                                        channel	 category	category_flag
        \nChat transcript:\nVisitor: I want to buy wit...	chat	 ps	        1
        \nChat transcript:\nVisitor: hell i had a prob...	chat	 ps	        1
        \nChat transcript:\nVisitor: لا استطيع الشراء ...	 chat	  ps	     1
"""
def getDwhData(Credentials):
    statement_bigquery_sql = " ".join(["select description, channel, case",
                                       "when manual_category in ('payment_problem','how_to_pay','howtopay','how_to_play','paystation_error','ps_problem','ps_declined') then 'ps'",
                                       "else 'other'",
                                       "end as category,",
                                       "case",
                                       "when manual_category in ('payment_problem','how_to_pay','howtopay','how_to_play','paystation_error','ps_problem','ps_declined') then 0",
                                       "else 1",
                                       "end as category_flag",
                                       "from `xsolla_summer_school.customer_support`",
                                       "where manual_category is not null and",
                                       "manual_category <> '' and",
                                       "description is not null and",
                                       "description <> '' and",
                                       "channel is not null and",
                                       "channel <> '' and",
                                       "channel in ('chat','facebook')"])
    
    dataframe_bigquery = pandas_gbq.read_gbq(statement_bigquery_sql,project_id='findcsystem', credentials=Credentials, dialect='standard')

    return dataframe_bigquery


"""[summary]
Function for transform text to lower case
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = ["Text_1","Text_2"]
Output: ["text_1","text_2"]
"""
def lowerCase(Corpus):
    corpus = [i.lower().replace('\n',' ') for i in Corpus]
    return corpus


"""[summary]
Function for getting language of text
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = ["Text_1","Text_2"]
Output: ["en","ru"]
"""
def getTextLanguage(Corpus):
    txt_lang = []
    for txt in Corpus:
        try:
            lang = detect(txt)
        except Exception as e:
            lang = 'error'
        finally:
            txt_lang.append(lang)
    
    return txt_lang


"""[summary]
Function for tokenization text
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = ["word1 word2","word3 word4"]
Output: [["word1","word2"],["word3","word4"]]
"""  
def textToTokens(Corpus):
    corpus = [i.split() for i in Corpus]
    return corpus 


"""[summary]
Function for clear text after garbage
[description]
Corpus - list or array object, with text data
Substr - string, regular expression
[example]
Input: Corpus = [["word1","word2"],["word3","word4"]]
       Substr = r'word1
Output: [["word2"],["word3","word4"]]
"""  
def clearTextAfterGarbage(Corpus,Substr):
    return [[word for word in text if Substr not in word] for text in Corpus]


"""[summary]
Function for replace urls
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = [["http://some.url.ru/","word2"],["word3","word4"]]
Output: [["HTTP", "word2"],["word3","word4"]]
"""  
def clearTextFromUrls(Corpus):
    return [[re.sub("(^https?:\/\/.*)|(.*\.com.*)", "HTTPADDRESS", word) for word in text] for text in Corpus]


"""[summary]
Function for clear text from short words
[description]
Corpus - list or array object, with text data
WordLength - int, length having which word will be delete
[example]
Input: Corpus = [["word1","word1234"],["word123","word1234"]]
       WordLength = 6
Output: [["word1234"],["word123","word1234"]]
"""  
def clearShortWordsAsGarbage(Corpus, WordLength):
    return [[word for word in text if len(word) > WordLength] for text in Corpus]


"""[summary]
Function for clear from non-english text
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = [["word","table1"],["table","word1234"]]
Output: [["word"],["table"]]
"""  
def clearNonEnglish(Corpus):
    EN_ALPHABET = "qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM"
    return [[word for word in text if all(char in EN_ALPHABET for char in word)] for text in Corpus]


"""[summary]
Function for clear words from punctuation
[description]
Corpus - list or array object, with text data
[example]
Input: Corpus = [["word,","table!"],["table..","word1234"]]
Output: [["word,","table"],["table","word1234"]]
"""  
def clearPunctuation(Corpus):
    return [[re.sub(r"[^a-zA-Z]*", "", word) for word in text if re.search("[a-zA-Z]", word)] for text in Corpus]


"""[summary]
Build word vector by using pre-trained Word2Vec model
[description]
Size - lenght of vector
Word2Vec_Model - gensim object
"""  
def buildWordVector(Text,Size,Word2Vec_Model):
    vec = np.zeros(Size).reshape((1,Size))
    count = 0.

    for word in Text:
        try:
            vec += Word2Vec_Model[word].reshape((1,Size))
            count += 1.
        except KeyError:
            continue
    
    if count != 0:
        vec /= count
    
    return vec

RAWDATA 

In [31]:
import os
#getting data from dwh
if os.path.isfile('neuro.csv'):
    SupportRawDataframe = pd.read_csv('neuro.csv')
else:
    SupportRawDataframe = getDwhData(CREDENTIALS)
    SupportRawDataframe.to_csv('neuro.csv')
SupportRawDataframe.shape

(23450, 5)

DATA PREPROC

In [32]:
#transform text to lower case
corpus = SupportRawDataframe.description
corpus.astype('str')

corpus = lowerCase(corpus)

In [7]:
#getting language for text corpus
corpus_lang = getTextLanguage(corpus)

In [33]:
#new dataframe with texts in lower case, without /n symbol and with lang for text
SupportRawDataframe['description'] = corpus
SupportRawDataframe['lang'] = corpus_lang

In [34]:
#getting only en texts
SupportDataframe_eng = SupportRawDataframe[SupportRawDataframe.lang == 'en'][:]
SupportDataframe_eng

Unnamed: 0.1,Unnamed: 0,description,channel,category,category_flag,lang
0,0,chat transcript: visitor: i want to buy with ...,chat,ps,0,en
1,1,chat transcript: visitor: hell i had a proble...,chat,ps,0,en
2,2,chat transcript: visitor: لا استطيع الشراء وم...,chat,ps,0,en
3,3,chat transcript: visitor: im having trouble w...,chat,ps,0,en
4,4,chat transcript: visitor: hi ana: hello. how ...,chat,ps,0,en
...,...,...,...,...,...,...
23445,23445,"chat transcript: visitor: hi, i made a prucha...",chat,other,1,en
23446,23446,"chat transcript: visitor: hi, how long will i...",chat,other,1,en
23447,23447,chat transcript: visitor: i bought playerunkn...,chat,other,1,en
23448,23448,chat transcript: visitor: good day i took the...,chat,other,1,en


In [35]:
#text tekenization
tokenization = textToTokens(SupportDataframe_eng.description)
SupportDataframe_eng['description'] = tokenization

In [36]:
SupportDataframe_eng

Unnamed: 0.1,Unnamed: 0,description,channel,category,category_flag,lang
0,0,"[chat, transcript:, visitor:, i, want, to, buy...",chat,ps,0,en
1,1,"[chat, transcript:, visitor:, hell, i, had, a,...",chat,ps,0,en
2,2,"[chat, transcript:, visitor:, لا, استطيع, الشر...",chat,ps,0,en
3,3,"[chat, transcript:, visitor:, im, having, trou...",chat,ps,0,en
4,4,"[chat, transcript:, visitor:, hi, ana:, hello....",chat,ps,0,en
...,...,...,...,...,...,...
23445,23445,"[chat, transcript:, visitor:, hi,, i, made, a,...",chat,other,1,en
23446,23446,"[chat, transcript:, visitor:, hi,, how, long, ...",chat,other,1,en
23447,23447,"[chat, transcript:, visitor:, i, bought, playe...",chat,other,1,en
23448,23448,"[chat, transcript:, visitor:, good, day, i, to...",chat,other,1,en


In [12]:
texts = list(SupportDataframe_eng.description)

In [14]:
"""[summary]
Attemp to automatize garbage selection
[description]
garbage - list of strings, init words to delete
texts - list of strings, main texts
"""  
def train_function(garbage, texts):
    tests_clear = texts
    tests_clear = clearTextFromUrls(tests_clear)
    tests_clear = clearPunctuation(tests_clear)
    tests_clear = clearShortWordsAsGarbage(tests_clear, 3)
    for garbage_word in garbage:
        tests_clear = clearTextAfterGarbage(tests_clear, garbage_word)

    SupportDataframe_eng['description'] = tests_clear

    #list of unique categories
    unique_categories = np.unique(SupportDataframe_eng.category)
    descriptions = SupportDataframe_eng['description']
    categories = SupportDataframe_eng['category_flag']
    XTrain,XTest,YTrain,YTest = train_test_split(descriptions,
                                                 categories,
                                                 stratify = categories,
                                                 test_size = 0.2,
                                                 random_state = 40)

    #initialize Word2Vec model for embedding words to vectors
    NDim = 100
    Imdb_w2v = Word2Vec(size = NDim,min_count = 10)
    Imdb_w2v.build_vocab(XTrain)

    Imdb_w2v.train(XTrain,total_examples = Imdb_w2v.corpus_count,epochs = Imdb_w2v.epochs)

    #embedding training messages to vectors for neutral classifier
    TrainVecs = np.concatenate([buildWordVector(i,NDim,Imdb_w2v) for i in XTrain])

    Imdb_w2v.train(XTest, total_examples = Imdb_w2v.corpus_count, epochs = Imdb_w2v.epochs)
    TestVecs = np.concatenate([buildWordVector(i,NDim,Imdb_w2v) for i in XTest])

    TextClassifier = MLPClassifier(hidden_layer_sizes = (20,10), max_iter = 1000, random_state = 40)

    Scores = cross_val_score(TextClassifier, TrainVecs, YTrain, cv = 5, n_jobs=-1)
    
    result = {}
    
    result["mean_score"] = np.mean(Scores)
    
    TextClassifier.fit(TrainVecs,YTrain)

    pred = TextClassifier.predict(TestVecs)
    result["conf_matrix"] = confusion_matrix(YTest,pred)
    result["class_report"] = classification_report(YTest,pred)
    
    # select word correctly and incorrectly classified    
    tp_index = YTest[(YTest == 1) & (pred == 1)].index
    tn_index = YTest[(YTest == 0) & (pred == 0)].index
    fn_index = YTest[(YTest == 1) & (pred == 0)].index
    fp_index = YTest[(YTest == 0) & (pred == 1)].index

    # count word amount in each group
    bags = {
        "tp": (tp_index, {}),
        "tn": (tn_index, {}),
        "fp": (fp_index, {}),
        "fn": (fn_index, {}),
    }
    
    for key, (index, _dict) in bags.items():
        for sent in XTest[index]:
            for word in sent:
                if word not in _dict:
                    _dict[word] = 0
                _dict[word] += 1

    # select top 10 words which cause false-positive result 
    fp_words_sorted = [word[0] for word in sorted(bags["fp"][1].items(), key=lambda x: -x[1])]
    
    words_to_delete = set()
    count_words_to_delete = 10
    curr_count = 0
    
    for word in fp_words_sorted:
        if word not in bags["tn"][1]:
            words_to_delete.add(word)
            curr_count += 1
        if curr_count == count_words_to_delete:
            break
    
    
    # select top 10 words which cause false-negative result 
    fn_words_sorted = [word[0] for word in sorted(bags["fn"][1].items(), key=lambda x: -x[1])]
    
    words_to_keep_in = set()
    count_words_to_keep_in = 10
    curr_count = 0
    
    for word in fn_words_sorted:
        if word not in bags["tp"][1]:
            words_to_keep_in.add(word)
            curr_count += 1
        if curr_count == count_words_to_keep_in:
            break
    
    # include in garbage fp-words and exclude fn-words
    garbage |= words_to_delete
    garbage -= words_to_keep_in
    
    return garbage, result  

In [15]:
# WARNING! Executes for an hour+
# WARNING! Executes for an hour+
# WARNING! Executes for an hour+

# to delete from text
garbage = {'chat', 'transcript', 'visitor', } 

history = []

for i in range(20):
    print(f"No: {i}")
    garbage, result = train_function(garbage, texts)
    history.append((garbage, result))


In [353]:
print(history[-1][1]['class_report'])

              precision    recall  f1-score   support

           0       0.27      0.52      0.35       635
           1       0.84      0.65      0.73      2561

    accuracy                           0.62      3196
   macro avg       0.56      0.58      0.54      3196
weighted avg       0.73      0.62      0.66      3196



In [349]:
max([i[1]["class_report"].split('\n')[2].split()[3] for i in history])

'0.60'

### Резульат 2+ часов: все больше мусорных слов выбирается и к концу падает точность

TRAIN TEST SPLIT DATAFRAMES

In [16]:
#list of unique categories
unique_categories = np.unique(SupportDataframe_eng.category)
unique_categories

array(['other', 'ps'], dtype=object)

In [17]:
descriptions = SupportDataframe_eng['description']
categories = SupportDataframe_eng['category_flag']
XTrain,XTest,YTrain,YTest = train_test_split(descriptions,
                                             categories,
                                             stratify = categories,
                                             test_size = 0.2,
                                             random_state = 40)

FROM TEXTS TO VECTORS

In [18]:
#initialize Word2Vec model for embedding words to vectors
NDim = 100
Imdb_w2v = Word2Vec(size = NDim,min_count = 10)
Imdb_w2v.build_vocab(XTrain)

In [19]:
Imdb_w2v.train(XTrain,total_examples = Imdb_w2v.corpus_count,epochs = Imdb_w2v.epochs)

(8546025, 12531845)

In [20]:
#embedding training messages to vectors for neutral classifier
TrainVecs = np.concatenate([buildWordVector(i,NDim,Imdb_w2v) for i in XTrain])

In [21]:
TrainVecs

array([[ 0.50773859,  0.4288903 , -0.40971846, ..., -0.67132317,
         0.92387551, -0.16837773],
       [ 0.34783289,  0.43015106, -0.08246623, ..., -0.89399285,
         1.21067714, -0.4401564 ],
       [ 0.56721134,  0.19724233, -0.13991912, ..., -0.22544182,
         0.14983015,  0.17726802],
       ...,
       [ 0.58387975,  0.39541373, -0.01756927, ..., -0.74149539,
         0.71712793,  0.15895999],
       [ 0.89165027,  0.51317344, -0.06418519, ..., -0.5327883 ,
         0.57747167,  0.08593015],
       [ 0.28309082,  0.15320466, -0.34378317, ..., -0.76837949,
         0.74710338, -0.15036911]])

In [22]:
Imdb_w2v.train(XTest, total_examples = Imdb_w2v.corpus_count, epochs = Imdb_w2v.epochs)

(2100264, 3086620)

In [23]:
TestVecs = np.concatenate([buildWordVector(i,NDim,Imdb_w2v) for i in XTest])

CROSSVALIDATION AND BUILD CLASSIFIER

In [24]:
TextClassifier = MLPClassifier(hidden_layer_sizes = (20,10), max_iter = 1000, random_state = 40)

In [26]:
TextClassifier.fit(TrainVecs,YTrain)

MLPClassifier(hidden_layer_sizes=(20, 10), max_iter=1000, random_state=40)

In [27]:
pred = TextClassifier.predict(TestVecs)
print(confusion_matrix(YTest,pred))
print(classification_report(YTest,pred))

[[ 477  160]
 [ 238 2326]]
              precision    recall  f1-score   support

           0       0.67      0.75      0.71       637
           1       0.94      0.91      0.92      2564

    accuracy                           0.88      3201
   macro avg       0.80      0.83      0.81      3201
weighted avg       0.88      0.88      0.88      3201



In [175]:
# TextClassifier = MLPClassifier(hidden_layer_sizes = (1,), 
#                                learning_rate_init=0.01,
#                                max_iter = 15, 
#                                random_state = 40)
# >> scores 5 folds: [0.9, 0.91, 0.9, 0.91, 0.91]
# >> mean: 0.906
#
# Mark: enougth 1 neuron with 15 iterations to reach high accuracy (1% lower than max reached by me) with nice prec/recll/f1

TextClassifier = MLPClassifier(hidden_layer_sizes = (9, 3),
                               alpha=0.2,
                               learning_rate_init=0.002,
                               max_iter = 130, 
                               random_state = 40)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=40)
skf.get_n_splits(TrainVecs, YTrain)

folding_reports = []

for train_index, test_index in skf.split(TrainVecs, YTrain):
    XTrain_fold = TrainVecs[train_index]
    YTrain_fold = YTrain.iloc[train_index]
    
    Xtest_fold = TrainVecs[test_index]
    Ytest_fold = YTrain.iloc[test_index]
    
    TextClassifier.fit(XTrain_fold, YTrain_fold)
    fold_pred = TextClassifier.predict(Xtest_fold)
    
    folding_reports.append(classification_report(Ytest_fold, fold_pred))
    
    
    report = classification_report(Ytest_fold, fold_pred)
    print('\n'.join([row for i, row in enumerate(report.split('\n')) if i in [2,3,5] ]), '\n')

Scores = [float(report.split('\n')[5].split()[1]) for report in folding_reports]
# Scores = cross_val_score(TextClassifier, TrainVecs, YTrain, cv = 5, n_jobs=-1)
print(Scores)
print(np.mean(Scores))


           0       0.81      0.69      0.75       510
           1       0.93      0.96      0.94      2051
    accuracy                           0.91      2561 

           0       0.80      0.77      0.78       510
           1       0.94      0.95      0.95      2051
    accuracy                           0.92      2561 

           0       0.82      0.71      0.76       510
           1       0.93      0.96      0.95      2050
    accuracy                           0.91      2560 

           0       0.77      0.75      0.76       509
           1       0.94      0.95      0.94      2051
    accuracy                           0.91      2560 

           0       0.80      0.77      0.79       509
           1       0.94      0.95      0.95      2051
    accuracy                           0.92      2560 

[0.91, 0.92, 0.91, 0.91, 0.92]
0.914
