In [None]:
## for data
import json
import pandas as pd
import numpy as np

In [3]:
## language recognition
from langdetect import detect

In [193]:
from googletrans import Translator

In [4]:
## for processing
import re
import nltk

In [5]:
# for mutilang lemmatization
import simplemma

In [6]:
# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection, metrics,svm
## for word embedding
import gensim
import gensim.downloader as gensim_api

In [194]:
translator = Translator()

In [524]:
# данные из jira
data_jira = pd.read_csv('data_from_jira.csv')
data_jira.shape

(10965, 2)

In [391]:
data_jira.head()

Unnamed: 0,text,company
0,{html} **From:** E.Amez@achenbach.de \r\n **S...,ACH
1,{html} **From:** /O=EXCHANGELABS/OU=EXCHANGE A...,ACH
2,{html} **From:** /O=EXCHANGELABS/OU=EXCHANGE A...,ACH
3,{html} **From:** /O=EXCHANGELABS/OU=EXCHANGE A...,ACH
4,{html} **From:** E.Amez@achenbach.de \r\n **S...,ACH


In [419]:
data_jira.shape

(10945, 2)

In [525]:
# удаление строк с проектами demo, integrate
data_jira = data_jira[(data_jira.company!="DEMO") & (data_jira.company!="INTEGRATE")]

In [526]:
data_jira.shape

(9865, 2)

In [529]:
text_langs = list(map(lambda x:detect(x)=="en" ,data_jira.text))

In [530]:
data_jira_not_en = data_jira[~np.array(text_langs)]

In [531]:
data_jira_not_en["company"].reset_index().groupby("company").count().sort_values(by="index")

Unnamed: 0_level_0,index
company,Unnamed: 1_level_1
IKA,1
CCZTOZNX,1
CHDLTOSNX,1
CSCH,1
MB,1
...,...
TBN,33
IMA,36
VOS,36
TURCK,79


In [604]:
# удаление строк не с английским языком
data_jira_en = data_jira[text_langs]

In [533]:
# распределение компаний
n_companies_sorted = data_jira_en["company"].reset_index().groupby("company").count().sort_values(by="index")

In [606]:
# удаление строк с 1 значениями
data_jira_en = data_jira_en[data_jira_en["company"].map(lambda x: n_companies_sorted.loc[x][0] > 1)]

In [607]:
data_jira_en.shape

(8590, 2)

In [608]:
data_jira_en["company"].reset_index().groupby("company").count().sort_values(by="index")

Unnamed: 0_level_0,index
company,Unnamed: 1_level_1
ZSYS,2
KVH,2
SPAL,2
MED,2
EIS,2
...,...
EPDM,242
SVC,283
ECTR,334
SIE,393


In [609]:
clean_data = data_jira_en.copy()

In [610]:
ps = nltk.stem.porter.PorterStemmer()
lem = nltk.stem.wordnet.WordNetLemmatizer()
# препроцессинг текста
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    # текст переводится в нижний регистр, оставляются только буквы
    text = re.sub('[^A-Za-z ]+', '', text.lower().strip())
    
    # токенизация - разбиение на слова
    lst_text = text.split()
    
    # удаление стоп слов
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
    
    # стемминг
    if flg_stemm:
        
        lst_text = [ps.stem(word) for word in lst_text]
         
    # леммантизация
    if flg_lemm:
        
        lst_text = [lem.lemmatize(word) for word in lst_text]
      
    
            
    # перевод обратно в текст
    text = " ".join(lst_text)
    return text

In [611]:
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [551]:
len(lst_stopwords)

179

In [612]:
data_jira_en["text_clean"] = data_jira_en["text"].apply(lambda x: 
          preprocess_text(x, flg_stemm=True, flg_lemm=False, 
          lst_stopwords=lst_stopwords))

In [613]:
data_jira_en["company"].reset_index().groupby("company").count().sort_values(by="index")

Unnamed: 0_level_0,index
company,Unnamed: 1_level_1
ZSYS,2
KVH,2
SPAL,2
MED,2
EIS,2
...,...
EPDM,242
SVC,283
ECTR,334
SIE,393


In [614]:
# кол-во записей определенного класса
data_jira_en["company"].reset_index().groupby("company").count().sort_values(by="index").loc["DM"]

index    518
Name: DM, dtype: int64

In [615]:
# компании с письмами только не на английском
set(data_jira_not_en["company"]) -set(data_jira_en["company"]) 

{'AGILEEDM',
 'AMSC',
 'APL',
 'AQ',
 'ARRIVAL',
 'CCZTOZNX',
 'EXDA',
 'LECTRA',
 'LUK',
 'MYR',
 'RLAB',
 'ROE',
 'RUS',
 'ZZZINTERNAL'}

In [616]:
## split dataset
dtf_train, dtf_test = model_selection.train_test_split(data_jira_en, test_size=0.33,shuffle=True,stratify=data_jira_en["company"])
## get target
y_train = dtf_train["company"].values
y_test = dtf_test["company"].values

In [617]:
## Count (classic BoW)
vectorizer_bow = feature_extraction.text.CountVectorizer(max_features=10000, ngram_range=(1,2))

In [618]:
## Tf-Idf (advanced variant of BoW)
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

In [619]:
corpus = dtf_train["text_clean"]

In [620]:
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [621]:
X_train.shape

(5755, 10000)

In [622]:
corpus

4353    html kbakgrundfoscom sent supportecad subject ...
7968    exactli follow situat new project modul havea ...
2786    html oxplmsolutiongmbhexorgouexchang administr...
6843    html oexchangelabsouexchang administr groupfyd...
7740    html oxplmsolutiongmbhexorgouexchang administr...
                              ...                        
6095    html pavel sozonovski send andreklavehnlaempec...
5090    cae consult martina walter mailtomartinawalter...
9615    html stefanweberthyssenkruppcom sent nikolaeva...
282     html oexchangelabsouexchang administr groupfyd...
2759    html andreaskoppithermofishercom sent lychev m...
Name: text_clean, Length: 5755, dtype: object

In [623]:
X_names[::-1]

['zzzm',
 'subject zzzm',
 'npdi report',
 'hour',
 'npdi',
 'pm walter',
 'sv integr',
 'martina cc',
 'administrativegroupcnrecipientscndmitrykirillov send',
 'assanbaev',
 'deni assanbaev',
 'sv',
 'work hour',
 'patrick',
 'deni',
 'googl',
 'assanbaev subject',
 'lidia lychev',
 'wiethan dorothe',
 'ml',
 'denisassanbaevrdsoftwarecom',
 'spent',
 'dorothe cc',
 'attachedbest',
 'xxx',
 'walter martina',
 'enter',
 'id id',
 'id',
 'zsa',
 'subject zsa',
 'radmin',
 'dongl',
 'ip',
 'caegroup subject',
 'caegroup',
 'usb',
 'cae vpn',
 'licens valid',
 'pc',
 'registergericht freiburg',
 'server',
 'maier',
 'get order',
 'dmitryfor',
 'vrdsecadteam',
 'updat version',
 'book',
 'groupfydibohfspdltcnrecipientscncbcacfafdcccefeffdmitrykiri sent',
 'eplan integr',
 'nikolaeva subject',
 'config',
 'zeppelin',
 'zep',
 'wiederhold',
 'harder',
 'eplan xplm',
 'consol',
 'hide',
 'cintegrexelibintegratestartuprbin',
 'zw datei',
 'betreff eplan',
 'timei',
 'pdf',
 'document use',
 'th

In [624]:
# отбор признаков для уменьшения размерности матрицы с помощью хи-квадрат

In [625]:
y = dtf_train["company"]
X_names = vectorizer.get_feature_names()
p_value_limit = 0.0001
dtf_features = pd.DataFrame()
for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    dtf_features = dtf_features.append(pd.DataFrame(
                   {"feature":X_names, "score":p, "y":cat}))
    dtf_features = dtf_features.sort_values(["y","score"], 
                    ascending=[True,False])
    dtf_features = dtf_features[dtf_features["score"]<p_value_limit]
X_names = dtf_features["feature"].unique().tolist()

In [626]:
X_train

<5755x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 445208 stored elements in Compressed Sparse Row format>

In [627]:
len(X_names)

5164

In [628]:
for cat in np.unique(y):
    print("# {}:".format(cat))
    print("  . selected features:",
         len(dtf_features[dtf_features["y"]==cat]))
    print("  . top features:", ",".join(
dtf_features[dtf_features["y"]==cat]["feature"].values))
    print(" ")

# ACH:
  . selected features: 13
  . top features: send apr,profil hello,talk mr,pdf ausgab,pdffile,fwd aw,eingab,doc inno,metadaten,quantiti bom,subject ach,ach,achenbach
 
# ACTIA:
  . selected features: 32
  . top features: advancebest,xplm connector,cad doc,orcad ci,tsd,innov,connector hi,ac,core,item bom,gener,part cad,bom plm,ara sp,issu xplm,pcba,subject plm,supportbest,cad document,cad,core design,gaudel,gaudel jeanmichel,exporttyp,pong,johan pong,html johanpongeactiafr,johanpongeactiafr sent,johan,johanpongeactiafr,subject actia,actia
 
# ADMINUI:
  . selected features: 13
  . top features: popup,thedocu,buckhout cc,error error,editor,send david,ui,beta,log see,result get,error report,admin,env
 
# ADS:
  . selected features: 192
  . top features: destroy,lychev rdsoftwar,gmbh,synchron part,mail tomartinawaltercaeconsultcom,tomartinawaltercaeconsultcom web,updat librari,issu synchron,sent montag,montag april,sie,geschftsfhrereugen,geschftsfhrereugen kienzler,receiv email,infor

  . selected features: 24
  . top features: sml,slave,enthlt vertraulich,subassembl,unbefugt,wire,telefonisch,inform contain,email mr,bom structur,im profil,dmitri rupietta,elektrokonstrukt,elektrisch,elcad,mang,martin,regard martin,martin zeh,mr zeh,zeh martin,subject asi,zeh,asi
 
# BAH:
  . selected features: 25
  . top features: answer within,dayth,dayth intern,issuessystemat,issuessystemat send,new issuessystemat,within dayth,support cc,geht,dump avail,datasheet,pm support,singl part,gregoryliokumovich,octob pm,plugin,neuer,elcad,maschinenbau,profil server,string articleattribut,articleattribut,subject bah,bahmuel,bah
 
# BDT:
  . selected features: 16
  . top features: cpm,cadenc,err,sr,bereich,friday may,mailtomartinawaltercaeconsultcoms,walter mailtomartinawaltercaeconsultcoms,dem ftp,pcaprojectno,sergej,vassiliadi tryfon,vassiliadi,tryfon,subject bdt,bdt
 
# BEC:
  . selected features: 20
  . top features: maxim kirillov,file zip,problem relat,find problem,zip archiv,relat fil

  . top features: ag co,istemp,support dm,dm sp,pba draw,telecom,lescow,control,xplm solut,diod,drger vorschairm,vorschairm,dxdatabook entri,hilsen,venlig,venlig hilsen,alexei kirillov,isnew,isnew istemp,recipi pleas,regardsmaxim,hen krohn,agil plm,note draeger,regardsmaxim lychev,subbiah,subbiah venkatesh,confidenti inform,librarymerg,sebastian jemelka,cc ritter,volkmar subject,subject drger,venkatesh,tool,sent maxim,lychev xplm,hello maximw,lychev maxim,hen lebedev,uat,sheet,html andreasjemelkadraegercom,contain confidenti,norbert norbertzimmermanndraegercom,subject merger,ddraegerintegratedocinterfaceexelibintegratestartuprbin,senden,med,aw dm,projectdraeg,sollten sie,sebastian lebedev,schlicht peter,begemann marcu,andreasjemelkadraegercom sent,interfac,ihrem,sebastian cc,sync interfac,vertraulich informationen,dm projectdraeg,projectdraeg medic,cc zimmermann,agil,volkmarhussdraegercom cc,andov,hilsen volkmar,med venlig,regard med,dm integr,aufsichtsratschairman,aufsichtsratschairma

  . top features: sven,januari pm,rau subject,sent tuesday,variabl set,tuesday januari,liokumovich sent,storag area,pmto,gregori liokumovich,hello gregori,consult dorothe,liokumovich,cc sven,liokumovich cc,manag rdsoftwar,karlsruhegermani,karlsruhegermani offic,pdf log,liokumovich project,tuesday may,offic st,fax offic,dump best,regard gregori,storag,gregoryliokumovichrdsoftwarecom,gregori,httpwwwrdsoftwarecom,web page,httpwwwrdsoftwarecom origin,gmbhgablonzerstr,gmbhgablonzerstr karlsruhegermani,gregoryliokumovichrdsoftwarecom web,petersburg,st petersburg,page httpwwwrdsoftwarecom,dorothe wiethanmailtodorotheewiethancaeconsultcom,wiethanmailtodorotheewiethancaeconsultcom,wiethanmailtodorotheewiethancaeconsultcom sent,vista,font,pm gregori,subject ewz,ewz,anypdf
 
# FAMECC:
  . selected features: 16
  . top features: engin base,perform test,project plan,uom,access test,qualiti,xplm connector,classif hello,copi content,simmat subject,file server,eb,proto,marco,soap,antonio
 
# FESTO:
  

  . selected features: 17
  . top features: cycl,tuesday octob,prototyp,may von,eco,admincaeconsultcom alexej,liokumovich cc,board fabric,dmitrykirillov subject,sentthursday,integr agil,build data,run integr,problem run,integr cadenc,hp,thoon
 
# HUI:
  . selected features: 52
  . top features: cabinet,simmat cc,groet,vriendelijk groet,groet kind,met,partslist,zw file,netherlandsphon,report hi,last modifi,hyperlink,email attach,electr part,met vriendelijk,vriendelijk,modifi date,hh,wtpart,content email,locat,equip,regard met,max,ron,kind regardsmax,regardsmax,janssen,janssen thoma,ozen,ozen mehmet,schemk,schemk max,danni,subject hui,suijker danni,mehmet,html mschemkeshuismannlcom,suijker,mschemkeshuismannlcom sent,huisman,mschemkeshuismannlcom,html rhalvehuismannlcom,aad,vonk aad,richard,vonk,rhalvehuismannlcom sent,hui,rhalvehuismannlcom,halv richard,halv
 
# HWEBER:
  . selected features: 17
  . top features: itshould,nikolaeva dmitri,html renatosimmatxplmcom,renatosimmatxplmcom sent

  . top features: herstel,canyou,send kirillov,new item,true,marc hutsch,encount problem,dmitryi tri,problem instal,plm,vmware,soap,write,xplmordermanag,wert,result get,sdelatj,itemrevis,lizenzen,vpn access
 
# MANZ:
  . selected features: 26
  . top features: regardschristophvon,part classif,part attribut,procad,zusammen,frank,class attribut,belong,aktuel,christoph rupietta,document id,lieferung,hier,testen,gre,im profil,changenumb,struktur,gre best,regardsia,stringempti,ndf,best regardsia,fmarkertmanzcom,subject manz,manz
 
# MAREL:
  . selected features: 27
  . top features: robert,bom tab,upgrad eplan,attach pleas,tag,plm eda,merg plm,bom creation,integr connector,initenvbat,releas block,christoph rupietta,remov follow,send christoph,eplan problem,van,harri,found contentfilt,marelhttpcaeserverrdsbrowsemarel,contentfilt,bipecm,michel van,neerven,van neerven,michel,subject marel,marel
 
# MATRIX:
  . selected features: 16
  . top features: christophpleas find,lebedev send,find docume

  . selected features: 58
  . top features: dbg,russia email,pl,nado,digit,geometri,liokumovich,syntax,send kirillov,emerson,edif,rau simmat,origin design,sven ive,montag februar,arc,boctor,director ute,lorenzstra,ute,ute boctor,vatidno,vatidno de,stop error,shape,log,intern error,log creat,snx,ustidnr vatidno,pars,log log,arc locat,field radiu,geometri shape,line skip,radiu,radiu defin,reason mandatori,shape arc,skip reason,fatal,liokumovich sent,fatal error,sch file,ascii,log error,filelog,warn log,error log,log compil,log warn,log fatal,pars sch,ftl,bnl,log protelsnx,protelsnx
 
# PRV:
  . selected features: 10
  . top features: forc,that,georg,comparison,support request,compar,subject support,behav,document name,preview
 
# PSB:
  . selected features: 24
  . top features: rather,monday juli,let us,get inform,pm admincaeconsultcom,think make,fe,model,wouldb,bit differ,prototyp,monday septemb,pc hello,pleas commun,impress,hardcod,difficult,gave,focu,regard alexej,learn,alexej origin,

  . top features: updat modul,need anoth,eplan,access right,modellog updat,attributesdescriptionst,attributesdescriptionst pruefer,bezeichnungmsest,bezeichnungmsest bezeichnungst,bezeichnungst descriptionmsest,descriptionmsest,descriptionmsest itemst,item attributesdescriptionst,mslog updat,pruefer bezeichnungmsest,attach instal,mslog search,pruefer,upgrad eplan,herstel,docmachin log,document docmachin,foundlog look,docmachin,itemst foundlog,plmlog itemst,look itemst,itemst mslog,mslog,eplan hello,gerhard,itemst,witzel,dax,subject sor,sontag,sor,sortimat
 
# SPAL:
  . selected features: 24
  . top features: andrea kirillov,hear,writer,michel,refd,mario,bare,subject demo,look forward,partnumb,column,bare board,cam,gerber file,access system,pick place,requir pleas,pick,mount,print,core design,list requir,comma,dpdf
 
# SPRP:
  . selected features: 37
  . top features: design engin,ecad,plc,bare,david,bom issu,problem instal,hi dmitrypleas,issu hi,look forward,mcad,administr groupfydibohf

  . top features: kg,de inhalt,partassysmd,connectionlog remov,benachrichtigen,bitt benachrichtigen,log refd,read articl,plmclient,wichtig hinwei,finish log,propertieslog read,executionlog,modul executionlog,articl propertieslog,mit,fabric,co,registergericht regist,build project,empfnger bestimmt,close connectionlog,connectionlog,clean modul,regard html,benachrichtigen sie,michael betreff,hallo,executionlog close,refd tp,freundlichen,eciinsrelerr,regard ia,distributor,bestimmt sollten,court amtsgericht,den empfnger,ein kenntnisnahm,ge,ge werner,geschtzt oder,haft ge,hinwei dies,nutzung de,sonstig nutzung,iserlohn,board fabric,haben ist,hinwei,contractorslog,contractorslog read,read site,site contractorslog,docbestueckungsdatenbot,content hereof,error herebi,hereof,interfacegert werner,gmbh co,amtsgericht iserlohn,betrieb,busi secret,contain trade,die empfangen,empfangen,enthlt betrieb,haft,iserlohn hra,ist ihnen,kg per,per haft,trade busi,untersagt,untersagt bitt,weitergab oder,dimitry

In [814]:
vectorizer = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [815]:
X_train.shape

(5755, 5164)

In [816]:
# naive bayes classifier

In [817]:
classifier = naive_bayes.MultinomialNB()

In [818]:
X_test = dtf_test["text_clean"].values

In [819]:
## pipeline
model = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifier)])
## train classifier
model["classifier"].fit(X_train, y_train)
## test

predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [820]:
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test, predicted)

print("Accuracy:",  round(accuracy,2))

print("Detail:")
print(metrics.classification_report(y_test, predicted))
    

Accuracy: 0.49
Detail:


  average_options = (None, 'micro', 'macro', 'weighted', 'samples')


               precision    recall  f1-score   support

          ACH       1.00      0.23      0.38        13
        ACTIA       0.00      0.00      0.00        16
      ADMINUI       0.00      0.00      0.00         2
          ADS       0.78      0.50      0.61        28
          ADV       0.00      0.00      0.00         5
        AGILE       0.00      0.00      0.00         1
          ALP       0.00      0.00      0.00        13
     ALTTOSNX       0.00      0.00      0.00         1
         AQUA       0.00      0.00      0.00         4
           AR       1.00      0.18      0.30        17
         ARAS       1.00      0.20      0.33        41
          ASM       0.89      0.82      0.85        49
          ASS       0.00      0.00      0.00         4
         ASYS       1.00      0.45      0.62        20
          BAH       0.00      0.00      0.00        13
          BDT       0.00      0.00      0.00         8
          BEC       0.00      0.00      0.00         5
         

In [821]:
# SVM

In [822]:
classifier_svm = svm.SVC(kernel='linear',  probability=True)

In [823]:
## pipeline
model_svm = pipeline.Pipeline([("vectorizer", vectorizer),  
                           ("classifier", classifier_svm)])

In [824]:
# fit the training dataset on the classifier
model_svm["classifier"].fit(X_train,y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [825]:
# predict the labels on validation dataset
predicted_svm = model_svm.predict(X_test)
print(predicted_svm)

['ECTR' 'DM' 'BU' ... 'HOLCIM' 'ZSA' 'KOL']


In [826]:

print(metrics.classification_report(y_test, predicted_svm))
    

  average_options = (None, 'micro', 'macro', 'weighted', 'samples')


               precision    recall  f1-score   support

          ACH       1.00      0.85      0.92        13
        ACTIA       1.00      0.75      0.86        16
      ADMINUI       0.00      0.00      0.00         2
          ADS       1.00      0.79      0.88        28
          ADV       0.80      0.80      0.80         5
        AGILE       0.00      0.00      0.00         1
          ALP       1.00      0.54      0.70        13
     ALTTOSNX       0.00      0.00      0.00         1
         AQUA       1.00      0.50      0.67         4
           AR       0.85      0.65      0.73        17
         ARAS       0.43      0.63      0.51        41
          ASM       0.94      0.96      0.95        49
          ASS       1.00      0.50      0.67         4
         ASYS       1.00      0.95      0.97        20
          BAH       1.00      0.69      0.82        13
          BDT       1.00      0.75      0.86         8
          BEC       1.00      1.00      1.00         5
         

In [838]:
test_email = """
From: Walter, Martina
Sent: Friday, February 23, 2018 2:30 PM
To: support-ecad <support-ecad@xplm.com>
Cc: Simmat, Renato <renato.simmat@xplm.com>; Wilke, Beate <beate.wilke@xplm.com>
Subject: Urgent - Draeger Safety - Design Comparison
Importance: High

 

Hello,

 

Mr. Meves from Draeger Safety called and asked for urgent support re Design Comparison:

He has updated to Allegro 17.2 and Design Comparison does not work.

 

Other issue:

He need also a product package with release version which is installable.

Right now they need to pick up different installations.

 

I will also ask him to send log files or description if available.

 

BR Martina

 

Mit freundlichem Gruß / Best Regards

 

Martina Walter

Senior Business Administrator

 

XPLM Solution GmbH

Heinrich-Hertz-Str. 4

79211 Denzlingen, Germany

www.xplm.com

martina.walter@xplm.com

 

Office: +49 7666 90398-511 (neue Nummer) / Fax: +49 7666 90398-520

 

Registergericht / Commercial Register Dresden: HRB 24804

Geschäftsführer / Managing Directors: Rolf Pfenning, Karl Wachtel

 

Der Inhalt dieser E-Mail ist vertraulich und ausschließlich für den bezeichneten Adressaten bestimmt. Wenn Sie nicht der vorgesehene Adressat dieser E-Mail oder dessen Vertreter sein sollten, so beachten Sie bitte, dass jede Form der Kenntnisnahme, Veröffentlichung, Vervielfältigung oder Weitergabe des Inhalts dieser Mail unzulässig ist. Wir bitten Sie, sich in diesem Fall mit dem Absender der E-Mail in Verbindung zu setzen.

 

This e-mail message including any attachments is for the sole use of the intended recipient(s) and may contain privileged or confidential information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please immediately contact the sender by reply e-mail and delete the original message and destroy all copies thereof.
"""

In [839]:
test_email_pr = utils_preprocess_text(test_email, flg_stemm=True, flg_lemm=False, 
          lst_stopwords=lst_stopwords)

In [840]:
model_svm.predict([test_email_pr])

array(['DM'], dtype=object)

In [841]:
prob = model_svm.predict_proba([test_email_pr])

In [831]:
for i in range(len(prob[0])):
    print(prob[0][i],classes[i])

0.003996905354254787 ACH
0.0034672897290130805 ACTIA
0.001911553775351473 ADMINUI
0.006822001691244931 ADS
0.0027243974532785804 ADV
0.0010813522514441423 AGILE
0.003057473272169369 ALP
0.0004411693383703346 ALTTOSNX
0.003147224171368263 AQUA
0.002309956650885201 AR
0.013735763854535718 ARAS
0.006780152398915884 ASM
0.0007474882947346228 ASS
0.009356511208944143 ASYS
0.005756184476807511 BAH
0.0022346290298092273 BDT
0.000676310439833337 BEC
0.0018071577384684025 BESI
0.003199505114493518 BKH
0.00331689252773582 BMN
0.002610056903332479 BROOKS
0.0011542835067808585 BRUC
0.011267849479074055 BU
0.006816729834734308 BULT
0.008883850263055432 CAR
0.0041412845841350574 CDS
0.0009542469604379312 CHDLTOSNX
0.0003588051893523525 CHDLTOZNX
0.0022174437776888833 CNTRO
0.001734621768213786 COM
0.0036546005357495285 CONTACT
0.002537233201748176 CRE
0.0009568025980063251 CSCH
0.003303861040024308 CSTOSNX
0.0020871504988280915 CUBIC
0.0017826129688131068 DEMATIC
0.0013120519348186204 DEVOPS
0.00765

In [832]:
test_email1 = """
From: Voillat, Lionel [mailto:lionel.voillat@siemens.com] 
Sent: Monday, November 20, 2017 7:54 PM
To: support-ecad <support-ecad@xplm.com>
Cc: Hutsch, Marc <Marc.Hutsch@xplm.com>
Subject: TC_1.0_Altium - error on start

Hello,

I work at Siemens Industry Software as Presales (Mechatronics) for Switzerland / Austria and have a problem with your TC_1.0_Altium interface (see attachment). I installed and configured the interface (according to readme.txt) but it doesn't work. I see the "Integrate" window, then nothing more happens.

Seems to be the following problems:
*********
2017-11-20 14:27:10 DEBUG Caught StandardError: IVS is not available or incompatible:
Error: OpenTimeout
Message: "execution expired"
*********

I would be happy if you could call me back to solve my problem or email me a possible solution.

Yours sincerely.

Lionel Voillat
Presales PLM / CAD-Central / Eastern Europe
Global Sales & Services

Siemens Industry Software AG
Digital Factory Division
Product lifecycle management
Central & Eastern Europe
DF PL S&SE EU COE BD PSA
Freilagerstrasse 40
8047 Zurich, Switzerland
Tel.: +41 (0) 44 755 72 93
Fax: +41 (0) 44 755 72 70
Mobile: +41 (0) 79 346 68 28
lionel.voillat@siemens.com
www.siemens.com/plm 
More about this source textSource text required for additional translation information
Send feedback
Side panels
"""

In [833]:
test_email1_pr = preprocess_text(test_email1, flg_stemm=True, flg_lemm=False, 
          lst_stopwords=lst_stopwords)

In [834]:
model_svm.predict([test_email1_pr])

array(['SIE'], dtype=object)

In [835]:
prob1 = model_svm.predict_proba([test_email1_pr])

In [836]:
classes = np.unique(y_test)

In [837]:
for i in range(len(prob1[0])):
    print(prob1[0][i],classes[i])

0.0019692925729733785 ACH
0.003848901238983346 ACTIA
0.001571985257829976 ADMINUI
0.0049816111076507125 ADS
0.0014128355837232528 ADV
0.0008297759418025174 AGILE
0.002789299162343698 ALP
0.0003410210570338041 ALTTOSNX
0.0018662303118673503 AQUA
0.003891016017322934 AR
0.011262602936464127 ARAS
0.05409434294475272 ASM
0.0005869195081306493 ASS
0.004709754737372666 ASYS
0.003782911800345254 BAH
0.0017221462744394559 BDT
0.0005660697937911033 BEC
0.0019062965112020365 BESI
0.0032304399181135855 BKH
0.0022777005233462645 BMN
0.0031422759955554757 BROOKS
0.0009103412027239866 BRUC
0.007596056399967188 BU
0.0038522882482610794 BULT
0.007145691372177933 CAR
0.005499886291442115 CDS
0.0006902947594497416 CHDLTOSNX
0.0003162724923852529 CHDLTOZNX
0.0010746493033414828 CNTRO
0.0013773986725137276 COM
0.0021438328045232653 CONTACT
0.003295671093784862 CRE
0.0006513039753075182 CSCH
0.00102483060577782 CSTOSNX
0.0021205696896259884 CUBIC
0.0021029581195699163 DEMATIC
0.0012232227980360172 DEVOPS
0

In [171]:
# сохранение обученной модели

In [185]:
import pickle

In [846]:
Pkl_Filename = "company_name_model_svm.pkl"  

In [847]:
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model_svm, file)

In [848]:
# загрузка обученной модели
with open(Pkl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)

prob2 = Pickled_LR_Model.predict_proba([test_email_pr])

In [849]:
for i in range(len(prob2[0])):
    print(prob2[0][i],classes[i])

0.003996905354254787 ACH
0.0034672897290130805 ACTIA
0.001911553775351473 ADMINUI
0.006822001691244931 ADS
0.0027243974532785804 ADV
0.0010813522514441423 AGILE
0.003057473272169369 ALP
0.0004411693383703346 ALTTOSNX
0.003147224171368263 AQUA
0.002309956650885201 AR
0.013735763854535718 ARAS
0.006780152398915884 ASM
0.0007474882947346228 ASS
0.009356511208944143 ASYS
0.005756184476807511 BAH
0.0022346290298092273 BDT
0.000676310439833337 BEC
0.0018071577384684025 BESI
0.003199505114493518 BKH
0.00331689252773582 BMN
0.002610056903332479 BROOKS
0.0011542835067808585 BRUC
0.011267849479074055 BU
0.006816729834734308 BULT
0.008883850263055432 CAR
0.0041412845841350574 CDS
0.0009542469604379312 CHDLTOSNX
0.0003588051893523525 CHDLTOZNX
0.0022174437776888833 CNTRO
0.001734621768213786 COM
0.0036546005357495285 CONTACT
0.002537233201748176 CRE
0.0009568025980063251 CSCH
0.003303861040024308 CSTOSNX
0.0020871504988280915 CUBIC
0.0017826129688131068 DEMATIC
0.0013120519348186204 DEVOPS
0.00765

In [145]:
X_train

<6596x6135 sparse matrix of type '<class 'numpy.float64'>'
	with 139444 stored elements in Compressed Sparse Row format>

In [147]:
y_train.shape

(6596,)

In [652]:
nltk.__version__

'3.4.5'

In [654]:
import sklearn

In [655]:
sklearn.__version__

'0.22.1'

In [850]:
from sklearn.model_selection import StratifiedKFold,cross_val_score

In [724]:
# stratified k fold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# enumerate the splits and summarize the distributions
for train_ix, test_ix in kfold.split(X, Y):
    # select rows
    X_train, X_test = X[train_ix], X[test_ix]
    Y_train, Y_test = Y[train_ix], Y[test_ix]
    # summarize train and test composition
    model_svm.fit(X_train,Y_train)
    predictions = model_svm.predict(X_test)
    print('Accuracy:',accuracy_score(Y_test,predictions))



Accuracy: 0.8178114086146682
Accuracy: 0.8084982537834692
Accuracy: 0.810826542491269
Accuracy: 0.8020954598370198
Accuracy: 0.7968568102444703


In [719]:
X = data_jira_en["text_clean"].values

In [720]:
Y = data_jira_en["company"].values

In [725]:
# cosine simularity

In [732]:
from sklearn.metrics.pairwise import cosine_similarity


In [733]:
X_train

<5755x5164 sparse matrix of type '<class 'numpy.float64'>'
	with 77405 stored elements in Compressed Sparse Row format>

In [788]:
test_email2 = data_jira_en["text"].values[3]

In [789]:
test_email2_pr = preprocess_text(test_email2, flg_stemm=True, flg_lemm=False, 
          lst_stopwords=lst_stopwords)

In [791]:
email_vector2 = vectorizer.fit_transform([test_email2_pr])

In [792]:
cosine_sim = cosine_similarity(email_vector2,X_train).flatten()

In [794]:
cosine_sim

array([0.        , 0.        , 0.02679404, ..., 0.        , 0.01795977,
       0.        ])

In [795]:
# 5 наиболее похожих текстов
docs_indexes = cosine_sim.argsort()[:-5:-1]
docs_indexes

array([3723, 1688,  987, 5045], dtype=int64)

In [797]:
cosine_sim[docs_indexes]

array([0.96702668, 0.74645723, 0.74645723, 0.74645723])

In [802]:
best_mail = dtf_train["text"].values[docs_indexes[0]]

In [806]:
best_mail

'{html} **From:** E.Amez@achenbach.de  \r\n **Sent:** 29-May-17 12:20:07 PM  \r\n **To:** support-ecad  \r\n **Subject:** [ACH] EPLAN: problem report  \r\n  \r\n\r\nPlease find problem related files in zip archive attached.\r\n\r\n{html}\r\n\r\n'

In [805]:
test_email2

'{html} **From:** E.Amez@achenbach.de  \r\n **Sent:** 29-May-17 11:38:42 AM  \r\n **To:** support-ecad  \r\n **Subject:** [ACH] EPLAN: problem report  \r\n  \r\n\r\nPlease find problem related files in zip archive attached.\r\n\r\n{html}\r\n\r\n'

In [857]:
cross_val_score(model,X,Y,cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=1))



array([0.49476257, 0.50611247, 0.49842822])

In [858]:
cross_val_score(model_svm,X,Y,cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=1))



array([0.80027933, 0.79811387, 0.78833392])