In [1]:
## for data
import json
import pandas as pd
import numpy as np

In [2]:
## language recognition
from langdetect import detect

In [3]:
from googletrans import Translator

In [4]:
## for processing
import re
import nltk

In [5]:
# for mutilang lemmatization
import simplemma

In [6]:
# for plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
## for bag-of-words
from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection, metrics,svm
## for word embedding
import gensim
import gensim.downloader as gensim_api

In [11]:
from imblearn.over_sampling import SMOTE

In [12]:
# данные из jira
data_jira = pd.read_csv('data_from_jira.csv')
data_jira.shape

(10965, 2)

In [13]:
data_jira.head()

Unnamed: 0,text,company
0,{html} **From:** E.Amez@achenbach.de \r\n **S...,ACH
1,{html} **From:** /O=EXCHANGELABS/OU=EXCHANGE A...,ACH
2,{html} **From:** /O=EXCHANGELABS/OU=EXCHANGE A...,ACH
3,{html} **From:** /O=EXCHANGELABS/OU=EXCHANGE A...,ACH
4,{html} **From:** E.Amez@achenbach.de \r\n **S...,ACH


In [14]:
data_jira.shape

(10965, 2)

In [15]:
# удаление строк с проектами demo, integrate
data_jira = data_jira[(data_jira.company!="DEMO") & (data_jira.company!="INTEGRATE")]

In [16]:
data_jira.shape

(9865, 2)

In [17]:
text_langs = list(map(lambda x:detect(x)=="en" ,data_jira.text))

In [18]:
data_jira_not_en = data_jira[~np.array(text_langs)]

In [531]:
data_jira_not_en["company"].reset_index().groupby("company").count().sort_values(by="index")

Unnamed: 0_level_0,index
company,Unnamed: 1_level_1
IKA,1
CCZTOZNX,1
CHDLTOSNX,1
CSCH,1
MB,1
...,...
TBN,33
IMA,36
VOS,36
TURCK,79


In [308]:
# удаление строк не с английским языком
data_jira_en = data_jira[text_langs]

In [304]:
# распределение компаний
n_companies_sorted = data_jira_en["company"].reset_index().groupby("company").count().sort_values(by="index")

In [354]:
n_companies_sorted

Unnamed: 0_level_0,index
company,Unnamed: 1_level_1
SMA,1
ZZZINTERNAL,1
SHURE,1
SIMENSICN,1
CORSITE,1
...,...
EPDM,242
SVC,284
ECTR,335
SIE,394


In [309]:
data_jira_en = data_jira_en[data_jira_en["company"].map(lambda x: n_companies_sorted.loc[x][0] > 3)]

In [381]:
n_companies_dict_en = dict(data_jira_en["company"].reset_index().groupby("company").count().sort_values(by="index"))["index"]

In [78]:
ps = nltk.stem.porter.PorterStemmer()
lem = nltk.stem.wordnet.WordNetLemmatizer()
# препроцессинг текста
def preprocess_text(text, flg_stemm=False, flg_lemm=True, lst_stopwords=None):
    # текст переводится в нижний регистр, оставляются только буквы
    text = re.sub('[^A-Za-z ]+', '', text.lower().strip())
    
    # токенизация - разбиение на слова
    lst_text = text.split()
    
    # удаление стоп слов
    if lst_stopwords is not None:
        lst_text = [word for word in lst_text if word not in 
                    lst_stopwords]
    
    # стемминг
    if flg_stemm:
        
        lst_text = [ps.stem(word) for word in lst_text]
         
    # леммантизация
    if flg_lemm:
        
        lst_text = [lem.lemmatize(word) for word in lst_text]
      
    
            
    # перевод обратно в текст
    text = " ".join(lst_text)
    return text

In [79]:
lst_stopwords = nltk.corpus.stopwords.words("english")
lst_stopwords

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [80]:
len(lst_stopwords)

179

In [327]:
data_jira_en["text_clean"] = data_jira_en["text"].apply(lambda x: 
          preprocess_text(x, flg_stemm=True, flg_lemm=False, 
          lst_stopwords=lst_stopwords))

In [409]:
## split dataset
dtf_train, dtf_test = model_selection.train_test_split(data_jira_en, test_size=0.33,shuffle=True,stratify=data_jira_en["company"])
## get target
y_train = dtf_train["company"].values
y_test = dtf_test["company"].values

In [437]:
dtf_train["company"].reset_index().groupby("company").count().sort_values(by="index")

Unnamed: 0_level_0,index
company,Unnamed: 1_level_1
LEAR,3
ZF,3
ADMINUI,3
THO,3
PHI,3
...,...
EPDM,162
SVC,190
ECTR,224
SIE,264


In [1]:
## Tf-Idf (advanced variant of BoW)
vectorizer = feature_extraction.text.TfidfVectorizer(max_features=10000, ngram_range=(1,2))

NameError: name 'feature_extraction' is not defined

In [411]:
corpus = dtf_train["text_clean"]

In [412]:
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [413]:
# отбор признаков для уменьшения размерности матрицы с помощью хи-квадрат

In [414]:
y = dtf_train["company"]
X_names = vectorizer.get_feature_names()
p_value_limit = 0.0001
dtf_features = pd.DataFrame()
for cat in np.unique(y):
    chi2, p = feature_selection.chi2(X_train, y==cat)
    dtf_features = dtf_features.append(pd.DataFrame(
                   {"feature":X_names, "score":p, "y":cat}))
    dtf_features = dtf_features.sort_values(["y","score"], 
                    ascending=[True,False])
    dtf_features = dtf_features[dtf_features["score"]<p_value_limit]
X_names = dtf_features["feature"].unique().tolist()

In [415]:
vectorizer = feature_extraction.text.TfidfVectorizer(vocabulary=X_names)
vectorizer.fit(corpus)
X_train = vectorizer.transform(corpus)
dic_vocabulary = vectorizer.vocabulary_

In [416]:
X_train.shape

(5736, 4978)

In [417]:
# naive bayes classifier

In [418]:
classifier = naive_bayes.MultinomialNB()

In [419]:
X_test = dtf_test["text_clean"].values

In [420]:
y_train

array(['FILL', 'ZZZM', 'EPDM', ..., 'PDX', 'CAR', 'ASM'], dtype=object)

In [421]:
y_train.shape

(5736,)

In [422]:
sampling = {}

In [423]:
for c in n_companies_dict_en.keys():
    sampling[c] = max(100,n_companies_dict_en[c])

In [424]:
sampling

{'ZF': 100,
 'LEAR': 100,
 'SDTOSNX': 100,
 'PHI': 100,
 'THO': 100,
 'METRO': 100,
 'ADMINUI': 100,
 'SUP': 100,
 'CSTOSNX': 100,
 'PSB': 100,
 'BKH': 100,
 'CSCH': 100,
 'DEMATIC': 100,
 'ORTOSNX': 100,
 'PADSTOSNX': 100,
 'FRE': 100,
 'XPLM': 100,
 'DEVOPS': 100,
 'LMO': 100,
 'WWERK': 100,
 'TEST': 100,
 'FINCAD': 100,
 'ISP': 100,
 'COM': 100,
 'INF': 100,
 'PULSONIXTOSNX': 100,
 'SLOAN': 100,
 'OTTOBOCK': 100,
 'PDX': 100,
 'ITEL': 100,
 'LEGO': 100,
 'MAG': 100,
 'STORENGY': 100,
 'CHDLTOSNX': 100,
 'REGA': 100,
 'ELE': 100,
 'DITTEL': 100,
 'MENTOREXTR': 100,
 'ASS': 100,
 'HOLCIM': 100,
 'AQUA': 100,
 'VB': 100,
 'KOIKE': 100,
 'OMM': 100,
 'ERBE': 100,
 'HYDR': 100,
 'LANDISGYR': 100,
 'LEO': 100,
 'KAL': 100,
 'EST': 100,
 'KOL': 100,
 'MG': 100,
 'FESTO': 100,
 'MATRIX': 100,
 'BEC': 100,
 'PROTELTOSNX': 100,
 'RHE': 100,
 'RUH': 100,
 'VRAIL': 100,
 'CNTRO': 100,
 'ADV': 100,
 'HAR': 100,
 'INSP': 100,
 'DOM': 100,
 'GLUTH': 100,
 'VAT': 100,
 'VT': 100,
 'SIM': 100,
 'HOL

In [425]:
oversampler=SMOTE(k_neighbors = 2,sampling_strategy=sampling)

In [426]:
X_train, y_train = oversampler.fit_resample(X_train, y_train)

  f"After over-sampling, the number of samples ({n_samples})"
  f"After over-sampling, the number of samples ({n_samples})"


In [427]:
X_train.shape

(19078, 4978)

In [435]:
X_train

<19078x4978 sparse matrix of type '<class 'numpy.float64'>'
	with 331308 stored elements in Compressed Sparse Row format>

In [440]:
## pipeline
model = pipeline.Pipeline([("vectorizer", vectorizer),
                           ("classifier", classifier)])

In [428]:

## train classifier
model["classifier"].fit(X_train, y_train)
## test

predicted = model.predict(X_test)
predicted_prob = model.predict_proba(X_test)

In [429]:
## Accuracy, Precision, Recall
accuracy = metrics.accuracy_score(y_test, predicted)

print("Accuracy:",  round(accuracy,2))

print("Detail:")
print(metrics.classification_report(y_test, predicted))
    

Accuracy: 0.74
Detail:
               precision    recall  f1-score   support

          ACH       0.91      0.77      0.83        13
        ACTIA       1.00      0.69      0.81        16
      ADMINUI       0.00      0.00      0.00         2
          ADS       0.96      0.89      0.93        28
          ADV       1.00      0.60      0.75         5
          ALP       1.00      0.69      0.82        13
         AQUA       0.67      0.50      0.57         4
           AR       0.85      0.65      0.73        17
         ARAS       1.00      0.20      0.33        40
          ASM       0.98      0.83      0.90        48
          ASS       1.00      1.00      1.00         4
         ASYS       1.00      0.84      0.91        19
          BAH       0.83      0.38      0.53        13
          BDT       1.00      0.86      0.92         7
          BEC       1.00      0.80      0.89         5
         BESI       1.00      1.00      1.00        11
          BKH       1.00      0.50      0

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [355]:
# SVM

In [430]:
classifier_svm = svm.SVC(kernel='linear',  probability=True)

In [431]:
## pipeline
model_svm = Pipeline([("vectorizer", vectorizer),
                           ("classifier", classifier_svm)])

In [432]:
# fit the training dataset on the classifier
model_svm["classifier"].fit(X_train,y_train)

SVC(kernel='linear', probability=True)

In [433]:
# predict the labels on validation dataset
predicted_svm = model_svm.predict(X_test)
print(predicted_svm)

['SVC' 'PTC' 'LAEMPE' ... 'DIONEX' 'GRUNDFOS' 'DM']


In [434]:

print(metrics.classification_report(y_test, predicted_svm))
    

  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

          ACH       0.77      0.77      0.77        13
        ACTIA       0.87      0.81      0.84        16
      ADMINUI       0.00      0.00      0.00         2
          ADS       0.96      0.89      0.93        28
          ADV       0.80      0.80      0.80         5
          ALP       0.73      0.62      0.67        13
         AQUA       1.00      0.50      0.67         4
           AR       0.73      0.65      0.69        17
         ARAS       0.58      0.47      0.52        40
          ASM       0.96      0.92      0.94        48
          ASS       1.00      1.00      1.00         4
         ASYS       0.95      0.95      0.95        19
          BAH       0.89      0.62      0.73        13
          BDT       1.00      0.86      0.92         7
          BEC       0.80      0.80      0.80         5
         BESI       1.00      1.00      1.00        11
          BKH       1.00      0.50      0.67         2
         

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [186]:
test_email = """
From: Walter, Martina
Sent: Friday, February 23, 2018 2:30 PM
To: support-ecad <support-ecad@xplm.com>
Cc: Simmat, Renato <renato.simmat@xplm.com>; Wilke, Beate <beate.wilke@xplm.com>
Subject: Urgent - Draeger Safety - Design Comparison
Importance: High

 

Hello,

 

Mr. Meves from Draeger Safety called and asked for urgent support re Design Comparison:

He has updated to Allegro 17.2 and Design Comparison does not work.

 

Other issue:

He need also a product package with release version which is installable.

Right now they need to pick up different installations.

 

I will also ask him to send log files or description if available.

 

BR Martina

 

Mit freundlichem Gruß / Best Regards

 

Martina Walter

Senior Business Administrator

 

XPLM Solution GmbH

Heinrich-Hertz-Str. 4

79211 Denzlingen, Germany

www.xplm.com

martina.walter@xplm.com

 

Office: +49 7666 90398-511 (neue Nummer) / Fax: +49 7666 90398-520

 

Registergericht / Commercial Register Dresden: HRB 24804

Geschäftsführer / Managing Directors: Rolf Pfenning, Karl Wachtel

 

Der Inhalt dieser E-Mail ist vertraulich und ausschließlich für den bezeichneten Adressaten bestimmt. Wenn Sie nicht der vorgesehene Adressat dieser E-Mail oder dessen Vertreter sein sollten, so beachten Sie bitte, dass jede Form der Kenntnisnahme, Veröffentlichung, Vervielfältigung oder Weitergabe des Inhalts dieser Mail unzulässig ist. Wir bitten Sie, sich in diesem Fall mit dem Absender der E-Mail in Verbindung zu setzen.

 

This e-mail message including any attachments is for the sole use of the intended recipient(s) and may contain privileged or confidential information. Any unauthorized review, use, disclosure or distribution is prohibited. If you are not the intended recipient, please immediately contact the sender by reply e-mail and delete the original message and destroy all copies thereof.
"""

In [187]:
test_email_pr = preprocess_text(test_email, flg_stemm=True, flg_lemm=False, 
          lst_stopwords=lst_stopwords)

In [189]:
model_svm.predict([test_email_pr])

array(['DRA'], dtype=object)

In [190]:
prob = model_svm.predict_proba([test_email_pr])

In [191]:
for i in range(len(prob[0])):
    print(prob[0][i],classes[i])

0.0009376505795686223 ACH
0.0020258835301701574 ACTIA
0.0003695894185841659 ADMINUI
0.036497911015276635 ADS
0.001081111481639749 ADV
0.00021281905617563402 AGILE
0.0015461523030676123 ALP
0.0011735385374061157 AQUA
0.0013655094312084932 AR
0.0073966340745679595 ARAS
0.002814444986074204 ASM
0.00038886928690409035 ASS
0.0052955704401456225 ASYS
0.0021246663251300512 BAH
0.0005317966315344788 BDT
0.00027330078009509836 BEC
0.0008603329810849167 BESI
0.0004570104007559562 BKH
0.002273635595468775 BMN
0.000949202992501644 BROOKS
0.00026076750460637504 BRUC
0.0040445916412572315 BU
0.004740963617813236 BULT
0.005391308532029699 CAR
0.002627760555896639 CDS
0.0004778748867065665 CHDLTOSNX
0.0012001851283771356 CNTRO
0.0005771200519720606 COM
0.0019091157031794399 CONTACT
0.0009387841833685117 CRE
0.00025701034336547246 CSCH
0.0005023640261864442 CSTOSNX
0.0011949931962610487 CUBIC
0.0004678590404589153 DEMATIC
0.00033537264536267494 DEVOPS
0.00590036585563324 DFF
0.003324394148877582 DIONEX

In [192]:
test_email1 = """
From: Voillat, Lionel [mailto:lionel.voillat@siemens.com] 
Sent: Monday, November 20, 2017 7:54 PM
To: support-ecad <support-ecad@xplm.com>
Cc: Hutsch, Marc <Marc.Hutsch@xplm.com>
Subject: TC_1.0_Altium - error on start

Hello,

I work at Siemens Industry Software as Presales (Mechatronics) for Switzerland / Austria and have a problem with your TC_1.0_Altium interface (see attachment). I installed and configured the interface (according to readme.txt) but it doesn't work. I see the "Integrate" window, then nothing more happens.

Seems to be the following problems:
*********
2017-11-20 14:27:10 DEBUG Caught StandardError: IVS is not available or incompatible:
Error: OpenTimeout
Message: "execution expired"
*********

I would be happy if you could call me back to solve my problem or email me a possible solution.

Yours sincerely.

Lionel Voillat
Presales PLM / CAD-Central / Eastern Europe
Global Sales & Services

Siemens Industry Software AG
Digital Factory Division
Product lifecycle management
Central & Eastern Europe
DF PL S&SE EU COE BD PSA
Freilagerstrasse 40
8047 Zurich, Switzerland
Tel.: +41 (0) 44 755 72 93
Fax: +41 (0) 44 755 72 70
Mobile: +41 (0) 79 346 68 28
lionel.voillat@siemens.com
www.siemens.com/plm 
More about this source textSource text required for additional translation information
Send feedback
Side panels
"""

In [193]:
test_email1_pr = preprocess_text(test_email1, flg_stemm=True, flg_lemm=False, 
          lst_stopwords=lst_stopwords)

In [206]:
model_svm.predict([test_email1_pr])

array(['SVC'], dtype=object)

In [199]:
model.predict([test_email1_pr])

array(['SIE'], dtype='<U13')

In [203]:
prob1 = model_svm.predict_proba([test_email1_pr])

In [204]:
classes = model_svm.classes_

In [205]:
list(zip(classes,prob1[0]))

[('ACH', 0.0007539705353529607),
 ('ACTIA', 0.0017698947679764465),
 ('ADMINUI', 0.00025235011265010483),
 ('ADS', 0.06015861571061604),
 ('ADV', 0.00036021273122330136),
 ('AGILE', 0.00015271859412346205),
 ('ALP', 0.0015276389948705046),
 ('AQUA', 0.0005747688014685657),
 ('AR', 0.0015956930350809007),
 ('ARAS', 0.0065456100935492),
 ('ASM', 0.011107898515143941),
 ('ASS', 0.00029905913891145706),
 ('ASYS', 0.0017598095114674983),
 ('BAH', 0.0015110567364629492),
 ('BDT', 0.000346764791068026),
 ('BEC', 0.00020977200784951384),
 ('BESI', 0.0008025546552235132),
 ('BKH', 0.0002520635465125888),
 ('BMN', 0.001419630054049455),
 ('BROOKS', 0.0014037570406107772),
 ('BRUC', 0.0001848198511668676),
 ('BU', 0.0034529608218419414),
 ('BULT', 0.0017445167546571048),
 ('CAR', 0.005119016954321599),
 ('CDS', 0.003571863064080705),
 ('CHDLTOSNX', 0.00035246016260321027),
 ('CNTRO', 0.0005810017675498952),
 ('COM', 0.0004756004289650517),
 ('CONTACT', 0.0010756748898355507),
 ('CRE', 0.000876854

In [171]:
# сохранение обученной модели

In [159]:
import pickle

In [160]:
Pkl_Filename = "company_name_model_svm.pkl"  

In [161]:
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model_svm, file)

In [163]:
# загрузка обученной модели
with open(Pkl_Filename, 'rb') as file:  
    Pickled_LR_Model = pickle.load(file)

prob2 = Pickled_LR_Model.predict_proba([test_email1_pr])

In [164]:
for i in range(len(prob2[0])):
    print(prob2[0][i],classes[i])

0.0007539705353529607 ACH
0.0017698947679764465 ACTIA
0.00025235011265010483 ADMINUI
0.06015861571061604 ADS
0.00036021273122330136 ADV
0.00015271859412346205 AGILE
0.0015276389948705046 ALP
0.0005747688014685657 AQUA
0.0015956930350809007 AR
0.0065456100935492 ARAS
0.011107898515143941 ASM
0.00029905913891145706 ASS
0.0017598095114674983 ASYS
0.0015110567364629492 BAH
0.000346764791068026 BDT
0.00020977200784951384 BEC
0.0008025546552235132 BESI
0.0002520635465125888 BKH
0.001419630054049455 BMN
0.0014037570406107772 BROOKS
0.0001848198511668676 BRUC
0.0034529608218419414 BU
0.0017445167546571048 BULT
0.005119016954321599 CAR
0.003571863064080705 CDS
0.00035246016260321027 CHDLTOSNX
0.0005810017675498952 CNTRO
0.0004756004289650517 COM
0.0010756748898355507 CONTACT
0.0008768540342267601 CRE
0.0001615739534600346 CSCH
0.00018460568419569843 CSTOSNX
0.0009654971343750382 CUBIC
0.00040257652870967465 DEMATIC
0.00023449466417548742 DEVOPS
0.0031741109833177115 DFF
0.0021788805615903183 DI

In [166]:
Pickled_LR_Model.predict([test_email1_pr])

array(['SVC'], dtype=object)

In [145]:
X_train

<6596x6135 sparse matrix of type '<class 'numpy.float64'>'
	with 139444 stored elements in Compressed Sparse Row format>

In [147]:
y_train.shape

(6596,)

In [652]:
nltk.__version__

'3.4.5'

In [654]:
import sklearn

In [655]:
sklearn.__version__

'0.22.1'

In [717]:
from sklearn.model_selection import StratifiedKFold,cross_validate

In [724]:
# stratified k fold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# enumerate the splits and summarize the distributions
for train_ix, test_ix in kfold.split(X, Y):
    # select rows
    X_train, X_test = X[train_ix], X[test_ix]
    Y_train, Y_test = Y[train_ix], Y[test_ix]
    # summarize train and test composition
    model_svm.fit(X_train,Y_train)
    predictions = model_svm.predict(X_test)
    print('Accuracy:',accuracy_score(Y_test,predictions))



Accuracy: 0.8178114086146682
Accuracy: 0.8084982537834692
Accuracy: 0.810826542491269
Accuracy: 0.8020954598370198
Accuracy: 0.7968568102444703


In [719]:
X = data_jira_en["text_clean"].values

In [720]:
Y = data_jira_en["company"].values

In [725]:
# cosine simularity

In [732]:
from sklearn.metrics.pairwise import cosine_similarity


In [733]:
X_train

<5755x5164 sparse matrix of type '<class 'numpy.float64'>'
	with 77405 stored elements in Compressed Sparse Row format>

In [788]:
test_email2 = data_jira_en["text"].values[3]

In [789]:
test_email2_pr = preprocess_text(test_email2, flg_stemm=True, flg_lemm=False, 
          lst_stopwords=lst_stopwords)

In [791]:
email_vector2 = vectorizer.fit_transform([test_email2_pr])

In [792]:
cosine_sim = cosine_similarity(email_vector2,X_train).flatten()

In [794]:
cosine_sim

array([0.        , 0.        , 0.02679404, ..., 0.        , 0.01795977,
       0.        ])

In [795]:
# 5 наиболее похожих текстов
docs_indexes = cosine_sim.argsort()[:-5:-1]
docs_indexes

array([3723, 1688,  987, 5045], dtype=int64)

In [797]:
cosine_sim[docs_indexes]

array([0.96702668, 0.74645723, 0.74645723, 0.74645723])

In [802]:
best_mail = dtf_train["text"].values[docs_indexes[0]]

In [806]:
best_mail

'{html} **From:** E.Amez@achenbach.de  \r\n **Sent:** 29-May-17 12:20:07 PM  \r\n **To:** support-ecad  \r\n **Subject:** [ACH] EPLAN: problem report  \r\n  \r\n\r\nPlease find problem related files in zip archive attached.\r\n\r\n{html}\r\n\r\n'

In [805]:
test_email2

'{html} **From:** E.Amez@achenbach.de  \r\n **Sent:** 29-May-17 11:38:42 AM  \r\n **To:** support-ecad  \r\n **Subject:** [ACH] EPLAN: problem report  \r\n  \r\n\r\nPlease find problem related files in zip archive attached.\r\n\r\n{html}\r\n\r\n'

In [210]:
from collections import Counter

In [211]:
Counter(y_train)

Counter({'FRECH': 349,
         'TBN': 349,
         'DXTOSNX': 349,
         'HUI': 349,
         'SVC': 349,
         'ZEISS': 349,
         'INF': 349,
         'SIE': 349,
         'PTC': 349,
         'OOTB': 349,
         'SMIT': 349,
         'DS': 349,
         'MITLL': 349,
         'VIT': 349,
         'DM': 349,
         'ECTR': 349,
         'CONTACT': 349,
         'TKP': 349,
         'SAU': 349,
         'STRAMA': 349,
         'CAR': 349,
         'BMN': 349,
         'ZSA': 349,
         'EWZ': 349,
         'ERCO': 349,
         'BRUC': 349,
         'EPDM': 349,
         'BAH': 349,
         'LMO': 349,
         'DFF': 349,
         'GRUNDFOS': 349,
         'FILL': 349,
         'ZEP': 349,
         'ASM': 349,
         'PROR': 349,
         'AQUA': 349,
         'SDR': 349,
         'BU': 349,
         'THY': 349,
         'VOS': 349,
         'TURCK': 349,
         'BULT': 349,
         'SDTOSNX': 349,
         'ZZZM': 349,
         'ARAS': 349,
         'LMD': 34

In [249]:
corpus = data_jira_en["text_clean"]

In [250]:
corpus.shape

(8580,)

In [251]:
X = vectorizer.transform(corpus)

In [252]:
X.shape

(8580, 5000)

In [253]:
Y = data_jira_en["company"].values

In [254]:
Y.shape

(8580,)

In [265]:
from sklearn.model_selection import StratifiedKFold,cross_val_score,KFold

In [256]:
oversampler=SMOTE(k_neighbors=1)

In [257]:
X, Y = oversampler.fit_resample(X, Y)

In [438]:
# stratified k fold
kfold = KFold(n_splits=5, shuffle=True)
# enumerate the splits and summarize the distributions
for train_ix, test_ix in kfold.split(X, Y):
    # select rows
    X_train, X_test = X[train_ix], X[test_ix]
    Y_train, Y_test = Y[train_ix], Y[test_ix]
    # summarize train and test composition
    model["classifier"].fit(X_train,Y_train)
    predictions = model["classifier"].predict(X_test)
    print('Accuracy:',metrics.accuracy_score(Y_test,predictions))

Accuracy: 0.9566254608544784
Accuracy: 0.957926697028844
Accuracy: 0.9586292902456216
Accuracy: 0.9607981347936887
Accuracy: 0.9596594914059535


In [441]:
cross_val_score(model["classifier"],X,Y,cv=5)

array([0.9465951 , 0.95738452, 0.96399718, 0.96367185, 0.96567804])

In [267]:
cross_val_score(model_svm["classifier"],X,Y,cv=5)

KeyboardInterrupt: 