In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

%matplotlib inline

In [None]:
# Read the training data; the path is relative to the notebook's working directory.
train_csv_path = 'train_dataset_train.csv'
train = pd.read_csv(train_csv_path)

<b>Посмотрим на таргеты</b><br>
    <b>Тематика</b>

In [None]:
# Number of rows for each distinct value of the topic column.
topic_counts = train["Тематика"].value_counts()
topic_counts

In [None]:
# Histogram over the per-topic row counts: shows how many topics are rare vs. frequent.
rows_per_topic = train["Тематика"].value_counts()
rows_per_topic.hist()

<b>Категория</b>

In [None]:
# Number of rows for each distinct value of the category column.
train.loc[:, "Категория"].value_counts()

<b>Ответственное лицо</b>

In [None]:
# Number of rows for each distinct value of the responsible-person column.
responsible_counts = train["Ответственное лицо"].value_counts()
responsible_counts

<b>Связь тематики и категории</b>

In [None]:
# Distinct (topic, category) pairs present in the data. The per-pair row count is
# discarded and replaced with 1.0 so that the pivot below acts as a 0/1 link matrix.
topic_category_keys = ["Тематика", "Категория"]
t = train.groupby(topic_category_keys)["id"].count().reset_index()
t["id"] = np.ones(len(t))
# The long-format pair table (not the pivoted matrix) is what gets exported here.
t.to_excel("cat_topic.xlsx")
cat_topic_links = t.pivot(index="Тематика", columns="Категория", values="id").fillna(0)
# Share of topics linked to more than one category
# (0 means every topic is assigned to exactly one category).
cat_topic_links.sum(axis=1).gt(1).mean()

<b>Связь категории и ответственного лица</b>

In [None]:
# Distinct (responsible person, category) pairs; the count is replaced with 1.0
# so the pivot is a 0/1 matrix of which person handles which category.
t2 = (
    train
    .groupby(["Ответственное лицо", "Категория"])["id"]
    .count()
    .reset_index()
)
t2["id"] = np.ones(len(t2))
cat_rel_links = t2.pivot(
    index="Ответственное лицо", columns="Категория", values="id"
).fillna(0)
# (number of responsible persons, number of categories)
cat_rel_links.shape

In [None]:
# Persist the responsible-person x category link matrix, then show it as the cell output.
cat_rel_links.to_excel(excel_writer="cat_rel_links.xlsx")
cat_rel_links

<b>Посмотрим, какой мусор есть в сообщениях</b>

In [None]:
# Raw message text of the row labelled 5 (label-based lookup on the frame's index).
train.loc[5, "Текст Сообщения"]

In [None]:
# Full record at integer position 4 (the fifth row), displayed as a Series.
fifth_row = train.iloc[4]
fifth_row

<b>Предобработка текстов</b><br>
<b>Убираю HTML</b>

In [None]:
import re

# Matches either an HTML/XML tag (non-greedy, within one line) or a character
# entity: named (&nbsp;), decimal (&#160;) or hexadecimal (&#xA0;).
# IGNORECASE so that mixed-case entity names (&Eacute;) and upper-case hex
# digits (&#xA0;) are removed as well.
CLEANR = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', re.IGNORECASE)
# Raw string: '\s' inside a plain literal is an invalid escape sequence and
# triggers a warning on modern Python versions.
SPACER = re.compile(r'\s+')


def cleanhtml(raw_html):
    """Replace HTML tags and character entities with spaces and collapse whitespace.

    Parameters
    ----------
    raw_html : str
        Message text that may contain markup.

    Returns
    -------
    str
        The text with every tag/entity replaced by a space and every run of
        whitespace squeezed into a single space. Leading and trailing spaces
        are not stripped.
    """
    without_markup = CLEANR.sub(' ', raw_html)
    return SPACER.sub(' ', without_markup)
# Derive the markup-free "text" column from the raw message column.
raw_messages = train["Текст Сообщения"]
train["text"] = raw_messages.apply(cleanhtml)

<b>Поиск адреса в сообщении</b>

<b>stemming</b>

In [None]:
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("russian")


def stemtext(input_text):
    """Stem every space-separated token of ``input_text`` with the Russian Snowball stemmer.

    The text is split on single space characters only, so empty tokens produced
    by consecutive, leading or trailing spaces are passed to the stemmer as well;
    the stemmed pieces are re-joined with single spaces.
    """
    stemmed_tokens = map(stemmer.stem, input_text.split(" "))
    return ' '.join(stemmed_tokens)
# Rebuild the stemmed column from the raw messages rather than from train.text itself:
# stemming the column's own previous output makes the cell non-idempotent (running it
# twice stems already-stemmed text). On a fresh top-to-bottom run the result is the same.
# Item assignment is used because attribute assignment only works for existing columns.
train["text"] = train["Текст Сообщения"].apply(cleanhtml).apply(stemtext)

<b>Lemmatisation</b>

<b>Предскажем тематику по тексту</b><br>
<b>TF-IDF</b>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer whose vocabulary is capped at 4500 features.
TFIDF_MAX_FEATURES = 4500
td = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)

In [None]:
# Learn the vocabulary and IDF weights from the preprocessed training texts.
td = td.fit(raw_documents=train["text"])

In [None]:
# Encode the training texts as TF-IDF rows, then convert the result to a dense array.
tfidf_matrix = td.transform(train["text"])
train_X = tfidf_matrix.toarray()

In [None]:
# Target labels: the category column.
# NOTE(review): the section heading speaks of predicting the topic, while the
# target selected here is the category — confirm which one is intended.
train_y = train.loc[:, 'Категория']

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

# The 'sag' solver shuffles the data while optimising, so without a fixed seed
# each run can end with slightly different coefficients and scores.
# random_state pins the shuffling to make the notebook reproducible.
clf = OneVsRestClassifier(LogisticRegression(solver='sag', random_state=42))

In [None]:
# Train the one-vs-rest classifier on the TF-IDF features and category labels;
# the value returned by fit() is kept under a separate name for the later cells.
clf_fitted = clf.fit(X=train_X, y=train_y)

In [None]:
# Per-class probabilities for the same rows the model was trained on.
train_predicted = clf_fitted.predict_proba(X=train_X)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Multiclass ROC AUC with one-vs-one averaging, computed on the training rows themselves,
# so it measures fit to the training set rather than performance on unseen data.
roc_auc_score(train_y, train_predicted, multi_class='ovo')