In [128]:
import pandas as pd
import numpy as np
import re
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import string
import joblib
import function_library as fl
import math
import plotly.express as px
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from spacy.lang.pt.stop_words import STOP_WORDS
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import classification_report

In [75]:
pd.set_option('display.max_colwidth', 1000)

In [76]:
all_cus_go = pd.read_csv('F:\\project_csv\\ALL_PL_2022.csv', sep=';')

In [77]:
email_encoder = LabelEncoder()

all_cus_go['Requester Email'] = email_encoder.fit_transform(all_cus_go['Requester Email'])

In [78]:
all_cus_go.head()

Unnamed: 0,Ticket Id,Subject,Description,Source,Requester Email,Created Time,Agent interactions,Customer interactions,Tags
0,111345,[DE] [PYTANIE] B07R42W8J9,Klient ma pytanie do produktu B07R42W8J9: &quot;Jakie połączenie ma dachówka wentylacyjna?&quot; Link do zapytania: www.amazon.de/ask/question/Tx21NZNJ8PMI5JH?_encoding=UTF8&amp;authToken=967c969c-3f2e-47dd-b5e6-faeb39d052d1&amp;ref_=s_respond&amp;answerSolicitationSourceType=EMAIL&amp;answerSolicitationLanguage=de_DE Pozdrawiam Beata,Outbound Email,420,10 01 2022 10:47,4,4,"Partner_wysłane,rynek_de"
1,128291,[DE] 302-0014409-7188317 NIEKOMPLETNE,"Klient zgłasza, że w zamówieniu nie było lampki. Pozdrawiam, Jakub",Outbound Email,235,30 01 2022 09:48,3,2,"Partner_wysłane,rynek_de"
2,128325,[REKLAMACJA] 205-9594094-3558723 UK,"Dzień dobry, Klient napisał: Ten produkt dotarł, ale nie działa, nie świeci się. Bardzo rozczarowany, ponieważ kupiłem go dla mojego dziecka. Brak zauważalnych wad, wszystko wydawało się w porządku po otwarciu. Mały przycisk przesuwa się w dół i wydaje się OK, ale nic się nie dzieje, gdy go nacisnąć. To nie świeci się, co było cały punkt z nim, i bardzo rozczarowujące dla mojej córki. Pozdrawiam, Hania",Outbound Email,395,30 01 2022 10:38,2,2,"Partner_wysłane,rynek_uk,Przypomnienie"
3,131092,[DE] [ZAPYTANIE] 305-6969626-5165121,"Klient chciałby wiedzieć, czy materiał, z którego wykonane są klamki może być stosowany również na zewnątrz, np. na bocznych drzwiach wejściowych. Pozdrawiam Beata",Outbound Email,279,2 02 2022 16:10,4,4,"Partner_wysłane,rynek_de,Przypomnienie"
4,133983,[IT] 408-8392161-3066735 ZWROT,"https://www.dhl.com/it-en/home/tracking/tracking-parcel.html?submit=1&amp;tracking-id=CM332431606DE Paczka wraca do was, prosimy o refund dla klienta jak dojdzie Kasia",Outbound Email,279,7 02 2022 08:58,9,6,"Partner_wysłane,rynek_it,Przypomnienie1,Przypomnienie2"


1. Data cleaning, Feature engineering

In [79]:
format_date = "%d %m %Y %H:%M"

all_cus_go['Created Time'] = [datetime.strptime(row, format_date) for row in all_cus_go['Created Time']] 
all_cus_go['Subject'] = [str(row).lower() for row in all_cus_go['Subject']]
all_cus_go['Tags'] = [fl.split_string_by_char(row, ',')for row in all_cus_go['Tags']]
all_cus_go['rynek'] = [fl.extract_string_containing_word(row, 'rynek') for row in all_cus_go['Tags']]
all_cus_go['Order_number'] = [fl.extract_order_number(row1, row2, r"\d{3}-\d{7}-\d{7}") for row1, row2 in zip(all_cus_go['Subject'], all_cus_go['Description'])]
all_cus_go['title_list'] = [fl.split_string_by_char(row, ' ') for row in all_cus_go['Subject']]
all_cus_go['label'] = [fl.find_words_with_more_than_four_characters(row) for row in all_cus_go['Subject']]

In [80]:
all_cus_go.head()

Unnamed: 0,Ticket Id,Subject,Description,Source,Requester Email,Created Time,Agent interactions,Customer interactions,Tags,rynek,Order_number,title_list,label
0,111345,[de] [pytanie] b07r42w8j9,Klient ma pytanie do produktu B07R42W8J9: &quot;Jakie połączenie ma dachówka wentylacyjna?&quot; Link do zapytania: www.amazon.de/ask/question/Tx21NZNJ8PMI5JH?_encoding=UTF8&amp;authToken=967c969c-3f2e-47dd-b5e6-faeb39d052d1&amp;ref_=s_respond&amp;answerSolicitationSourceType=EMAIL&amp;answerSolicitationLanguage=de_DE Pozdrawiam Beata,Outbound Email,420,2022-01-10 10:47:00,4,4,"[Partner_wysłane, rynek_de]",rynek_de,,"[[de], [pytanie], b07r42w8j9]",[pytanie]
1,128291,[de] 302-0014409-7188317 niekompletne,"Klient zgłasza, że w zamówieniu nie było lampki. Pozdrawiam, Jakub",Outbound Email,235,2022-01-30 09:48:00,3,2,"[Partner_wysłane, rynek_de]",rynek_de,['302-0014409-7188317'],"[[de], 302-0014409-7188317, niekompletne]",[niekompletne]
2,128325,[reklamacja] 205-9594094-3558723 uk,"Dzień dobry, Klient napisał: Ten produkt dotarł, ale nie działa, nie świeci się. Bardzo rozczarowany, ponieważ kupiłem go dla mojego dziecka. Brak zauważalnych wad, wszystko wydawało się w porządku po otwarciu. Mały przycisk przesuwa się w dół i wydaje się OK, ale nic się nie dzieje, gdy go nacisnąć. To nie świeci się, co było cały punkt z nim, i bardzo rozczarowujące dla mojej córki. Pozdrawiam, Hania",Outbound Email,395,2022-01-30 10:38:00,2,2,"[Partner_wysłane, rynek_uk, Przypomnienie]",rynek_uk,['205-9594094-3558723'],"[[reklamacja], 205-9594094-3558723, uk]",[reklamacja]
3,131092,[de] [zapytanie] 305-6969626-5165121,"Klient chciałby wiedzieć, czy materiał, z którego wykonane są klamki może być stosowany również na zewnątrz, np. na bocznych drzwiach wejściowych. Pozdrawiam Beata",Outbound Email,279,2022-02-02 16:10:00,4,4,"[Partner_wysłane, rynek_de, Przypomnienie]",rynek_de,['305-6969626-5165121'],"[[de], [zapytanie], 305-6969626-5165121]",[zapytanie]
4,133983,[it] 408-8392161-3066735 zwrot,"https://www.dhl.com/it-en/home/tracking/tracking-parcel.html?submit=1&amp;tracking-id=CM332431606DE Paczka wraca do was, prosimy o refund dla klienta jak dojdzie Kasia",Outbound Email,279,2022-02-07 08:58:00,9,6,"[Partner_wysłane, rynek_it, Przypomnienie1, Przypomnienie2]",rynek_it,['408-8392161-3066735'],"[[it], 408-8392161-3066735, zwrot]",[zwrot]


In [81]:
rynek_encoder = LabelEncoder()

all_cus_go['rynek'] = all_cus_go['rynek'].str.replace('rynek_', '').str.replace('rynek ', '').str.replace('usa','us')
all_cus_go['rynek'] = rynek_encoder.fit_transform(all_cus_go['rynek']) + 1

In [82]:
unique_values = all_cus_go['label'].explode().tolist()
unique_values = [item for item in unique_values if not (isinstance(item, float) and math.isnan(item))]
uniq_vales, uniq_values_count = fl.list_of_unique_values_and_count_of_values(unique_values)
uniq_vales = [str(item) for item in uniq_vales]

In [83]:
nlp = spacy.load("pl_core_news_lg")

uniq_vales_sentence = ' '.join(uniq_vales)

doc = nlp(uniq_vales_sentence)

all_lem = []
all_words = []

for token in doc:
    all_words.append((token.text, token.lemma_))
    all_lem.append(token.lemma_)

dict_of_all_lemmas = {}
for lemma in set(all_lem):
    dict_of_all_lemmas[lemma] = fl.find_all_words_matching_lemma(lemma,all_words)

number_of_occurence = {}
for lemma_key in dict_of_all_lemmas:
    number_of_occurence_value = []
    for word in dict_of_all_lemmas[lemma_key]:
        number_of_occurence_value.append(uniq_values_count[word])
    number_of_occurence[lemma_key] = sum(number_of_occurence_value)

number_of_occurence_sorted = sorted(number_of_occurence.items(), key=lambda x:x[1], reverse=True)

In [84]:
label_regex ={"anulacja" : r'^[a-z]?anu?a?l|^[a-z]?nieanu?a?l',
            "reklamacja" :r'^rek|\w*k{1,}l{1,}a{1,}m{1,}a{1,}\w*',
            "wysyłka" : r'^[a-z]?wysy[łlk]|^[a-z]?niewysy?[łlk]',
            "refund" : r'^[a-z]?r[a-z]?efu',
            "przesyłka" : r'\w*esy[łl]\w*',
            "dostawa" : r'\w*dost[arw]',
            "opóźnienie" : r'^op[oó][zźżx]',
            "uszkodzony" : r'^uszko|\w*kodz\w*',
            "adres" : r'^[a-z]?ad{,2}res|^zaadre|^[a-z]?adr{,2}es',
            "ponowny" : r'^[a-z]?po{0,2}[a-z]?no',
            "paczka" : r'^paczk',
            "pytanie" : r'\w*yta\w*|^zap[yt]',
            "zwrot" : r'^zwr|\w*wrot\w*',
            "brak" : r'^[a-z]?brak',
            "niekompletny" : r'^niekom[ple]{1,}[ple]{1,}|\W*komopl',
            "faktura" : r'^fak',
            "inne" : r'^in[nm]|^i[nm]n'
}


In [85]:
dict_of_labels_and_matching_lemma_keys = {}

for label in label_regex:
    dict_of_labels_and_matching_lemma_keys[label] = []
    pattern = re.compile(label_regex[label])
    for label_key in dict_of_all_lemmas:
        if re.search(pattern, label_key):
            dict_of_labels_and_matching_lemma_keys[label].append(label_key)

not_working_keys, final_dict = fl.match_words_to_labels(dict_of_labels_and_matching_lemma_keys, dict_of_all_lemmas)
len(not_working_keys)

0

In [86]:
all_cus_go['final_label'] = [fl.find_label_in_list_of_words_from_title(final_dict, row) for row in all_cus_go['label']]
labels_distribution = pd.DataFrame(all_cus_go['final_label'].value_counts())
total_count = sum(labels_distribution['final_label'])
labels_distribution['final_label_%'] = [round(count/total_count*100,2) for count in labels_distribution['final_label']]
labels_distribution['final_label_%_cumulative'] = labels_distribution['final_label_%'].cumsum()
labels_distribution = labels_distribution.reset_index()
labels_distribution = labels_distribution.rename(columns={'final_label': 'label_count', 'index' : 'labels'})
labels_distribution.reset_index(drop=True, inplace=True)
labels_distribution

Unnamed: 0,labels,label_count,final_label_%,final_label_%_cumulative
0,dostawa,8193,22.6,22.6
1,zwrot,5877,16.21,38.81
2,anulacja,5421,14.95,53.76
3,reklamacja,4713,13.0,66.76
4,pytanie,2657,7.33,74.09
5,faktura,1859,5.13,79.22
6,wysyłka,1306,3.6,82.82
7,paczka,1255,3.46,86.28
8,adres,963,2.66,88.94
9,uszkodzony,961,2.65,91.59


2. Exploratory data analysis 

In [87]:
sunburst_df = pd.DataFrame(columns=['ids','label','parent','val', 'val_perc'])

root = {'ids' : 'labels_distribution','label' : 'labels_distribution','parent' : '','val' : all_cus_go['final_label'].count(), 'val_perc' : 100}
sunburst_df = sunburst_df.append(root, ignore_index=True) 

grouped_dict = labels_distribution[['labels', 'label_count']]
grouped_dict.set_index('labels', inplace=True)
grouped_dict = grouped_dict['label_count'].to_dict()

grouped = pd.DataFrame(all_cus_go['final_label'].value_counts())
grouped = grouped.reset_index()
grouped['ids'] = grouped['index']
grouped = grouped.rename(columns={'index' : "label", 'final_label' : 'val'})
grouped['parent'] = "labels_distribution"
grouped['val_perc'] = [fl.count_percentage_value_from_group(grouped_dict, [row1, row2]) for row1, row2 in zip(grouped['label'], grouped['val'])]
sunburst_df = pd.concat([sunburst_df, grouped], ignore_index=True)

grouped = all_cus_go.groupby(['final_label', 'rynek']).size().reset_index(name='val')
grouped['ids'] = [f"{final_label} - {rynek}" for rynek, final_label in zip(grouped["rynek"], grouped["final_label"])]
grouped = grouped.rename(columns={'rynek' : "label", 'final_label' : 'parent'})
grouped['val_perc'] = [fl.count_percentage_value_from_group(grouped_dict, [row1, row2]) for row1, row2 in zip(grouped['parent'], grouped['val'])]
sunburst_df = pd.concat([sunburst_df, grouped])

In [88]:
fig =go.Figure(go.Sunburst(
    ids = sunburst_df.ids,
    labels = sunburst_df.label,
    parents = sunburst_df.parent,
    values = sunburst_df.val_perc,
    branchvalues="total",
    insidetextorientation="radial",
))

fig.update_layout(
    title=dict(text="Labels distribution by country number", font=dict(size=30), automargin=True, yref='paper'),
    margin = dict(t=10, l=10, r=10, b=10),
    autosize=True,
    width=1000,
    height=1000,
)

fig.show()

On the label distribution plot, it is evident that over 50% of messages are distributed among three labels, and over 75% of messages are distributed among six labels. The majority of messages are directed to countries numbered 5, 7, and 8. Utilizing a model that covers these six major labels will likely result in the labeling of most incoming messages. It is recommended for future analysis to determine if any of major label messeges can be answered or partially addressed without the intervention of employees.

In [89]:
time_analys = pd.DataFrame(all_cus_go['Created Time'])
time_analys['day_of_month'] = time_analys['Created Time'].dt.day
time_analys['hours'] = time_analys['Created Time'].dt.hour
time_analys['working_hours'] = ["Working hours" if 8 <= row < 16 else "Not working hours" for row in time_analys['hours']]

df_time_analys = time_analys.groupby(['day_of_month', 'working_hours']).size().reset_index(name='val')
df_time_analys.sort_values(['day_of_month', 'working_hours'],  inplace=True)

time_analys_grouped = df_time_analys.groupby('day_of_month').sum()
time_analys_dict = time_analys_grouped['val'].to_dict()

df_time_analys['perc_val'] = [round(fl.count_percentage_value_for_label(time_analys_dict, [row1, row2]),2) for row1, row2 in zip(df_time_analys['day_of_month'], df_time_analys['val'])]

In [90]:
fig = px.bar(df_time_analys, x='day_of_month', y="perc_val", 
             color='working_hours', text="working_hours", text_auto='.3s',
             labels={'day_of_month':'Day of month', 'perc_val': '% of messages per day', "working_hours": "Hours"},
             title="Messages by day of month and working hours")

fig.show()

In [91]:
time_analys = pd.DataFrame(all_cus_go['Created Time'])
time_analys['day_of_week'] = time_analys['Created Time'].dt.dayofweek
time_analys['day_name'] = time_analys['Created Time'].dt.strftime('%A')
time_analys['hours'] = time_analys['Created Time'].dt.hour
time_analys['hours_group'] = pd.cut(time_analys['hours'], bins=3, labels=['0-7','8-15','16-24'])

df_time_analys = time_analys.groupby(['day_name', 'day_of_week', 'hours_group']).size().reset_index(name='val')
df_time_analys.sort_values(['day_of_week', 'hours_group'],  inplace=True)
df_time_analys = df_time_analys[df_time_analys['val'] != 0]

In [92]:
fig = px.bar(df_time_analys, x="day_name", y="val", color="hours_group", text="hours_group",text_auto=True,
             labels={'day_name':'Day', 'val': 'Number of messages per day', "hours_group": "Hours"},
             title="Messages by day of week and hours")

fig.show()

The plotted distribution of labels by days reveals that approximately 25% to 33% of incoming messages occur outside regular working hours. Remarkably, the message influx remains consistent, demonstrating minimal dependency on the day of the week. However, during weekends, there is an uptick to nearly 50% of the usual volume observed on regular working days. It's recommended to conduct further analysis specifically on weekend messages, focusing on identifying fields that could benefit from automation.

In [93]:
interactions = pd.DataFrame(all_cus_go[['Agent interactions', 'final_label']])

high_volume = ['dostawa', 'zwrot', 'anulacja', 'reklamacja']
medium_volume = ['pytanie', 'faktura', 'wysyłka', 'paczka']

interactions['volume'] = ['High volume' if row in high_volume else 'Medium volume' if row in medium_volume else 'Low volume' for row in interactions['final_label']]

upper_bound = interactions['Agent interactions'].quantile(0.975)

interactions_filtered = interactions[
    (interactions['Agent interactions'] <= upper_bound)
]

fig = px.box(interactions_filtered, x='final_label', y='Agent interactions',
    points='suspectedoutliers', color='volume',
    labels={'final_label':'Label', 'volume' : 'Messages Volume'},
    title="Agents interactions by labels and messages volume"
)
fig.show()

Upon reviewing agent interactions plots, it's evident that nine out of seventeen labels exhibit a count ranging from one to two interactions. The organization should prioritize investigating the 'reklamacja' and 'zwrot' message groups within the high-volume category to understand the reason behind the heightened interaction volume. Simultaneously, it's crucial to explore avenues to minimize the number of interactions within these groups.

Moreover, the 'Dostawa' and 'anulacja' groups merit thorough examination for potential automation possibilities. Should there be success in automating either the 'Dostawa' or 'anulacja' groups, a similar strategy should be implemented for the medium-volume category, where each label experiences interaction counts ranging between one to two.

Description preprocess for modeling 

In [94]:
polish_names = []

file_path = 'C:\\Users\\48575\\email_classification\\imiona_polskie.txt'

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        polish_names.append({"label" : "persName", "pattern" : [{"TEXT": line.lower().strip()}]})

patterns = [
    {"label" : "number", "pattern" : [{"LIKE_NUM" : True}]},
    {"label" : "web_site", "pattern" : [{"LIKE_URL" : True}]},
    {"label" : "email", "pattern" : [{"LIKE_EMAIL" : True}]},
    {"label" : "ASIN", "pattern" : [{"TEXT" : {"REGEX" : "(?i)b0[a-z0-9]{8}$"}}]}    
]
patterns.extend(polish_names)

ruler = nlp.add_pipe("entity_ruler", before='ner')
ruler.add_patterns(patterns)

In [95]:
def text_preprocessing(text: str) -> str:
    """Preprocesses text, converting words to entities or lemmas.

    This function takes a string as an argument, changes it to an NLP object, iterates through every word, 
    and checks if it is an entity or number. If it's a word, it changes it to a lemma. Every entity, number and word is appended to list. 
    At the end, it joins the list back into one string.

    Args:
        text (str): String to be processed.

    Returns:
        str: Processed text with lemmas and entities.
    """
    try:
        text = text.lower().replace("&quot;", ' " ')
        processed_tokens = []

        doc = nlp(text)

        for token in doc:  
            if token.is_digit:
                processed_tokens.append(token.ent_type_)
            elif len(token.lemma_) > 3:
                if token.ent_type_ != '':
                    processed_tokens.append(token.ent_type_)
                elif not token.is_punct:
                    processed_tokens.append(token.lemma_)

        processed_text = " ".join(processed_tokens)

        return processed_text
    
    except:
        return text

In [96]:
# df_to_model = all_cus_go[['Subject', 'Description', 'final_label']]
# df_to_model = df_to_model[df_to_model['final_label'].notnull() & df_to_model['Description'].notnull()]
# df_to_model['desc_transformed'] = df_to_model['Description'].apply(text_preprocessing)
# df_to_model.dropna(inplace=True)
# df_to_model.to_csv('F:\\project_csv\\df_to_model.csv')

df_to_model = pd.read_csv('F:\\project_csv\\df_to_model.csv')

In [97]:
df_to_model.dropna(inplace=True)

In [98]:
top_4_labels_and_other = labels_distribution['labels'].head(4)
top_4_labels_and_other = top_4_labels_and_other.to_list()
top_4_labels_and_other

df_to_model['final_label_grouped'] = ['other' if row not in top_4_labels_and_other else row for row in df_to_model['final_label']]
df_to_model['final_label_grouped'].value_counts()

other         12054
dostawa        8193
zwrot          5878
anulacja       5398
reklamacja     4711
Name: final_label_grouped, dtype: int64

tfidf

In [99]:
X = df_to_model['desc_transformed']
y = df_to_model['final_label_grouped']

In [100]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                    y_train,
                                                    test_size=0.2,
                                                    random_state=42)

In [101]:
tfidf = TfidfVectorizer()
tfidf.fit(X_train)

In [102]:
X_train_tf = tfidf.transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [103]:
X_train_tf.todense()

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [104]:
tfidf.get_feature_names_out()

array(['00', '0000', '000w', ..., 'żądanie', 'żądany', 'żądać'],
      dtype=object)

In [105]:
pd.DataFrame(X_train_tf.todense(),
             columns=tfidf.get_feature_names_out())

Unnamed: 0,00,0000,000w,00340434390970043528,00340434390980251142,00340434390999113059,00340434391003290278,00340434391112484421,00340434660812861611,00340434660812862144,...,żywo,żywotność,żółtaw,żółtawy,żółto,żółty,żółwi,żądanie,żądany,żądać
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23184,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23186,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23187,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [106]:
df_functions_score = pd.DataFrame(columns=['model_name', 'label', 'score', 'metric'])


In [107]:
# lr = LogisticRegression()
# lr.fit(X_train_tf, y_train)

# joblib.dump(lr, "F:\\Classification_models\\LogisticRegression")

In [108]:
lr_loaded = joblib.load("F:\\Classification_models\\LogisticRegression")


y_pred = lr_loaded.predict(X_test_tf)
print(classification_report(y_test, y_pred))

df_functions_score = pd.concat([df_functions_score, fl.model_score_to_df(y_test, y_pred, "Logistic_regression_basic")], ignore_index=True)

              precision    recall  f1-score   support

    anulacja       0.98      0.94      0.96      1135
     dostawa       0.74      0.85      0.79      1647
       other       0.79      0.75      0.77      2427
  reklamacja       0.74      0.73      0.74       867
       zwrot       0.85      0.78      0.81      1171

    accuracy                           0.81      7247
   macro avg       0.82      0.81      0.81      7247
weighted avg       0.81      0.81      0.81      7247



In [109]:
# rfc = RandomForestClassifier()
# rfc.fit(X_train_tf, y_train)

# joblib.dump(rfc, "F:\\Classification_models\\RandomForestClassifier_basic")

In [110]:
rfc_loaded = joblib.load("F:\\Classification_models\\RandomForestClassifier_basic")

y_pred = rfc_loaded.predict(X_test_tf)
print(classification_report(y_test, y_pred))

df_functions_score = pd.concat([df_functions_score, fl.model_score_to_df(y_test, y_pred, "RandomForestClassifier_basic")], ignore_index=True)

              precision    recall  f1-score   support

    anulacja       0.98      0.96      0.97      1135
     dostawa       0.76      0.88      0.82      1647
       other       0.84      0.78      0.81      2427
  reklamacja       0.75      0.76      0.76       867
       zwrot       0.85      0.80      0.83      1171

    accuracy                           0.83      7247
   macro avg       0.84      0.84      0.84      7247
weighted avg       0.84      0.83      0.83      7247



In [111]:
# mnb = MultinomialNB()
# mnb.fit(X_train_tf, y_train)

# joblib.dump(mnb, "F:\\Classification_models\\mnb_basic")

['F:\\Classification_models\\mnb_basic']

In [112]:
mnb_loaded = joblib.load("F:\\Classification_models\\mnb_basic")

y_pred = mnb_loaded.predict(X_test_tf)
print(classification_report(y_test, y_pred))

df_functions_score = pd.concat([df_functions_score, fl.model_score_to_df(y_test, y_pred, "mnb_basic")], ignore_index=True)

              precision    recall  f1-score   support

    anulacja       0.98      0.91      0.94      1135
     dostawa       0.70      0.81      0.75      1647
       other       0.70      0.74      0.72      2427
  reklamacja       0.75      0.58      0.66       867
       zwrot       0.79      0.71      0.75      1171

    accuracy                           0.76      7247
   macro avg       0.78      0.75      0.76      7247
weighted avg       0.76      0.76      0.76      7247



In [113]:
# svm = SVC()
# svm.fit(X_train_tf, y_train)

# joblib.dump(svm, "F:\\Classification_models\\SVC_basic")

In [114]:
svm_loaded = joblib.load("F:\\Classification_models\\SVC_basic")

y_pred = svm_loaded.predict(X_test_tf)
print(classification_report(y_test, y_pred))

df_functions_score = pd.concat([df_functions_score, fl.model_score_to_df(y_test, y_pred, "SVC_basic")], ignore_index=True)

              precision    recall  f1-score   support

    anulacja       0.98      0.94      0.96      1135
     dostawa       0.76      0.87      0.81      1647
       other       0.83      0.77      0.80      2427
  reklamacja       0.75      0.79      0.77       867
       zwrot       0.86      0.82      0.84      1171

    accuracy                           0.83      7247
   macro avg       0.84      0.84      0.84      7247
weighted avg       0.84      0.83      0.83      7247



In [117]:
df_functions_score

Unnamed: 0,model_name,label,score,metric
0,Logistic_regression_basic,anulacja,0.977022,precision
1,Logistic_regression_basic,anulacja,0.936564,recall
2,Logistic_regression_basic,anulacja,0.956365,f1-score
3,Logistic_regression_basic,dostawa,0.739936,precision
4,Logistic_regression_basic,dostawa,0.848209,recall
...,...,...,...,...
59,SVC_basic,reklamacja,0.774157,f1-score
60,SVC_basic,zwrot,0.862534,precision
61,SVC_basic,zwrot,0.819812,recall
62,SVC_basic,zwrot,0.840630,f1-score


In [124]:
fig = px.scatter(df_functions_score, y="label", x="score", color="model_name", symbol="metric")
fig.show()

PCA

In [115]:


# Sample text data
corpus = X

# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Transform text to TF-IDF feature matrix
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Apply TruncatedSVD (PCA equivalent for sparse data) for dimensionality reduction
n_components = 2000  # Number of components after PCA
svd = TruncatedSVD(n_components=n_components)
svd_result = svd.fit_transform(tfidf_matrix)


In [116]:
i == x

NameError: name 'i' is not defined

In [None]:
nlp = spacy.blank("pl")

In [None]:
df = pd.DataFrame(df_to_model[['Description', 'final_label']])
df.reset_index(inplace=True)

In [None]:
REGX_URL = r"(https?://|www\.)[^\s/$.?#].[^\s]*"

def preprocessing(text):
  text = text.lower()
  text = text.replace("&quot;", '"')

  text = re.sub(REGX_URL, ' ', text)

  tokens = [token.text for token in nlp(text)]

  tokens = [t for t in tokens if 
              t not in STOP_WORDS and 
              t not in string.punctuation]

  tokens = [t for t in tokens if not t.isdigit()]

  return " ".join(tokens)

In [None]:
df['Description_cleaned'] = df['Description'].apply(preprocessing)

In [None]:
y =pd.get_dummies((df['final_label']), prefix='label')
label = list(y.columns)
y = y.to_dict('index')

In [None]:
dataset = list(zip(df['Description_cleaned'], [{'cats':cats} for cats in y.values()]))
print(dataset[0])

In [None]:
train_data, test_data = train_test_split(dataset,
                                        test_size=0.25,
                                        random_state=42)
dev_data, test_data = train_test_split(test_data, 
                                       test_size=0.4,
                                       random_state=42)

print(f"Total: {len(dataset)} - Train:  {len(train_data)} - Dev: {len(dev_data)} - Test: {len(test_data)}")

In [None]:
def convert(data, outfile):
    db = spacy.tokens.DocBin()

    for text, labels in data:
        doc = spacy.tokens.Doc(nlp.vocab, words=text.split())
        doc.cats.update(labels['cats'])
        db.add(doc)

    db.to_disk(outfile)

convert(train_data, "./train.spacy")
convert(dev_data, "./dev.spacy")
convert(test_data, "./test.spacy")

In [None]:
textcat = nlp.add_pipe("textcat_multilabel")
for i in label:
    textcat.add_label(i)

In [None]:
# ! python -m spacy init fill-config base_config.cfg config.cfg

In [None]:
# ! python -m spacy train config.cfg --paths.train ./train.spacy  --paths.dev ./dev.spacy --output model --verbose

In [None]:
# output_dir = 'C:\\Users\\48575\\email_classification\\spaCy_model'
# nlp.to_disk(output_dir)

In [None]:
saved_model_dir = 'C:\\Users\\48575\\email_classification\\spaCy_model'
nlp = spacy.load(saved_model_dir)

In [None]:
! python -m spacy evaluate ./model/model-best/ ./test.spacy

In [None]:
def predict_spaCy (text):
    preprocessed_text = preprocessing(text)
    doc = nlp(preprocessed_text)
    label = max(doc.cats, key=doc.cats.get)
    return label

Optimization of Logistic Regression, Random Forest Classifier, and Naive Bayes

In [None]:
X = df['Description_cleaned']
y = df['final_label']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)

In [None]:
tfidf = TfidfVectorizer()
tfidf.fit(X_train)

In [None]:
X_train_tf = tfidf.transform(X_train)
X_test_tf = tfidf.transform(X_test)

In [None]:
X_train_tf.todense()

In [None]:


lr = LogisticRegression()
lr.fit(X_train_tf, y_train)

y_pred = lr.predict(X_test_tf)

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))

In [None]:
param_grid = {
    'penalty': ['l1','l2','None'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'fit_intercept': [True, False],
    'tol' : [0.001,0.01],
    'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
    'max_iter': [100, 200],
    'multi_class': ['auto', 'multinomial'],
}

In [None]:
# searcher=RandomizedSearchCV(lr, param_grid)
# searcher.fit(X_train_tf, y_train)

In [None]:
# joblib.dump(searcher, "LogisticRegression_randomizedSearchCV")

In [None]:
searcher = joblib.load("LogisticRegression_randomizedSearchCV")

In [None]:
print(searcher.best_score_)
print(searcher.best_params_)

In [None]:
lr = LogisticRegression(tol=0.01, solver='newton-cg', penalty='l2', multi_class='multinomial', 
                        max_iter=200, fit_intercept=True, C=10
                        )
lr.fit(X_train_tf, y_train)

y_pred = lr.predict(X_test_tf)


print(classification_report(y_test, y_pred))

In [None]:

rfc = RandomForestClassifier()
rfc.fit(X_train_tf, y_train)

y_pred = rfc.predict(X_test_tf)



print(classification_report(y_test, y_pred))

In [None]:
param_grid = {
    'n_estimators': [50, 125, 200],  # Range of values for n_estimators
    'max_depth': [None, 5, 13, 20],  # Including None for no max depth
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['auto', 'sqrt', 'log2', None],
    'criterion': ['gini', 'entropy']
}

In [None]:
# searcher=RandomizedSearchCV(rfc, param_grid)
# searcher.fit(X_train_tf, y_train)

In [None]:
# joblib.dump(searcher, "RandomForestClassifier_randomizedSearchCV")

In [None]:
searcher = joblib.load("LogisticRegression_randomizedSearchCV")

In [None]:
print(searcher.best_score_)
print(searcher.best_params_)

In [None]:
# rfc = RandomForestClassifier(n_estimators=200, min_samples_split=2, min_samples_leaf=1, max_features=None, max_depth=None, criterion='gini')
# rfc.fit(X_train_tf, y_train)

# y_pred = rfc.predict(X_test_tf)

# from sklearn.metrics import classification_report

# print(classification_report(y_test, y_pred))