In [None]:
# Import libraries
import re
import string
from collections import Counter
from datetime import datetime

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import spacy
from spacy.lang.fr.stop_words import STOP_WORDS
from unidecode import unidecode
from wordcloud import WordCloud

# Download NLTK data if needed
# nltk.download()

# Load spaCy's small French pipeline; later cells use it for lemmatization.
# The per-document character limit is set to 50 million because those cells
# pass a whole period's comments to `nlp` as one single string.
nlp = spacy.load("fr_core_news_sm")
nlp.max_length = 50_000_000

# Import and process data


In [None]:
# Load the raw CRM export, keeping only the columns used by the analysis.
crm_cols = ["DATE_CONTACT", "CODIF_NIVEAU_1", "CODIF_NIVEAU_2", "COMMENTAIRES_AGENT"]
crm = pd.read_csv(
    "./datas/V_CODIF_CRM.txt", on_bad_lines="skip", sep="\t", usecols=crm_cols
)

# Parse the contact date once and derive a "YYYY-MM" month label from it.
# The label stays a string so it survives a CSV round-trip unchanged and can
# be compared directly with "YYYY-MM" month labels.
crm["DATE_CONTACT"] = pd.to_datetime(crm["DATE_CONTACT"])
crm["year_month"] = crm["DATE_CONTACT"].dt.strftime("%Y-%m")

# drop=True: the pre-sort index carries no information; without it the old
# index would be kept as an extra "index" column (and end up in the export).
crm = crm.sort_values(by=["DATE_CONTACT"]).reset_index(drop=True)

# Normalization of comments: strip accents and lowercase.
# Missing comments are NaN floats, which `unidecode` cannot handle, so they
# are replaced by empty strings before any text processing.
crm["COMMENTAIRES_AGENT"] = (
    crm["COMMENTAIRES_AGENT"].fillna("").astype(str).apply(unidecode).str.lower()
)

# Remove STOP words
stopwords_worldbrain = pd.read_csv(
    "./words_lists/STOP_WORDS_WordlBrain_updated.txt", header=None
)
stopwords_worldbrain = stopwords_worldbrain[0].tolist()

# A set gives constant-time membership tests; the list itself is kept because
# other cells use `stopwords_worldbrain` as a list.
stopword_lookup = set(stopwords_worldbrain)
crm["COMMENTAIRES_processed"] = crm["COMMENTAIRES_AGENT"].apply(
    lambda comment: " ".join(
        word for word in comment.split() if word not in stopword_lookup
    )
)

# Remove COMMON words
remove_words = (
    pd.read_csv("./words_lists/remove_words.csv").dropna()["word_to_remove"].tolist()
)
full_words_remove = stopwords_worldbrain + remove_words

# Make sure every text column holds plain strings (missing codes become "nan").
text_cols = [
    "CODIF_NIVEAU_1",
    "CODIF_NIVEAU_2",
    "COMMENTAIRES_AGENT",
    "COMMENTAIRES_processed",
]
crm[text_cols] = crm[text_cols].astype(str)

# Export processed data to CSV (if needed)
# crm.to_csv("./datas/datas_processed.csv", index=False, header=True)


# Import processed dataset


In [None]:
# Reload the processed dataset from disk. Forward slashes work on every
# platform and avoid backslash escape sequences inside the string literal.
crm = pd.read_csv("./datas/datas_processed.csv")


# Wordclouds: raw data vs. processed data (6-month period)


In [None]:
# Comparison window: contacts strictly after 2022-04-25, up to and including
# 2022-10-25. Both bounds are formatted as "YYYY-MM-DD" strings.
start_date = f"{datetime(2022, 4, 25):%Y-%m-%d}"
end_date = f"{datetime(2022, 10, 25):%Y-%m-%d}"
mask = crm["DATE_CONTACT"].gt(start_date) & crm["DATE_CONTACT"].le(end_date)

## Raw data


In [None]:
# Build the "raw" corpus: every agent comment in the window, tokenized and
# lowercased, with no stop-word filtering.
crm_raw = crm.loc[mask]
raw_tokens = nltk.word_tokenize(crm_raw["COMMENTAIRES_AGENT"].str.cat(sep=", "))
# `in string.punctuation` is a substring test against the punctuation string,
# so it drops single punctuation characters (and any token that happens to be
# a contiguous slice of that string).
column_str_raw = ", ".join(
    token.lower() for token in raw_tokens if token not in string.punctuation
)

## Processed data


In [None]:
# Build the "processed" corpus from the stop-word-filtered comments of the
# same window.
crm_preprocessed = crm.loc[mask]
processed_tokens = nltk.word_tokenize(
    crm_preprocessed["COMMENTAIRES_processed"].str.cat(sep=", ")
)

# One pass over the tokens: drop punctuation, lowercase, drop stop/common
# words, then strip accents from what is left.
column_str_process = ", ".join(
    unidecode(lowered)
    for lowered in (
        token.lower()
        for token in processed_tokens
        if token not in string.punctuation
    )
    if lowered not in full_words_remove
)

# Lemmatization
lemmas_process = ", ".join(token.lemma_ for token in nlp(column_str_process))

## Plot raw vs. processed


In [None]:
# Rendering options shared by both clouds; only the processed panel also
# passes a stop-word list to WordCloud.
cloud_style = dict(
    width=1000,
    height=500,
    background_color="white",
    colormap="viridis",
    max_words=100,
    random_state=30,
)

fig, axes = plt.subplots(2, 1, figsize=(10, 10))
raw_ax, processed_ax = axes

# Top panel: raw comments.
raw_ax.imshow(WordCloud(**cloud_style).generate(column_str_raw))
raw_ax.set_title("RAW", fontsize=18, pad=10)
raw_ax.axis("off")

# Bottom panel: stop words removed, common words removed, lemmatized.
processed_ax.imshow(
    WordCloud(stopwords=stopwords_worldbrain, **cloud_style).generate(lemmas_process)
)
processed_ax.set_title("Stopwords, Lemmatized and Common words", fontsize=18, pad=10)
processed_ax.axis("off")

plt.show()

# Codif Niveau 1 (full period)


In [None]:
def _clean_comment_tokens(comments, words_to_remove):
    """Turn a Series of comments into one cleaned, comma-separated token string.

    The comments are concatenated, tokenized with NLTK, stripped of punctuation
    tokens, lowercased, filtered against `words_to_remove` and finally
    de-accented with `unidecode`.

    Parameters:
        comments: pandas Series of comment strings.
        words_to_remove: iterable of lowercase words to drop.

    Returns:
        The remaining tokens joined with ", " (empty string if none remain).
    """
    excluded = set(words_to_remove)  # constant-time membership tests
    tokens = nltk.word_tokenize(comments.str.cat(sep=", "))
    kept = [
        unidecode(lowered)
        for lowered in (
            token.lower() for token in tokens if token not in string.punctuation
        )
        if lowered not in excluded
    ]
    return ", ".join(kept)


def _lemmatize_text(text):
    """Return the spaCy lemmas of `text`, joined with ", "."""
    return ", ".join(token.lemma_ for token in nlp(text))


crm_codif1 = crm.copy()

# Level-1 category "Boîtier Alerting"
crm_alerting = crm_codif1[crm_codif1["CODIF_NIVEAU_1"] == "Boîtier Alerting"]
column_str_alerting = _clean_comment_tokens(
    crm_alerting["COMMENTAIRES_AGENT"], full_words_remove
)
lemmas_alerting = _lemmatize_text(column_str_alerting)

# Level-1 category "Coyote Secure"
crm_secure = crm_codif1[crm_codif1["CODIF_NIVEAU_1"] == "Coyote Secure"]
column_str_secure = _clean_comment_tokens(
    crm_secure["COMMENTAIRES_AGENT"], full_words_remove
)
lemmas_secure = _lemmatize_text(column_str_secure)

## Plot codif1


In [None]:
# One word cloud per level-1 category, stacked vertically.
fig, axes = plt.subplots(2, 1, figsize=(15, 10))

codif1_panels = [("Alerting", lemmas_alerting), ("Secure", lemmas_secure)]
for ax, (panel_title, panel_lemmas) in zip(axes, codif1_panels):
    cloud = WordCloud(
        width=1000,
        height=500,
        background_color="white",
        stopwords=full_words_remove,
        colormap="viridis",
        max_words=100,
        random_state=30,
        collocations=False,
    ).generate(panel_lemmas)
    ax.imshow(cloud)
    ax.set_title(panel_title, fontsize=18, pad=10)
    ax.axis("off")

plt.show()

# Codif Niveau 2 (period of 1 year)


In [None]:
def _codif2_label(raw_label):
    """Return an identifier-friendly form of a CODIF_NIVEAU_2 value.

    Spaces become underscores, parentheses are removed and "é" becomes "e".
    """
    return (
        raw_label.replace(" ", "_").replace("(", "").replace(")", "").replace("é", "e")
    )


crm_codif2 = crm.copy()

codif2 = [
    "FACTURATION",
    "VENTE",
    "Fidélisation",
    "ASSISTANCE TECHNIQUE (TELEDEPANNAGE)",
    "GESTION DE COMPTE",
    "RESILIATION",
    "AUTRE",
]

# Every category outside the list above is pooled under "AUTRE".
crm_codif2.loc[~crm_codif2["CODIF_NIVEAU_2"].isin(codif2), "CODIF_NIVEAU_2"] = "AUTRE"

# Apply the same label clean-up to the category list and to the column so the
# two keep matching.
codif2 = [_codif2_label(x) for x in codif2]
crm_codif2["CODIF_NIVEAU_2"] = crm_codif2["CODIF_NIVEAU_2"].apply(_codif2_label)

# One-year window: strictly after 2021-10-25, up to and including 2022-10-25.
# The mask is built from the frame it filters.
start_date = datetime(2021, 10, 25).strftime("%Y-%m-%d")
end_date = datetime(2022, 10, 25).strftime("%Y-%m-%d")
mask = (crm_codif2["DATE_CONTACT"] > start_date) & (
    crm_codif2["DATE_CONTACT"] <= end_date
)
crm_codif2 = crm_codif2.loc[mask]

words_to_drop = set(full_words_remove)  # constant-time membership tests
lemm_words_by_codif = {}

for x in codif2:
    # NOTE(review): the stop-word-filtered comments are read from
    # "COMMENTAIRES_processed", the column created by the processing cell;
    # confirm this is the intended input for this section.
    comments = crm_codif2.loc[
        crm_codif2["CODIF_NIVEAU_2"] == x, "COMMENTAIRES_processed"
    ]
    text = unidecode(comments.str.cat(sep=", "))
    tokens = nltk.word_tokenize(text, language="french", preserve_line=True)

    # Keep purely alphabetic tokens, lowercased, minus stop/common words.
    clean_words = [
        lowered
        for lowered in (token.lower() for token in tokens if token.isalpha())
        if lowered not in words_to_drop
    ]

    # Lemmatize with the French spaCy pipeline, as the other sections do.
    lemmas = ", ".join(token.lemma_ for token in nlp(" ".join(clean_words)))

    lemm_words_by_codif[x] = lemmas
    # Also publish the result under the `lemm_words_<category>` name that the
    # plotting cell looks up.
    globals()[f"lemm_words_{x}"] = lemmas

## Plot codif2


In [None]:
# One word cloud per level-2 category, stacked vertically.
fig, axes = plt.subplots(len(codif2), 1, figsize=(15, 40))
count = 0

for count, x in enumerate(codif2, start=1):
    ax = axes[count - 1]
    # The lemma string of each category lives in a `lemm_words_<category>`
    # variable of this namespace.
    category_cloud = WordCloud(
        width=1000,
        height=500,
        background_color="white",
        stopwords=remove_words,
        collocations=False,
        colormap="viridis",
        max_words=100,
        random_state=30,
    ).generate(vars()[f"lemm_words_{x}"])
    ax.imshow(category_cloud)
    ax.set_title(f"{x}", fontsize=18, pad=5)
    ax.axis("off")

plt.show()

# WordCloud monthly


In [None]:
# Month labels ("YYYY-MM") from February 2021 to October 2022 inclusive.
list_months = (
    pd.period_range("2021-02", "2022-10", freq="M").strftime("%Y-%m").tolist()
)

# NOTE(review): the word lists are read from the same files and column
# ("./words_lists/", "word_to_remove") as in the processing cell — confirm
# that no separate "my_words_lists" copy is meant to be used here.
stopwords_worldbrain = pd.read_csv(
    "./words_lists/STOP_WORDS_WordlBrain_updated.txt", header=None
)
stopwords_worldbrain = stopwords_worldbrain[0].tolist()

remove_words = (
    pd.read_csv("./words_lists/remove_words.csv").dropna()["word_to_remove"].tolist()
)

# Drop stop words and common words in a single pass, then strip accents.
# Missing comments (NaN, e.g. after a CSV round-trip) are treated as empty.
words_to_drop = set(stopwords_worldbrain) | set(remove_words)
crm["commentaires_stopwords"] = (
    crm["COMMENTAIRES_AGENT"]
    .fillna("")
    .astype(str)
    .apply(
        lambda comment: unidecode(
            " ".join(word for word in comment.split() if word not in words_to_drop)
        )
    )
)

## Plot monthly

In [None]:
fig, axes = plt.subplots(1, len(list_months), figsize=(120, 6))
plt.suptitle("WordClouds monthly", fontsize=24)

# Reduce `year_month` to its first 7 characters ("YYYY-MM") so the comparison
# works whether the column holds "YYYY-MM" strings, "YYYY-MM-DD" strings or
# datetimes.
month_labels = crm["year_month"].astype(str).str[:7]
punctuation_run = re.compile(r"[!\"#$%&()*+,-./:;<=>?@[\\\]^_`{|}~]+")

for ax, month in zip(axes, list_months):
    ax.set_title(f"{month}", fontsize=18, pad=5)
    ax.axis("off")

    # Concatenate the month's comments, blank out punctuation, lowercase.
    comments = crm.loc[month_labels == month, "commentaires_stopwords"]
    month_text = " ".join(comment.strip() for comment in comments)
    month_text = punctuation_run.sub(" ", month_text).lower()

    # Lemmatize with the French spaCy pipeline.
    month_lemmas = " ".join(token.lemma_ for token in nlp(month_text))

    # WordCloud.generate raises ValueError when it is given no words, so a
    # month without any text keeps an empty (titled) panel instead.
    if not month_lemmas.strip():
        continue

    ax.imshow(
        WordCloud(
            width=1000,
            height=500,
            background_color="white",
            stopwords=stopwords_worldbrain,
            colormap="viridis",
            max_words=100,
            random_state=30,
        ).generate(month_lemmas)
    )

plt.show()

# Extract words frequency


In [None]:
# Words must occur strictly more often than this to be exported.
MIN_WORD_COUNT = 10

# Word frequencies are computed on the stop-word-filtered comments of the
# whole dataset: concatenate them, blank out punctuation and lowercase.
counting = " ".join(text.strip() for text in crm["commentaires_stopwords"])
counting = re.sub(r"[!\"#$%&()*+,-./:;<=>?@[\\\]^_`{|}~]+", " ", counting).lower()

# Frequency of each word in the string. Counter keeps first-occurrence order,
# so the exported rows follow the order in which words first appear.
words = counting.split()
counts = {
    word: occurrences
    for word, occurrences in Counter(words).items()
    if occurrences > MIN_WORD_COUNT
}

# Write one "word, count" line per retained word.
with open("./words_lists/words_count.csv", "w", errors="ignore") as f:
    f.writelines("%s, %s\n" % (word, occurrences) for word, occurrences in counts.items())