In [1]:
#segmentation
import nltk
nltk.download('punkt')
#Tokenize text to sentences
from nltk.tokenize import sent_tokenize
#Tokenize sentence in the text to words
from nltk.tokenize import word_tokenize
#Remove the stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords
#Perfomr Steming and lemmatization
from nltk.stem.porter import PorterStemmer
nltk.download('wordnet') # download for lemmatization
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
import pandas as pd   

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\boseb\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\boseb\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\boseb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\boseb\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
import re
import spacy

# Load spaCy model and add neuralcoref to the pipeline
nlp = spacy.load('en_core_web_sm')

def standardize_text(text, print_parsed_text):
    text = text.lower()  # Lowercase
    # Segmentation
    sentences = sent_tokenize(text)
    lemmatized_all = []
    
    for sentence in sentences:
        new_text = re.sub(r"[^a-zA-Z0-9]", " ", sentence)  # Remove extra spaces
        new_text = re.sub(r"\s+", " ", new_text)  # Remove extra spaces
        new_text = re.sub(r"x+\s","", new_text) # Remove X characters
        
        # Tokenization
        words = word_tokenize(new_text)
        
        # StopWords
        words = [w for w in words if w not in stopwords.words("english")]
        
        # Lemmatization
        lemmatized = [WordNetLemmatizer().lemmatize(w) for w in words]
        lemmatized_all.extend(lemmatized)
        
    unique_words = list(set(lemmatized_all))
    processed_text = ' '.join(unique_words)
    
    # Named Entity Recognition (NER)
    doc = nlp(processed_text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    
    # Dependency Parsing
    dependencies = [(token.text, token.dep_, token.head.text) for token in doc]
    if(print) :
        # Print results
        print("##################################################")
        print(f"Original Text: {text}")
        print("##################################################")
        print(f"Lemmatized Unique Words: {unique_words}")
        print("##################################################")
        print(f"Named Entities: {entities}")
        print("##################################################")
        print(f"Dependencies: {dependencies}")
    
    return processed_text



In [9]:
text = """My XXXX  instructed me to write a letter because I am a XXXX  of XXXX XXXX.
I am in a desperate need to fix this problem since I am trying to get things in order The Accounts that I put below are due to the XXXX XXXX  violation that has nearly ruined my life Please remove these accounts so I can start taking steps to getting my life back together. 
Thank you so much for assisting me and offering these resources to XXXX  like me. 
ACCOUNT THAT ARE FRAUD DUE TO XXXX XXXX  : XXXX. XXXX XXXXXXXX XXXX XXXX  opened XX/XX/XXXX XXXX. XXXX XXXXXXXX XXXX XXXX opened XX/XX/XXXX XXXX. XXXX  XXXX XXXX opened XX/XX/XXXX XXXX. The XXXX Inquiries Below | The only inquiry I did was the XXXX  XXXX XXXX in XXXX, XXXX"""

# Example usage
processed_text = standardize_text(text, True)
print("##################################################")
print(f"Processed Text: {processed_text}")

##################################################
Original Text: my xxxx  instructed me to write a letter because i am a xxxx  of xxxx xxxx.
i am in a desperate need to fix this problem since i am trying to get things in order the accounts that i put below are due to the xxxx xxxx  violation that has nearly ruined my life please remove these accounts so i can start taking steps to getting my life back together. 
thank you so much for assisting me and offering these resources to xxxx  like me. 
account that are fraud due to xxxx xxxx  : xxxx. xxxx xxxxxxxx xxxx xxxx  opened xx/xx/xxxx xxxx. xxxx xxxxxxxx xxxx xxxx opened xx/xx/xxxx xxxx. xxxx  xxxx xxxx opened xx/xx/xxxx xxxx. the xxxx inquiries below | the only inquiry i did was the xxxx  xxxx xxxx in xxxx, xxxx
##################################################
Lemmatized Unique Words: ['step', 'offering', 'put', 'desperate', 'thing', 'trying', 'together', 'write', 'since', 'account', 'fithis', 'instructed', 'letter', 'need', 'please

In [4]:
file_loc = "../Resources/ModelData/customer_complaint_data_to_process.csv"
customer_complaints_df = pd.read_csv(file_loc)
customer_complaints_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22989 entries, 0 to 22988
Data columns (total 2 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   Product                       22989 non-null  object
 1   Consumer complaint narrative  22989 non-null  object
dtypes: object(2)
memory usage: 359.3+ KB


In [5]:
customer_complaints_df.head(3)
customer_complaints_df['Consumer complaint narrative'] = customer_complaints_df['Consumer complaint narrative'].apply(standardize_text)
customer_complaints_df.head(3)

Unnamed: 0,Product,Consumer complaint narrative
0,Credit card,fault contacted someone reversed fraud receive...
1,Checking or savings account,saving reversed would give allowing marcus onl...
2,Checking or savings account,chase loan give said called order put payment ...


In [6]:
customer_complaints_df.head(3)

Unnamed: 0,Product,Consumer complaint narrative
0,Credit card,fault contacted someone reversed fraud receive...
1,Checking or savings account,saving reversed would give allowing marcus onl...
2,Checking or savings account,chase loan give said called order put payment ...


In [7]:
customer_complaints_df.to_csv("../Resources/ModelData/train_test_data.csv", index = False)