In [1]:
# Libraries for data loading, data viz and EDA
import json 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Libraries for text preprocessing and analysis
import re,nltk,spacy,string
nlp=spacy.load("en_core_web_sm")
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.decomposition import NMF
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS

# Libraries for model evaluation metrics
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, f1_score, classification_report

# Suppress all warnings emitted from here on
import warnings
warnings.filterwarnings('ignore')

# Set every pandas display limit to None so rows, columns and cell text are
# not truncated (for 'display.width', None lets pandas auto-detect the width)
for option_name in ('display.max_rows', 'display.max_columns',
                    'display.width', 'display.max_colwidth'):
    pd.set_option(option_name, None)

In [2]:
# Load the incident export from the Excel workbook next to the notebook
SOURCE_WORKBOOK = 'cisco_from_db.xlsx'
df = pd.read_excel(SOURCE_WORKBOOK)

In [3]:
# Preview the first rows of the loaded incident data
df.head()

Unnamed: 0,incident_number,uncleaned_description,cleaned_description,cluster_number,predicted_tag,userdefined_tag,score
0,INC2617296,Channel Overlay Debooking of Mio against NTT Germany needs to be corrected,channel overlay debooking against ntt germany needs to corrected,,report_requests,,49
1,INC2617297,Error Image Pull Back off Issue for Prod deployment,error image pull back off issue prod deployment,,deployment_related,,100
2,INC2617298,Gate Failed error for deployment,gate failed error deployment,,deployment_related,deployment_related,100
3,INC2617309,Hi Team Please note that Quote is in conversion failed status However there is no hold on SO Kindly provide us workaround and reply at the earliest Regards Imran S,hi team please note quote in conversion failed status however no hold on so provide us workaround reply earliest regards imran s,,storage_related,,89
4,INC2617312,Aurora Development Service ADS My system is down or something is broken Please help Enter Hostname bgl ads Please enter a detailed problem descriptio,aurora development service ads system down something broken please help enter hostname enter detailed problem descriptio,,Infra_related_ads,Infra_related_ads,100


In [4]:
# (rows, columns) of the loaded frame
df.shape

(41917, 7)

In [5]:
# Helper that cleans a text and removes its unnecessary elements.
def clean_texts(text):
    """Normalise a free-text description for downstream text analysis.

    Steps, applied in order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets (non-greedy, per bracket pair);
      3. drop every character in ``string.punctuation``;
      4. drop every word that contains at least one digit.

    Whitespace is left untouched, so removed words/brackets may leave
    consecutive spaces behind.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    # Make the text lowercase
    text = text.lower()

    # Remove text in square brackets
    text = re.sub(r'\[.*?\]', '', text)

    # Remove punctuation. The character class must be built by %-formatting the
    # escaped punctuation INTO the pattern; previously the '%' expression sat
    # inside the string literal, so the pattern never matched punctuation.
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # Remove words containing numbers
    text = re.sub(r'\w*\d\w*', '', text)

    return text

In [6]:
# Keep only rows that have a description. The explicit .copy() makes the result
# an independent frame, so adding columns to it afterwards does not raise
# pandas' SettingWithCopyWarning.
df = df[df['uncleaned_description'].notna()].copy()

In [7]:
# Clean every raw description and store the result in 'cleansed_description'
df['cleansed_description'] = df['uncleaned_description'].apply(clean_texts)

In [9]:
# Inspect the new 'cleansed_description' column next to the original columns
df.head()

Unnamed: 0,incident_number,uncleaned_description,cleaned_description,cluster_number,predicted_tag,userdefined_tag,score,cleansed_description
0,INC2617296,Channel Overlay Debooking of Mio against NTT Germany needs to be corrected,channel overlay debooking against ntt germany needs to corrected,,report_requests,,49,channel overlay debooking of mio against ntt germany needs to be corrected
1,INC2617297,Error Image Pull Back off Issue for Prod deployment,error image pull back off issue prod deployment,,deployment_related,,100,error image pull back off issue for prod deployment
2,INC2617298,Gate Failed error for deployment,gate failed error deployment,,deployment_related,deployment_related,100,gate failed error for deployment
3,INC2617309,Hi Team Please note that Quote is in conversion failed status However there is no hold on SO Kindly provide us workaround and reply at the earliest Regards Imran S,hi team please note quote in conversion failed status however no hold on so provide us workaround reply earliest regards imran s,,storage_related,,89,hi team please note that quote is in conversion failed status however there is no hold on so kindly provide us workaround and reply at the earliest regards imran s
4,INC2617312,Aurora Development Service ADS My system is down or something is broken Please help Enter Hostname bgl ads Please enter a detailed problem descriptio,aurora development service ads system down something broken please help enter hostname enter detailed problem descriptio,,Infra_related_ads,Infra_related_ads,100,aurora development service ads my system is down or something is broken please help enter hostname bgl ads please enter a detailed problem descriptio


In [8]:
# Helper that lemmatizes a text
def lemma_texts(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is passed through the module-level ``nlp`` pipeline; the
    ``lemma_`` of every resulting token is collected in order and the
    lemmas are joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [9]:
# Lemmatize each cleaned description. Applying the function to the single
# column (Series.apply) avoids building a row Series per record, as
# DataFrame.apply(axis=1) does, and still yields a Series when the frame is empty.
# NOTE(review): 'lemmatized_descriptipon' looks like a typo of
# 'lemmatized_description'; the name is kept unchanged because other cells may read it.
df["lemmatized_descriptipon"] = df['cleansed_description'].apply(lemma_texts)

# View the dataframe
df.head()