In [1]:
import pandas as pd
import numpy as np
import os, sys
import re, string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample


# Base directory used when saving the processed data further down.
# Note that the value already ends with a trailing slash.
dev_loc = r'/axp/buanalytics/csswcpfwt/dev/'

In [2]:
# Column names for the extract; the file itself carries no header row.
rally_columns = [
    'featr_frmt_id', 'featr_nm', 'featr_desc', 'featr_note', 'PFWT', 'drct_child_ct',
    'story_frmt_id', 'story_nm', 'story_ds', 'acpt_crit', 'story_note',
]

# The delimiter is passed as a regex (r'\x01') to the python engine.
# NOTE(review): regex separators are documented as prone to ignoring quoted
# data, so swapping to a plain one-character separator may parse differently.
df = pd.read_csv(
    r'/axp/buanalytics/csswcpfwt/dev/rally_extract.txt',
    delimiter=r'\x01',
    engine='python',
    header=None,
    index_col=None,
)
df.columns = rally_columns
df.head(1)

Unnamed: 0,featr_frmt_id,featr_nm,featr_desc,featr_note,PFWT,drct_child_ct,story_frmt_id,story_nm,story_ds,acpt_crit,story_note
0,F20,UCID - SD,ADW IVR,,,18,US123881,Tech Task - Merge R2E Defect - Score card fu...,Should able to authenticate with scorecard whe...,,


In [3]:
# Keep the four text columns plus the PFWT label, then discard rows with any
# missing value and exact duplicate rows.
text_and_label_columns = ['featr_nm', 'featr_desc', 'story_nm', 'story_ds', 'PFWT']
df_filtered = df[text_and_label_columns].dropna(how='any').drop_duplicates()
print(len(df), len(df_filtered))

# Remove rows whose PFWT still holds the placeholder value.
is_placeholder = df_filtered['PFWT'] == "Select for Theme, Epic or Capability"
indexNames = df_filtered.loc[is_placeholder].index
df_filtered = df_filtered.drop(index=indexNames)

df_filtered['PFWT'].value_counts()

716471 292834


New Application Development / New App Dev Testing    205253
Other Non-Application Development activities          75344
Research & Development (R&D)                          10179
Name: PFWT, dtype: int64

In [4]:
# Handle class imbalance: draw a fixed number of rows per PFWT class, without
# replacement and with a fixed seed so the sample is repeatable.
samples_per_class = [
    ('New Application Development / New App Dev Testing', 25000),
    ('Other Non-Application Development activities', 15000),
    ('Research & Development (R&D)', 10000),
]

df1, df2, df3 = (
    resample(
        df_filtered[df_filtered['PFWT'] == label],
        replace=False,
        n_samples=size,
        random_state=123,
    )
    for label, size in samples_per_class
)

df_data = pd.concat([df1, df2, df3])
df_data['PFWT'].value_counts()

New Application Development / New App Dev Testing    25000
Other Non-Application Development activities         15000
Research & Development (R&D)                         10000
Name: PFWT, dtype: int64

In [5]:
def clean_tags(txt):
    """Lower-case ``txt`` and remove ``<...>`` / ``[...]`` spans from it.

    Parameters
    ----------
    txt : object
        Raw value. Anything that is not a string is converted with ``str()``.
        ``None`` and float NaN are treated as missing.

    Returns
    -------
    str
        The lower-cased text with every ``<...>`` or ``[...]`` span removed.
        Surrounding whitespace introduced by the padding step is not trimmed.
        Missing input yields ``""``.
    """
    # A missing value would otherwise be stringified into the literal token
    # "none" / "nan" and end up in the cleaned text.
    if txt is None or (isinstance(txt, float) and txt != txt):
        return ""

    txt = str(txt).strip().lower()

    # Pad both sides of a tag so the words around it are not glued together
    # once the tag itself is removed.
    txt = txt.replace('>', '> ').replace('<', ' <')

    # Raw string: "\<" and "\[" inside a normal literal are invalid escape
    # sequences. The match is non-greedy and "." does not cross newlines, so a
    # span is only removed when it opens and closes on the same line.
    return re.sub(r"[<\[].*?[>\]]", "", txt)
# Example: clean_tags("<div>EWT</div><div>EDIS</div><div>Merchant Rec...")

#define all stopwords
def stopwords_list():
    """Build the stop-word set used during lemmatization.

    Returns
    -------
    set of str
        NLTK's English stop words plus the extra words listed below.
    """
    extra_words = (
        'and', 'a', 'so', 'arnt', 'this', 'when', 'It', 'many', 'cant', 'yes',
        'no', 'these', 'please', 'let', 'know', 'can', 'pls', 'u', 'abt', 'wht',
    )
    # NOTE(review): 'It' is capitalised, while clean_tags and clean_text both
    # lower-case their input, so this entry presumably never matches text that
    # went through normalize_text. Left as-is to keep the set unchanged.
    return set(stopwords.words('english')) | set(extra_words)


# Built once when the cell runs; lemmatize_text reads this module-level set.
stop_words_list = stopwords_list()

#function to clean the input - unwanted data/text can be removed
def clean_text(text):
    """Lower-case ``text`` and strip dates, numbers and stray symbols.

    Steps, in order:

    1. lower-case the text;
    2. blank out date-like spans (``12/25/2020``) and ``1/20:`` style labels;
    3. surround punctuation marks with spaces so each becomes its own token;
    4. remove the ``<u1177324`` marker and ``&nbsp`` entities;
    5. delete backslashes, apostrophes and double quotes;
    6. blank out number labels (``1.``) and any remaining digits.

    Parameters
    ----------
    text : str
        Text to clean (``normalize_text`` passes the output of ``clean_tags``).

    Returns
    -------
    str
        The cleaned text with leading/trailing whitespace stripped. Runs of
        internal spaces are not collapsed.
    """
    norm_text = text.lower()

    # Dates and "1/20:" labels must be handled first. Once ":" and "," are
    # padded with spaces and the digits are stripped (both happen below),
    # these patterns can no longer match anything.
    norm_text = re.sub(r'[0-9]{1,2}[/,:][0-9]{1,2}[/,:][0-9]{2,4}', ' ', norm_text)
    norm_text = re.sub(r'[0-9]{1,2}[/][0-9]{1,2}[:]', ' ', norm_text)

    # Surround punctuation with spaces so each mark becomes a separate token.
    for char in ['"', ',', '(', ')', '!', '?', ';', ':', '#', '*', '>', '$']:
        norm_text = norm_text.replace(char, ' ' + char + ' ')

    # The text is already lower-case here, so the marker has to be spelled in
    # lower-case to ever match.
    norm_text = norm_text.replace('<u1177324', ' ')
    norm_text = norm_text.replace(' &nbsp', ' ')
    norm_text = norm_text.replace('&nbsp', ' ')

    # Delete backslashes, apostrophes and double quotes outright.
    for char in ('\\', "'", '"'):
        norm_text = norm_text.replace(char, '')

    # Number labels ("1."), short numbers followed by a space, then every
    # remaining digit run. Each is replaced by a space (never by an empty
    # string) so the words on either side are not merged into one token.
    number_patterns = (
        r'[0-9]{1,2}[.]',
        r'[0-9]{1,2}[ ]',
        r'(\d{1,3}(?:\s*\d{3})*(?:,\d+)?)',
    )
    for pattern in number_patterns:
        norm_text = re.sub(pattern, ' ', norm_text).strip()

    return norm_text

#lemmatize and remove stop words
def lemmatize_text(text, stopwords_remove=True):
    """Lemmatize the alphabetic tokens of ``text`` and join them with spaces.

    Parameters
    ----------
    text : str
        Whitespace-separated text.
    stopwords_remove : bool, default True
        When truthy, tokens found in the module-level ``stop_words_list`` are
        dropped before lemmatization.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    kept = []
    for token in text.split():
        if stopwords_remove and token in stop_words_list:
            continue
        # `in string.punctuation` is a substring test against the punctuation
        # string; tokens that are not purely alphabetic are dropped as well.
        if token in string.punctuation or not token.isalpha():
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Master function to convert text to lower-case and strip punctuation/symbols from words
def normalize_text(text, stopwords_remove=True):
    """Run one raw value through the whole cleaning pipeline.

    The value goes through ``clean_tags``, then ``clean_text``, and finally
    ``lemmatize_text``.

    Parameters
    ----------
    text : object
        Raw value to normalise.
    stopwords_remove : bool, default True
        Forwarded to ``lemmatize_text``.

    Returns
    -------
    str
        The string returned by ``lemmatize_text``.
    """
    return lemmatize_text(clean_text(clean_tags(text)), stopwords_remove)

# #test input
# normalize_text("<ff> dshfksj<> kajsflksuoiew sd325325")

# Normalise every column except the PFWT label, reporting progress per column.
text_columns = [name for name in df_data.columns if name != 'PFWT']
for series in text_columns:
    df_data[series] = df_data[series].apply(normalize_text)
    print("completed--->", series)

# Save the processed frame; the row index is not written.
df_data.to_csv(dev_loc+"/processed_data.csv", index=False)


completed---> featr_nm
completed---> featr_desc
completed---> story_nm
completed---> story_ds
