In [None]:
import pandas as pd
import imblearn
import re
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from pandas import DataFrame
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from numpy import dtype
import warnings 
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from scipy import sparse
from sklearn import preprocessing

In [None]:
!conda install -c conda-forge imbalanced-learn
!conda install -c conda-forge/label/gcc7 imbalanced-learn
!conda install -c conda-forge/label/cf201901 imbalanced-learn
!conda install -c conda-forge imblearn

In [None]:
# word_tokenize needs the Punkt tokenizer models. Newer NLTK releases load them
# from the 'punkt_tab' resource, older ones from 'punkt', so request both.
# nltk.download reports an unknown resource by printing an error and returning
# False (it does not raise), so the extra request is harmless on old versions.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Path of the CSV file that the next cell loads into `df`.
# Despite the variable name this is a file, not a folder; it is resolved
# relative to the notebook's working directory.
file_folder = 'Issue_MOCK_Data_v0 - Sheet1.csv'

In [None]:
# Load the CSV into `df`, the DataFrame every later cell reads and modifies.
df = pd.read_csv(file_folder)

In [None]:
# Full per-column summary (names, non-null counts, dtypes).
# The first positional parameter of DataFrame.info is `verbose`; the truthy
# value is spelled out by keyword here.
df.info(verbose=True)

In [None]:
# Number of rows (observations) and columns in the loaded data.
total_count = df.shape[0]
total_cols = df.shape[1]
print(
    "Dataframe contains {} observations across {} columns".format(
        total_count, total_cols
    )
)

In [None]:
# Number of rows per LABEL value.
# groupby drops NaN keys by default, so rows without a label (they are only
# filled in a later cell) are not part of these counts.
df.groupby('LABEL').size()

In [None]:
# Count of missing values in each column, before any filling is applied.
df.isna().sum()

In [None]:
# Give rows without a label the explicit placeholder class "NOT_DEFINED"
# instead of NaN.
df['LABEL'] = df['LABEL'].fillna(value="NOT_DEFINED")

In [None]:
# Replace missing remediation summaries with empty strings.
# NOTE: this column is dropped a few cells further down, so the fill only
# affects the intermediate missing-value report.
df['REMEDIATION_SUMMARY'] = df['REMEDIATION_SUMMARY'].fillna(value="")

In [None]:
# Replace missing legacy IDs with empty strings so that every row can be
# passed to pre_process(text, legacy) as a string.
df['LEGACY_ID'] = df['LEGACY_ID'].fillna(value="")

In [None]:
# Count missing values per column again, after the fillna cells.
df.isna().sum()

In [None]:
# Combine title and summary into the single text field used for modelling.
# - A space separator keeps the last word of the title from being fused with
#   the first word of the summary.
# - fillna('') keeps a row's text when only one of the two fields is missing
#   (NaN + str would otherwise make the whole result NaN).
df['ISSUE_TEXT'] = (
    df['ISSUE_TITLE'].fillna('') + ' ' + df['ISSUE_SUMMARY'].fillna('')
).str.strip()

In [None]:
# Remove the remediation summary column from the working frame.
df = df.drop(columns=['REMEDIATION_SUMMARY'])

In [None]:
# Remove the ADDED_BY_KATYA column from the working frame.
df = df.drop(columns=['ADDED_BY_KATYA'])

In [None]:
# One indicator column per distinct LABEL value (one-hot encoding); the new
# columns are named after the label values.
encoded_labels = pd.get_dummies(df['LABEL'])

In [None]:
# Append the indicator columns to df side by side (axis=1, aligned on index).
# NOTE: this reassigns df from itself, so re-running the cell appends the same
# columns again and produces duplicate column names.
df = pd.concat([df, encoded_labels], axis = 1)
# Confirm the new columns and their dtypes.
df.info()

### Defining Text Pre-Processing

In [None]:
# English stop words from NLTK, stored as a set; pre_process tests each token
# for membership in it.
stop_words = set(stopwords.words('english'))

def concatenate_list_data(lst):
    """Return the string form of every element of ``lst``, each followed by a space.

    Elements are converted with ``str``. The result therefore ends with a
    trailing space whenever ``lst`` is non-empty, and is ``''`` for an empty
    input.
    """
    return ''.join(str(item) + ' ' for item in lst)

def pre_process(text, legacy):
    """Normalise one issue description into a space-separated string of tokens.

    Steps, in order: remove the record's legacy identifier and the known ID
    prefixes/suffixes, replace punctuation with spaces, drop underscores and
    digits, remove stray single letters, collapse whitespace, lower-case,
    tokenise with ``word_tokenize`` and discard English stop words.

    Parameters
    ----------
    text : str
        Raw issue text. Missing values (``None``/NaN) are treated as ``''``.
    legacy : str
        Legacy identifier to delete from ``text`` wherever it occurs. It is
        matched literally, not as a regular expression. Missing or empty
        values remove nothing.

    Returns
    -------
    str
        The remaining tokens joined by ``concatenate_list_data`` (each token
        is followed by a single space).
    """
    # Coerce missing values so a NaN cell cannot crash the string methods below.
    text = '' if pd.isna(text) else str(text)
    legacy = '' if pd.isna(legacy) else str(legacy)

    # str.strip returns a new string, so the result has to be kept.
    text = text.strip("/")
    legacy = legacy.strip("/")

    # Literal removal: an ID may contain regex metacharacters, so it must not
    # be handed to re.sub as a pattern.
    document = text.replace(legacy, '') if legacy else text

    # Identifier fragments that carry no meaning for classification. They are
    # removed one after another, in this order.
    noisy_words = ["IA-AUD-", "POL-", "FNM-", "CTL-", "-OE", "SOX-", "-CR-", "IA-",
                   "-MO-", "-TC-", "-ERM-", "-CO-", "-AOR-", "-MF-", "-NR", "-CR-",
                   "-CPM", "-OCA-", "-DER-"]
    for word in noisy_words:
        document = document.replace(word, '')

    # Punctuation (including any remaining hyphens) becomes a space.
    document = re.sub(r'\W', ' ', document)
    # '_' counts as a word character for \W, so it is removed separately.
    document = document.replace("_", "")
    document = ''.join(ch for ch in document if not ch.isdigit())
    # Single letters surrounded by whitespace ...
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # ... and a single letter at the very start. The anchor must be a bare
    # '^'; an escaped '\^' would only match a literal caret, which cannot
    # occur after the \W substitution above.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    document = re.sub(r'\s+', ' ', document)
    document = document.lower()

    word_tokens = word_tokenize(document)
    filtered_words = [w for w in word_tokens if w not in stop_words]
    return concatenate_list_data(filtered_words)

In [None]:
# Smoke check of the cleaning routine on one sample sentence with no legacy ID.
# This calls pre_process directly instead of repeating its body, so the check
# cannot drift from the function that is applied to the data.
text = 'Mauris lacinia sapien quis libero.'
legacy = ''

document = pre_process(text, legacy)
document

In [None]:
#X = df.ISSUE_TEXT
#y = encoded_labels
#X_train, X_test, y_train, y_test = train_test_split (
#                                    X, y, test_size = 0.2, random_state = 42,
#                                    stratify = encoded_labels)

In [None]:
#df.apply(lambda x: pre_process(x['ISSUE_TEXT'], x['LEGACY_ID']))

In [None]:
# Clean every row's text, removing that row's own legacy ID from it.
# Iterating over the two columns directly processes every row regardless of
# the index labels and of missing values in other columns (a loop bounded by
# ISSUE_NUMBER.count() would skip rows when that column contains NaN).
pre_processed_text = [
    pre_process(issue_text, legacy_id)
    for issue_text, legacy_id in zip(df['ISSUE_TEXT'], df['LEGACY_ID'])
]
pre_processed_text[0]

In [None]:
# Map each LABEL string to an integer class id; the learned classes stay
# available on `le.classes_`. The ids are used to stratify the train/test
# split.
le = preprocessing.LabelEncoder()
normalized_label = le.fit_transform(df['LABEL'])

In [None]:
# Feature input: the list of cleaned documents, one string per row of df.
X = pre_processed_text
# Disabled sketch of looping over several label columns:
#labels = ['BRR', 'CSR']
#for label in labels:
#    y = df[label]
    

In [None]:
# Binary target: the 'BRR' indicator column.
# Assumes 'BRR' is one of the LABEL values turned into a column by
# pd.get_dummies earlier — otherwise this raises KeyError.
y = df['BRR']

In [None]:
# Hold out 20% of the rows for testing, with a fixed seed.
# The split is stratified on the encoded LABEL ids (all classes), not on the
# binary target `y`.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=normalized_label,
)
y_train

In [None]:
# TF-IDF features. max_df=0.85 ignores terms that occur in more than 85% of
# the training documents; English stop words are excluded as well.
tfidf_vect = TfidfVectorizer(max_df = 0.85, stop_words = stopwords.words('english'))
# Vocabulary and IDF weights are learned from the training documents only ...
xtrain_tfidf = tfidf_vect.fit_transform(X_train).toarray()
# ... and reused unchanged for the test documents.
# .toarray() produces dense arrays; a later cell calls len() on xtest_tfidf.
xtest_tfidf = tfidf_vect.transform(X_test).toarray()

In [None]:
# (documents, vocabulary terms) for the train and test matrices; the second
# dimension is the same for both because they share one fitted vectorizer.
xtrain_tfidf.shape, xtest_tfidf.shape

In [None]:
# Oversample the minority class of the training set with SMOTE (fixed seed).
sm = SMOTE(random_state = 42)
# imbalanced-learn renamed `fit_sample` to `fit_resample` and later removed the
# old name; use the new one when it exists and fall back for old installs.
_smote_resample = sm.fit_resample if hasattr(sm, 'fit_resample') else sm.fit_sample
sm_xtrain_tfidf, sm_train_y = _smote_resample(xtrain_tfidf, y_train)

In [None]:
# Model selection: cross-validate three linear classifiers with default
# settings on the SMOTE-resampled training data and remember the class with
# the highest mean score.
# The estimators are created without random_state, so NumPy's global RNG is
# seeded here for repeatable runs.
# NOTE(review): the data was oversampled before being split into folds, so a
# validation fold can contain synthetic points derived from training-fold
# rows; the scores may be optimistic — confirm before relying on them.
np.random.seed(42)

best_val_score, best_clf = 0, None
candidate_classifiers = (LogisticRegression, LinearSVC, SGDClassifier)

for clf in candidate_classifiers:
    clf_pipeline = Pipeline(steps=[("classifier", clf())])
    val_score = cross_val_score(clf_pipeline, sm_xtrain_tfidf, sm_train_y).mean()

    print(clf)
    print(val_score)
    # Strict '>' keeps the earlier candidate on ties.
    if val_score > best_val_score:
        best_val_score, best_clf = val_score, clf
    print("\n")

print('Best:')
print(best_val_score)
print(best_clf)

In [None]:
# Fit the selected classifier class (fresh instance, default settings).
if best_clf is None:
    # The selection loop only records a classifier whose mean CV score is > 0.
    raise RuntimeError(
        "No classifier achieved a positive cross-validation score; nothing to fit."
    )

best_clf_pipeline = Pipeline([("classifier", best_clf())])
# Train on the same SMOTE-balanced data the classifier was selected on.
# Fitting on the raw, imbalanced xtrain_tfidf / y_train would throw away the
# oversampling step and make the cross-validation scores describe a different
# model than the one evaluated on the test set.
best_clf_pipeline.fit(sm_xtrain_tfidf, sm_train_y)

In [None]:
# Compare held-out targets with the fitted pipeline's predictions.
Actual_Risk_Type = y_test.tolist()
Predicted_Risk_Type = best_clf_pipeline.predict(xtest_tfidf)

# Count positions where the prediction equals the true value.
matches = sum(
    1
    for actual, predicted in zip(Actual_Risk_Type, Predicted_Risk_Type)
    if actual == predicted
)
# Fraction of correct predictions, rounded to two decimals.
Accuracy = round(matches / len(xtest_tfidf), 2)

print("{} is the model accuracy".format(Accuracy))
print(Actual_Risk_Type)
print(Predicted_Risk_Type)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Per-class precision, recall, F1 and support for the test-set predictions,
# printed with three decimal places.
print(classification_report(Actual_Risk_Type, Predicted_Risk_Type, digits=3))